diff --git a/Jenkinsfile b/Jenkinsfile index 02f25c019..aa76f0799 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -15,6 +15,8 @@ stage("Build and Publish") { sh label: "Build Environment", script: """set -ex conda env update -n ${ENV_NAME} -f static/build.yml + pip uninstall -y d2lbook + pip install git+https://github.com/d2l-ai/d2l-book pip list nvidia-smi """ @@ -35,6 +37,7 @@ stage("Build and Publish") { conda activate ${ENV_NAME} ./static/cache.sh restore _build/eval_pytorch/data d2lbook build eval --tab pytorch + d2lbook build slides --tab pytorch ./static/cache.sh store _build/eval_pytorch/data """ @@ -60,13 +63,13 @@ stage("Build and Publish") { sh label:"Release", script:"""set -ex conda activate ${ENV_NAME} d2lbook build pkg - d2lbook deploy html pdf --s3 s3://zh-v2.d2l.ai + d2lbook deploy html pdf slides --s3 s3://zh-v2.d2l.ai """ } else { sh label:"Publish", script:"""set -ex conda activate ${ENV_NAME} - d2lbook deploy html pdf --s3 s3://preview.d2l.ai/${JOB_NAME}/ + d2lbook deploy html pdf slides --s3 s3://preview.d2l.ai/${JOB_NAME}/ """ if (env.BRANCH_NAME.startsWith("PR-")) { pullRequest.comment("Job ${JOB_NAME}/${BUILD_NUMBER} is complete. \nCheck the results at http://preview.d2l.ai/${JOB_NAME}/") diff --git a/Jenkinsfile_origin b/Jenkinsfile_origin index d043e5c95..dd4bb88c9 100644 --- a/Jenkinsfile_origin +++ b/Jenkinsfile_origin @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/Jenkinsfile -commit: 9bf95b1 ---- - stage("Build and Publish") { // such as d2l-en and d2l-zh def REPO_NAME = env.JOB_NAME.split('/')[0] @@ -17,12 +12,12 @@ stage("Build and Publish") { checkout scm // conda environment def ENV_NAME = "${TASK}-${EXECUTOR_NUMBER}"; - // assign two GPUs to each build - def EID = EXECUTOR_NUMBER.toInteger() - def CUDA_VISIBLE_DEVICES=(EID*2).toString() + ',' + (EID*2+1).toString(); sh label: "Build Environment", script: """set -ex conda env update -n ${ENV_NAME} -f static/build.yml + conda activate ${ENV_NAME} + pip uninstall -y d2lbook + pip install git+https://github.com/d2l-ai/d2l-book pip list nvidia-smi """ @@ -34,7 +29,6 @@ stage("Build and Publish") { sh label: "Execute Notebooks", script: """set -ex conda activate ${ENV_NAME} - export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} ./static/cache.sh restore _build/eval/data d2lbook build eval ./static/cache.sh store _build/eval/data @@ -42,15 +36,14 @@ stage("Build and Publish") { sh label: "Execute Notebooks [PyTorch]", script: """set -ex conda activate ${ENV_NAME} - export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} ./static/cache.sh restore _build/eval_pytorch/data d2lbook build eval --tab pytorch + d2lbook build slides --tab pytorch ./static/cache.sh store _build/eval_pytorch/data """ sh label: "Execute Notebooks [TensorFlow]", script: """set -ex conda activate ${ENV_NAME} - export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} ./static/cache.sh restore _build/eval_tensorflow/data export TF_CPP_MIN_LOG_LEVEL=3 d2lbook build eval --tab tensorflow @@ -71,7 +64,7 @@ stage("Build and Publish") { sh label:"Release", script:"""set -ex conda activate ${ENV_NAME} d2lbook build pkg - d2lbook deploy html pdf pkg colab sagemaker --s3 s3://preview.d2l.ai/${JOB_NAME}/ + d2lbook deploy html pdf pkg colab sagemaker slides --s3 s3://en.d2l.ai/ """ sh label:"Release d2l", script:"""set -ex @@ -83,7 +76,7 @@ stage("Build and Publish") { } else { sh label:"Publish", script:"""set -ex conda activate ${ENV_NAME} - d2lbook deploy html pdf --s3 s3://preview.d2l.ai/${JOB_NAME}/ + d2lbook deploy html pdf slides 
--s3 s3://preview.d2l.ai/${JOB_NAME}/ """ if (env.BRANCH_NAME.startsWith("PR-")) { pullRequest.comment("Job ${JOB_NAME}/${BUILD_NUMBER} is complete. \nCheck the results at http://preview.d2l.ai/${JOB_NAME}/") diff --git a/chapter_preliminaries/autograd.md b/chapter_preliminaries/autograd.md index 29be0644a..afc7d66d4 100644 --- a/chapter_preliminaries/autograd.md +++ b/chapter_preliminaries/autograd.md @@ -6,43 +6,36 @@ 深度学习框架通过自动计算导数(即 *自动求导*(automatic differentiation))来加快这项工作。实际中,根据我们设计的模型,系统会构建一个 *计算图*(computational graph),来跟踪数据通过若干操作组合起来产生输出。自动求导使系统能够随后反向传播梯度。 这里,*反向传播*(backpropagate)只是意味着跟踪整个计算图,填充关于每个参数的偏导数。 -```{.python .input} -from mxnet import autograd, np, npx -npx.set_np() -``` - -```{.python .input} -#@tab pytorch -import torch -``` - -```{.python .input} -#@tab tensorflow -import tensorflow as tf -``` ## 一个简单的例子 -作为一个演示例子,假设我们想对函数 $y = 2\mathbf{x}^{\top}\mathbf{x}$关于列向量 $\mathbf{x}$求导。首先,我们创建变量 `x` 并为其分配一个初始值。 +作为一个演示例子,(**假设我们想对函数 $y = 2\mathbf{x}^{\top}\mathbf{x}$关于列向量 $\mathbf{x}$求导**)。首先,我们创建变量 `x` 并为其分配一个初始值。 ```{.python .input} +from mxnet import autograd, np, npx +npx.set_np() + x = np.arange(4.0) x ``` ```{.python .input} #@tab pytorch +import torch + x = torch.arange(4.0) x ``` ```{.python .input} #@tab tensorflow +import tensorflow as tf + x = tf.range(4, dtype=tf.float32) x ``` -在我们计算$y$关于$\mathbf{x}$的梯度之前,我们需要一个地方来存储梯度。 +[**在我们计算$y$关于$\mathbf{x}$的梯度之前,我们需要一个地方来存储梯度。**] 重要的是,我们不会在每次对一个参数求导时都分配新的内存。因为我们经常会成千上万次地更新相同的参数,每次都分配新的内存可能很快就会将内存耗尽。注意,标量函数关于向量$\mathbf{x}$的梯度是向量,并且与$\mathbf{x}$具有相同的形状。 ```{.python .input} @@ -64,7 +57,7 @@ x.grad # 默认值是None x = tf.Variable(x) ``` -现在让我们计算 $y$。 +(**现在让我们计算 $y$。**) ```{.python .input} # 把代码放到`autograd.record`内,以建立计算图 @@ -87,7 +80,7 @@ with tf.GradientTape() as t: y ``` -`x` 是一个长度为 4 的向量,计算 `x` 和 `x` 的内积,得到了我们赋值给 `y` 的标量输出。接下来,我们可以通过调用反向传播函数来自动计算`y`关于`x` 每个分量的梯度,并打印这些梯度。 +`x` 是一个长度为 4 的向量,计算 `x` 和 `x` 的内积,得到了我们赋值给 `y` 的标量输出。接下来,我们可以[**通过调用反向传播函数来自动计算`y`关于`x` 每个分量的梯度**],并打印这些梯度。 ```{.python .input} y.backward() @@ -122,7 +115,7 @@ x.grad == 4 * x x_grad == 4 * x ``` -现在让我们计算 `x` 的另一个函数。 +[**现在让我们计算 `x` 的另一个函数。**] ```{.python .input} with autograd.record(): @@ -151,7 +144,7 @@ t.gradient(y, x) # 被新计算的梯度覆盖 当 `y` 不是标量时,向量`y`关于向量`x`的导数的最自然解释是一个矩阵。对于高阶和高维的 `y` 和 `x`,求导的结果可以是一个高阶张量。 -然而,虽然这些更奇特的对象确实出现在高级机器学习中(包括深度学习中),但当我们调用向量的反向计算时,我们通常会试图计算一批训练样本中每个组成部分的损失函数的导数。这里,我们的目的不是计算微分矩阵,而是批量中每个样本单独计算的偏导数之和。 +然而,虽然这些更奇特的对象确实出现在高级机器学习中(包括[**深度学习中**]),但当我们调用向量的反向计算时,我们通常会试图计算一批训练样本中每个组成部分的损失函数的导数。这里(**,我们的目的不是计算微分矩阵,而是批量中每个样本单独计算的偏导数之和。**) ```{.python .input} # 当我们对向量值变量`y`(关于`x`的函数)调用`backward`时, @@ -167,7 +160,7 @@ x.grad # 等价于y = sum(x * x) # 对非标量调用`backward`需要传入一个`gradient`参数,该参数指定微分函数关于`self`的梯度。在我们的例子中,我们只想求偏导数的和,所以传递一个1的梯度是合适的 x.grad.zero_() y = x * x -# 等价于y.backward(torch.ones(len(x))) +# 等价于y.backward(torch.ones(len(x))) y.sum().backward() x.grad ``` @@ -181,7 +174,7 @@ t.gradient(y, x) # 等价于 `y = tf.reduce_sum(x * x)` ## 分离计算 -有时,我们希望将某些计算移动到记录的计算图之外。 +有时,我们希望[**将某些计算移动到记录的计算图之外**]。 例如,假设`y`是作为`x`的函数计算的,而`z`则是作为`y`和`x`的函数计算的。 现在,想象一下,我们想计算 `z` 关于 `x` 的梯度,但由于某种原因,我们希望将 `y` 视为一个常数,并且只考虑到 `x` 在`y`被计算后发挥的作用。 diff --git a/chapter_preliminaries/autograd_origin.md b/chapter_preliminaries/autograd_origin.md index 56fcf26ad..446ef31b6 100644 --- a/chapter_preliminaries/autograd_origin.md +++ b/chapter_preliminaries/autograd_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/autograd.md -commit: 9e55a9c ---- - # Automatic Differentiation :label:`sec_autograd` @@ -24,49 +19,42 @@ Automatic 
differentiation enables the system to subsequently backpropagate gradi Here, *backpropagate* simply means to trace through the computational graph, filling in the partial derivatives with respect to each parameter. -```{.python .input} -from mxnet import autograd, np, npx -npx.set_np() -``` - -```{.python .input} -#@tab pytorch -import torch -``` - -```{.python .input} -#@tab tensorflow -import tensorflow as tf -``` ## A Simple Example As a toy example, say that we are interested -in differentiating the function +in (**differentiating the function $y = 2\mathbf{x}^{\top}\mathbf{x}$ -with respect to the column vector $\mathbf{x}$. +with respect to the column vector $\mathbf{x}$.**) To start, let us create the variable `x` and assign it an initial value. ```{.python .input} +from mxnet import autograd, np, npx +npx.set_np() + x = np.arange(4.0) x ``` ```{.python .input} #@tab pytorch +import torch + x = torch.arange(4.0) x ``` ```{.python .input} #@tab tensorflow +import tensorflow as tf + x = tf.range(4, dtype=tf.float32) x ``` -Before we even calculate the gradient +[**Before we even calculate the gradient of $y$ with respect to $\mathbf{x}$, -we will need a place to store it. +we will need a place to store it.**] It is important that we do not allocate new memory every time we take a derivative with respect to a parameter because we will often update the same parameters @@ -95,7 +83,7 @@ x.grad # The default value is None x = tf.Variable(x) ``` -Now let us calculate $y$. +(**Now let us calculate $y$.**) ```{.python .input} # Place our code inside an `autograd.record` scope to build the computational @@ -122,8 +110,8 @@ y Since `x` is a vector of length 4, an inner product of `x` and `x` is performed, yielding the scalar output that we assign to `y`. -Next, we can automatically calculate the gradient of `y` -with respect to each component of `x` +Next, [**we can automatically calculate the gradient of `y` +with respect to each component of `x`**] by calling the function for backpropagation and printing the gradient. ```{.python .input} @@ -143,8 +131,8 @@ x_grad = t.gradient(y, x) x_grad ``` -The gradient of the function $y = 2\mathbf{x}^{\top}\mathbf{x}$ -with respect to $\mathbf{x}$ should be $4\mathbf{x}$. +(**The gradient of the function $y = 2\mathbf{x}^{\top}\mathbf{x}$ +with respect to $\mathbf{x}$ should be $4\mathbf{x}$.**) Let us quickly verify that our desired gradient was calculated correctly. ```{.python .input} @@ -161,7 +149,7 @@ x.grad == 4 * x x_grad == 4 * x ``` -Now let us calculate another function of `x`. +[**Now let us calculate another function of `x`.**] ```{.python .input} with autograd.record(): @@ -172,9 +160,9 @@ x.grad # Overwritten by the newly calculated gradient ```{.python .input} #@tab pytorch -# PyTorch accumulates the gradient in default, we need to clear the previous +# PyTorch accumulates the gradient in default, we need to clear the previous # values -x.grad.zero_() +x.grad.zero_() y = x.sum() y.backward() x.grad @@ -196,13 +184,13 @@ For higher-order and higher-dimensional `y` and `x`, the differentiation result could be a high-order tensor. However, while these more exotic objects do show up -in advanced machine learning (including in deep learning), -more often when we are calling backward on a vector, +in advanced machine learning (including [**in deep learning**]), +more often (**when we are calling backward on a vector,**) we are trying to calculate the derivatives of the loss functions for each constituent of a *batch* of training examples. 
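As a brief aside, the two idioms alluded to here are equivalent in practice: summing a vector-valued `y` before calling `backward` yields the same gradients as passing a vector of ones as the `gradient` argument. A standalone PyTorch sketch (illustrative only, not one of the book's tabbed code cells):

```python
# Standalone PyTorch sketch: backpropagating through a non-scalar y via
# y.sum() gives the same result as passing a ones vector to backward().
import torch

x = torch.arange(4.0, requires_grad=True)
y = x * x                       # y is a vector, not a scalar

y.sum().backward()              # sum first, then backpropagate
print(x.grad)                   # tensor([0., 2., 4., 6.]), i.e., 2 * x

x.grad.zero_()                  # gradients accumulate, so clear them first
y = x * x
y.backward(torch.ones_like(y))  # same gradients via an explicit `gradient` argument
print(x.grad)
```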
-Here, our intent is not to calculate the differentiation matrix -but rather the sum of the partial derivatives -computed individually for each example in the batch. +Here, (**our intent is**) not to calculate the differentiation matrix +but rather (**the sum of the partial derivatives +computed individually for each example**) in the batch. ```{.python .input} # When we invoke `backward` on a vector-valued variable `y` (function of `x`), @@ -236,8 +224,8 @@ t.gradient(y, x) # Same as `y = tf.reduce_sum(x * x)` ## Detaching Computation -Sometimes, we wish to move some calculations -outside of the recorded computational graph. +Sometimes, we wish to [**move some calculations +outside of the recorded computational graph.**] For example, say that `y` was calculated as a function of `x`, and that subsequently `z` was calculated as a function of both `y` and `x`. Now, imagine that we wanted to calculate @@ -309,10 +297,10 @@ t.gradient(y, x) == 2 * x ## Computing the Gradient of Python Control Flow One benefit of using automatic differentiation -is that even if building the computational graph of a function -required passing through a maze of Python control flow +is that [**even if**] building the computational graph of (**a function +required passing through a maze of Python control flow**) (e.g., conditionals, loops, and arbitrary function calls), -we can still calculate the gradient of the resulting variable. +(**we can still calculate the gradient of the resulting variable.**) In the following snippet, note that the number of iterations of the `while` loop and the evaluation of the `if` statement diff --git a/chapter_preliminaries/calculus.md b/chapter_preliminaries/calculus.md index 9275a29b0..0bf22d531 100644 --- a/chapter_preliminaries/calculus.md +++ b/chapter_preliminaries/calculus.md @@ -18,15 +18,15 @@ 我们首先讨论导数的计算,这是几乎所有深度学习优化算法的关键步骤。在深度学习中,我们通常选择对于模型参数可微的损失函数。简而言之,这意味着,对于每个参数, 如果我们把这个参数*增加*或*减少*一个无穷小的量,我们可以知道损失会以多快的速度增加或减少, -假设我们有一个函数 $f: \mathbb{R} \rightarrow \mathbb{R}$,其输入和输出都是标量。$f$ 的 *导数* 被定义为 +假设我们有一个函数 $f: \mathbb{R} \rightarrow \mathbb{R}$,其输入和输出都是标量。(**$f$ 的 *导数* 被定义为**) -$$f'(x) = \lim_{h \rightarrow 0} \frac{f(x+h) - f(x)}{h},$$ +(**$$f'(x) = \lim_{h \rightarrow 0} \frac{f(x+h) - f(x)}{h},$$**) :eqlabel:`eq_derivative` 如果这个极限存在。如果$f'(a)$存在,则称$f$在$a$处是*可微*(differentiable)的。如果 $f$ 在一个区间内的每个数上都是可微的,则此函数在此区间中是可微的。我们可以将 :eqref:`eq_derivative` 中的导数 $f'(x)$ 解释为$f(x)$相对于 $x$ 的 *瞬时*(instantaneous) 变化率。所谓的瞬时变化率是基于$x$中的变化$h$,且$h$接近$0$。 为了更好地解释导数,让我们用一个例子来做实验。 -定义$u = f(x) = 3x^2-4x$. 
+(**定义$u = f(x) = 3x^2-4x$.**) ```{.python .input} %matplotlib inline @@ -61,7 +61,7 @@ def f(x): return 3 * x ** 2 - 4 * x ``` -通过令 $x=1$ 并让 $h$ 接近 $0$, :eqref:`eq_derivative` 中$\frac{f(x+h) - f(x)}{h}$ 的数值结果接近 $2$。虽然这个实验不是一个数学证明,但我们稍后会看到,当 $x=1$时,导数 $u'$是 $2$ 。 +[**通过令 $x=1$ 并让 $h$ 接近 $0$,**] :eqref:`eq_derivative` 中$(**\frac{f(x+h) - f(x)}{h}$ 的数值结果接近 $2$**)。虽然这个实验不是一个数学证明,但我们稍后会看到,当 $x=1$时,导数 $u'$是 $2$ 。 ```{.python .input} #@tab all @@ -104,7 +104,7 @@ $$\frac{d}{dx} \left[\frac{f(x)}{g(x)}\right] = \frac{g(x) \frac{d}{dx} [f(x)] - 现在我们可以应用上述几个法则来计算 $u' = f'(x) = 3 \frac{d}{dx} x^2-4\frac{d}{dx}x = 6x-4$。因此,通过令 $x = 1$ ,我们有 $u' = 2$ :这一点得到了我们在本节前面的实验的支持,在这个实验中,数值结果接近$2$。当 $x=1$ 时,此导数也是曲线 $u = f(x)$ 切线的斜率。 -为了对导数的这种解释进行可视化,我们将使用 `matplotlib`,一个Python中流行的绘图库。要配置`matplotlib`生成图形的属性,我们需要定义几个函数。 +[**为了对导数的这种解释进行可视化,**]我们将使用 `matplotlib`,一个Python中流行的绘图库。要配置`matplotlib`生成图形的属性,我们需要(**定义几个函数**)。 在下面,`use_svg_display` 函数指定 `matplotlib` 软件包输出svg图表以获得更清晰的图像。 注意,注释`#@save`是一个特殊的标记,会将对应的函数、类或语句保存在`d2l`包中 @@ -182,7 +182,7 @@ def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None, set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend) ``` -现在我们可以绘制函数 $u = f(x)$ 及其在 $x=1$ 处的切线 $y = 2x - 3$,其中系数$2$是切线的斜率。 +现在我们可以[**绘制函数 $u = f(x)$ 及其在 $x=1$ 处的切线 $y = 2x - 3$**],其中系数$2$是切线的斜率。 ```{.python .input} #@tab all diff --git a/chapter_preliminaries/calculus_origin.md b/chapter_preliminaries/calculus_origin.md index 6848c7f0f..8ccd0311c 100644 --- a/chapter_preliminaries/calculus_origin.md +++ b/chapter_preliminaries/calculus_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/calculus.md -commit: 9e55a9c ---- - # Calculus :label:`sec_calculus` @@ -39,14 +34,13 @@ Thus we can decompose the task of fitting models into two key concerns: i) *optimization*: the process of fitting our models to observed data; ii) *generalization*: the mathematical principles and practitioners' wisdom that guide as to how to produce models whose validity extends -beyond the exact set of data points used to train them. +beyond the exact set of data examples used to train them. To help you understand optimization problems and methods in later chapters, here we give a very brief primer on differential calculus that is commonly used in deep learning. - ## Derivatives and Differentiation We begin by addressing the calculation of derivatives, @@ -60,9 +54,10 @@ by an infinitesimally small amount. Suppose that we have a function $f: \mathbb{R} \rightarrow \mathbb{R}$, whose input and output are both scalars. -The *derivative* of $f$ is defined as +[**The *derivative* of $f$ is defined as**] -$$f'(x) = \lim_{h \rightarrow 0} \frac{f(x+h) - f(x)}{h},$$ + +(**$$f'(x) = \lim_{h \rightarrow 0} \frac{f(x+h) - f(x)}{h},$$**) :eqlabel:`eq_derivative` if this limit exists. @@ -78,7 +73,7 @@ the variation $h$ in $x$, which approaches $0$. To illustrate derivatives, let us experiment with an example. -Define $u = f(x) = 3x^2-4x$. +(**Define $u = f(x) = 3x^2-4x$.**) ```{.python .input} %matplotlib inline @@ -113,9 +108,10 @@ def f(x): return 3 * x ** 2 - 4 * x ``` -By setting $x=1$ and letting $h$ approach $0$, -the numerical result of $\frac{f(x+h) - f(x)}{h}$ -in :eqref:`eq_derivative` approaches $2$. +[**By setting $x=1$ and letting $h$ approach $0$, +the numerical result of $\frac{f(x+h) - f(x)}{h}$**] +in :eqref:`eq_derivative` +(**approaches $2$.**) Though this experiment is not a mathematical proof, we will see later that the derivative $u'$ is $2$ when $x=1$. 
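The experiment referred to above can be reproduced in a few lines of plain Python (a minimal sketch of the $h \rightarrow 0$ check, independent of the book's own helper functions):

```python
# Minimal sketch: shrink h and watch the difference quotient of
# f(x) = 3x**2 - 4x at x = 1 approach the derivative f'(1) = 2.
def f(x):
    return 3 * x ** 2 - 4 * x

h = 0.1
for _ in range(5):
    print(f'h={h:.5f}, numerical limit={(f(1 + h) - f(1)) / h:.5f}')
    h *= 0.1
```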
@@ -170,13 +166,17 @@ where the numerical result approaches $2$. This derivative is also the slope of the tangent line to the curve $u = f(x)$ when $x = 1$. -To visualize such an interpretation of derivatives, -we will use `matplotlib`, +[**To visualize such an interpretation of derivatives, +we will use `matplotlib`,**] + a popular plotting library in Python. To configure properties of the figures produced by `matplotlib`, we need to define a few functions. In the following, the `use_svg_display` function specifies the `matplotlib` package to output the svg figures for sharper images. +Note that the comment `#@save` is a special mark where the following function, +class, or statements are saved in the `d2l` package +so later they can be directly invoked (e.g., `d2l.use_svg_display()`) without being redefined. ```{.python .input} #@tab all @@ -253,7 +253,7 @@ def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None, set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend) ``` -Now we can plot the function $u = f(x)$ and its tangent line $y = 2x - 3$ at $x=1$, where the coefficient $2$ is the slope of the tangent line. +Now we can [**plot the function $u = f(x)$ and its tangent line $y = 2x - 3$ at $x=1$**], where the coefficient $2$ is the slope of the tangent line. ```{.python .input} #@tab all diff --git a/chapter_preliminaries/index_origin.md b/chapter_preliminaries/index_origin.md index a9ba78fe4..1ab979216 100644 --- a/chapter_preliminaries/index_origin.md +++ b/chapter_preliminaries/index_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/index.md -commit: 9e55a9c ---- - # Preliminaries :label:`chap_preliminaries` diff --git a/chapter_preliminaries/linear-algebra.md b/chapter_preliminaries/linear-algebra.md index 02080b5d3..e1b52719d 100644 --- a/chapter_preliminaries/linear-algebra.md +++ b/chapter_preliminaries/linear-algebra.md @@ -10,7 +10,7 @@ 在本书中,我们采用了数学表示法,其中标量变量由普通小写字母表示(例如,$x$、$y$ 和 $z$)。我们用 $\mathbb{R}$ 表示所有(连续)*实数* 标量的空间。为了方便,我们之后将严格定义 *空间*(space)是什么,但现在只要记住,表达式 $x \in \mathbb{R}$ 是表示$x$是一个实值标量的严格形式。符号 $\in$ 称为 “属于”,它表示“是集合中的成员”。我们可以用 $x, y \in \{0, 1\}$ 来表明 $x$ 和 $y$ 是值只能为 $0$ 或 $1$的数字。 -标量由只有一个元素的张量表示。在下面的代码中,我们实例化两个标量,并使用它们执行一些熟悉的算术运算,即加法,乘法,除法和指数。 +(**标量由只有一个元素的张量表示**)。在下面的代码中,我们实例化两个标量,并使用它们执行一些熟悉的算术运算,即加法,乘法,除法和指数。 ```{.python .input} from mxnet import np, npx @@ -44,7 +44,7 @@ x + y, x * y, x / y, x**y ## 向量 -你可以将向量视为标量值列表。我们将这些值称为向量的 *元素*(elements)或*分量*(components)。当我们的向量表示数据集中的样本时,它们的值具有一定的现实意义。例如,如果我们正在训练一个模型来预测贷款违约风险,我们可能会将每个申请人与一个向量相关联,其分量与其收入、工作年限、过往违约次数和其他因素相对应。如果我们正在研究医院患者可能面临的心脏病发作风险,我们可能会用一个向量来表示每个患者,其分量为最近的生命体征、胆固醇水平、每天运动时间等。在数学表示法中,我们通常将向量表示为粗体、小写的符号(例如,$\mathbf{x}$、$\mathbf{y}$和$\mathbf{z})$)。 +[**你可以将向量视为标量值列表**]。我们将这些值称为向量的 *元素*(elements)或*分量*(components)。当我们的向量表示数据集中的样本时,它们的值具有一定的现实意义。例如,如果我们正在训练一个模型来预测贷款违约风险,我们可能会将每个申请人与一个向量相关联,其分量与其收入、工作年限、过往违约次数和其他因素相对应。如果我们正在研究医院患者可能面临的心脏病发作风险,我们可能会用一个向量来表示每个患者,其分量为最近的生命体征、胆固醇水平、每天运动时间等。在数学表示法中,我们通常将向量表示为粗体、小写的符号(例如,$\mathbf{x}$、$\mathbf{y}$和$\mathbf{z})$)。 我们通过一维张量处理向量。一般来说,张量可以具有任意长度,取决于机器的内存限制。 @@ -70,7 +70,7 @@ x $$\mathbf{x} =\begin{bmatrix}x_{1} \\x_{2} \\ \vdots \\x_{n}\end{bmatrix},$$ :eqlabel:`eq_vec_def` -其中 $x_1, \ldots, x_n$ 是向量的元素。在代码中,我们通过张量的索引来访问任一元素。 +其中 $x_1, \ldots, x_n$ 是向量的元素。在代码中,我们(**通过张量的索引来访问任一元素**)。 ```{.python .input} x[3] @@ -90,7 +90,7 @@ x[3] 让我们回顾一下 :numref:`sec_ndarray` 中的一些概念。向量只是一个数字数组。就像每个数组都有一个长度一样,每个向量也是如此。在数学表示法中,如果我们想说一个向量 $\mathbf{x}$ 由 $n$ 个实值标量组成,我们可以将其表示为 $\mathbf{x} \in 
\mathbb{R}^n$。向量的长度通常称为向量的 *维度*(dimension)。 -与普通的 Python 数组一样,我们可以通过调用 Python 的内置 `len()` 函数来访问张量的长度。 +与普通的 Python 数组一样,我们可以通过调用 Python 的内置 `len()` 函数来[**访问张量的长度**]。 ```{.python .input} len(x) @@ -106,7 +106,7 @@ len(x) len(x) ``` -当用张量表示一个向量(只有一个轴)时,我们也可以通过 `.shape` 属性访问向量的长度。形状(shape)是一个元组,列出了张量沿每个轴的长度(维数)。对于只有一个轴的张量,形状只有一个元素。 +当用张量表示一个向量(只有一个轴)时,我们也可以通过 `.shape` 属性访问向量的长度。形状(shape)是一个元组,列出了张量沿每个轴的长度(维数)。对于(**只有一个轴的张量,形状只有一个元素。**) ```{.python .input} x.shape @@ -135,7 +135,7 @@ $$\mathbf{A}=\begin{bmatrix} a_{11} & a_{12} & \cdots & a_{1n} \\ a_{21} & a_{22 对于任意$\mathbf{A} \in \mathbb{R}^{m \times n}$,$\mathbf{A}$的形状是($m$, $n$)或$m \times n$。当矩阵具有相同数量的行和列时,其形状将变为正方形;因此,它被称为 *方矩阵*(square matrix)。 -当调用函数来实例化张量时,我们可以通过指定两个分量$m$ 和 $n$来创建一个形状为$m \times n$ 的矩阵。 +当调用函数来实例化张量时,我们可以[**通过指定两个分量$m$ 和 $n$来创建一个形状为$m \times n$ 的矩阵**]。 ```{.python .input} A = np.arange(20).reshape(5, 4) @@ -168,7 +168,7 @@ $$ \end{bmatrix}. $$ -现在我们在代码中访问矩阵的转置。 +现在我们在代码中访问(**矩阵的转置**)。 ```{.python .input} A.T @@ -184,7 +184,7 @@ A.T tf.transpose(A) ``` -作为方矩阵的一种特殊类型,*对称矩阵*(symmetric matrix) $\mathbf{A}$ 等于其转置:$\mathbf{A} = \mathbf{A}^\top$。这里我们定义一个对称矩阵 `B`: +作为方矩阵的一种特殊类型,[***对称矩阵*(symmetric matrix) $\mathbf{A}$ 等于其转置:$\mathbf{A} = \mathbf{A}^\top$**]。这里我们定义一个对称矩阵 `B`: ```{.python .input} B = np.array([[1, 2, 3], [2, 0, 4], [3, 4, 5]]) @@ -223,7 +223,7 @@ B == tf.transpose(B) ## 张量 -就像向量是标量的推广,矩阵是向量的推广一样,我们可以构建具有更多轴的数据结构。张量(本小节中的 “张量” 指代数对象)为我们提供了描述具有任意数量轴的$n$维数组的通用方法。例如,向量是一阶张量,矩阵是二阶张量。张量用特殊字体的大写字母(例如,$\mathsf{X}$、$\mathsf{Y}$ 和 $\mathsf{Z}$)表示,它们的索引机制(例如 $x_{ijk}$ 和 $[\mathsf{X}]_{1, 2i-1, 3}$)与矩阵类似。 +[**就像向量是标量的推广,矩阵是向量的推广一样,我们可以构建具有更多轴的数据结构**]。张量(本小节中的 “张量” 指代数对象)为我们提供了描述具有任意数量轴的$n$维数组的通用方法。例如,向量是一阶张量,矩阵是二阶张量。张量用特殊字体的大写字母(例如,$\mathsf{X}$、$\mathsf{Y}$ 和 $\mathsf{Z}$)表示,它们的索引机制(例如 $x_{ijk}$ 和 $[\mathsf{X}]_{1, 2i-1, 3}$)与矩阵类似。 当我们开始处理图像时,张量将变得更加重要,图像以$n$维数组形式出现,其中3个轴对应于高度、宽度,以及一个*通道*(channel)轴,用于堆叠颜色通道(红色、绿色和蓝色)。现在,我们将跳过高阶张量,集中在基础知识上。 @@ -246,7 +246,7 @@ X ## 张量算法的基本性质 -标量、向量、矩阵和任意数量轴的张量(本小节中的 “张量” 指代数对象)有一些很好的属性,通常会派上用场。例如,你可能已经从逐元素操作的定义中注意到,任何逐元素的一元运算都不会改变其操作数的形状。同样,给定具有相同形状的任何两个张量,任何逐元素二元运算的结果都将是相同形状的张量。例如,将两个相同形状的矩阵相加会在这两个矩阵上执行元素加法。 +标量、向量、矩阵和任意数量轴的张量(本小节中的 “张量” 指代数对象)有一些很好的属性,通常会派上用场。例如,你可能已经从逐元素操作的定义中注意到,任何逐元素的一元运算都不会改变其操作数的形状。同样,[**给定具有相同形状的任何两个张量,任何逐元素二元运算的结果都将是相同形状的张量**]。例如,将两个相同形状的矩阵相加会在这两个矩阵上执行元素加法。 ```{.python .input} A = np.arange(20).reshape(5, 4) @@ -268,7 +268,7 @@ B = A # 不能通过分配新内存将A克隆到B A, A + B ``` -具体而言,两个矩阵的逐元素乘法称为 *哈达玛积*(Hadamard product)(数学符号 $\odot$)。对于矩阵 $\mathbf{B} \in \mathbb{R}^{m \times n}$,其中第 $i$ 行和第 $j$ 列的元素是 $b_{ij}$。矩阵$\mathbf{A}$(在 :eqref:`eq_matrix_def` 中定义)和 $\mathbf{B}$的哈达玛积为: +具体而言,[**两个矩阵的逐元素乘法称为 *哈达玛积*(Hadamard product)(数学符号 $\odot$)**]。对于矩阵 $\mathbf{B} \in \mathbb{R}^{m \times n}$,其中第 $i$ 行和第 $j$ 列的元素是 $b_{ij}$。矩阵$\mathbf{A}$(在 :eqref:`eq_matrix_def` 中定义)和 $\mathbf{B}$的哈达玛积为: $$ \mathbf{A} \odot \mathbf{B} = @@ -319,7 +319,7 @@ a + X, (a * X).shape ## 汇总 :label:`subseq_lin-alg-reduction` -我们可以对任意张量进行的一个有用的操作是计算其元素的和。在数学表示法中,我们使用 $\sum$ 符号表示求和。为了表示长度为$d$的向量中元素的总和,可以记为 $\sum_{i=1}^d x_i$。在代码中,我们可以调用计算求和的函数: +我们可以对任意张量进行的一个有用的操作是[**计算其元素的和**]。在数学表示法中,我们使用 $\sum$ 符号表示求和。为了表示长度为$d$的向量中元素的总和,可以记为 $\sum_{i=1}^d x_i$。在代码中,我们可以调用计算求和的函数: ```{.python .input} x = np.arange(4) @@ -338,7 +338,7 @@ x = tf.range(4, dtype=tf.float32) x, tf.reduce_sum(x) ``` -我们可以表示任意形状张量的元素和。例如,矩阵 $\mathbf{A}$ 中元素的和可以记为$\sum_{i=1}^{m} \sum_{j=1}^{n} a_{ij}$。 +我们可以(**表示任意形状张量的元素和**)。例如,矩阵 $\mathbf{A}$ 中元素的和可以记为$\sum_{i=1}^{m} \sum_{j=1}^{n} a_{ij}$。 ```{.python .input} A.shape, A.sum() @@ -355,7 +355,7 @@ 
A.shape, tf.reduce_sum(A) ``` 默认情况下,调用求和函数会将一个张量在所有轴上汇总为一个标量。 -我们还可以指定求和汇总张量的轴。以矩阵为例。为了通过求和所有行的元素来汇总行维度(轴0),我们可以在调用函数时指定`axis=0`。 +我们还可以[**指定求和汇总张量的轴**]。以矩阵为例。为了通过求和所有行的元素来汇总行维度(轴0),我们可以在调用函数时指定`axis=0`。 由于输入矩阵沿0轴汇总以生成输出向量,因此输入的轴0的维数在输出形状中丢失。 ```{.python .input} @@ -410,7 +410,7 @@ A.sum(axis=[0, 1]) # Same as `A.sum()` tf.reduce_sum(A, axis=[0, 1]) # Same as `tf.reduce_sum(A)` ``` -一个与求和相关的量是 *平均值*(mean或average)。我们通过将总和除以元素总数来计算平均值。在代码中,我们可以调用函数来计算任意形状张量的平均值。 +[**一个与求和相关的量是 *平均值*(mean或average)**]。我们通过将总和除以元素总数来计算平均值。在代码中,我们可以调用函数来计算任意形状张量的平均值。 ```{.python .input} A.mean(), A.sum() / A.size @@ -445,7 +445,7 @@ tf.reduce_mean(A, axis=0), tf.reduce_sum(A, axis=0) / A.shape[0] ### 非汇总求和 :label:`subseq_lin-alg-non-reduction` -但是,有时在调用函数来计算总和或均值时保持轴数不变会很有用。 +但是,有时在调用函数来[**计算总和或均值时保持轴数不变**]会很有用。 ```{.python .input} sum_A = A.sum(axis=1, keepdims=True) @@ -464,7 +464,7 @@ sum_A = tf.reduce_sum(A, axis=1, keepdims=True) sum_A ``` -例如,由于 `sum_A` 在对每行进行求和后仍保持两个轴,我们可以通过广播将 `A` 除以 `sum_A` 。 +例如,由于 `sum_A` 在对每行进行求和后仍保持两个轴,我们可以(**通过广播将 `A` 除以 `sum_A`**) 。 ```{.python .input} A / sum_A @@ -480,7 +480,7 @@ A / sum_A A / sum_A ``` -如果我们想沿某个轴计算 `A` 元素的累积总和,比如 `axis=0`(逐行计算),我们可以调用 `cumsum` 函数。此函数不会沿任何轴汇总输入张量。 +如果我们想沿[**某个轴计算 `A` 元素的累积总和**],比如 `axis=0`(逐行计算),我们可以调用 `cumsum` 函数。此函数不会沿任何轴汇总输入张量。 ```{.python .input} A.cumsum(axis=0) @@ -501,6 +501,8 @@ tf.cumsum(A, axis=0) 到目前为止,我们只执行了逐元素操作、求和及平均值。如果这就是我们所能做的,那么线性代数可能就不需要单独一节了。 但是,最基本的操作之一是点积。给定两个向量 $\mathbf{x}, \mathbf{y} \in \mathbb{R}^d$,它们的 *点积*(dot product) $\mathbf{x}^\top \mathbf{y}$(或 $\langle \mathbf{x}, \mathbf{y} \rangle$)是相同位置的逐元素乘积的和:$\mathbf{x}^\top \mathbf{y} = \sum_{i=1}^{d} x_i y_i$。 +[~~点积是相同位置的逐元素乘积的和~~] + ```{.python .input} y = np.ones(4) x, y, np.dot(x, y) @@ -518,7 +520,7 @@ y = tf.ones(4, dtype=tf.float32) x, y, tf.tensordot(x, y, axes=1) ``` -注意,我们可以通过执行逐元素乘法,然后进行求和来表示两个向量的点积: +注意,(**我们可以通过执行逐元素乘法,然后进行求和来表示两个向量的点积**): ```{.python .input} np.sum(x * y) @@ -548,7 +550,7 @@ $$\mathbf{A}= \mathbf{a}^\top_m \\ \end{bmatrix},$$ -其中每个$\mathbf{a}^\top_{i} \in \mathbb{R}^n$ 都是行向量,表示矩阵的 $i^\mathrm{th}$ 行。矩阵向量积 $\mathbf{A}\mathbf{x}$ 是一个长度为 $m$ 的列向量,其 $i^\mathrm{th}$ 元素是点积 $\mathbf{a}^\top_i \mathbf{x}$: +其中每个$\mathbf{a}^\top_{i} \in \mathbb{R}^n$ 都是行向量,表示矩阵的 $i^\mathrm{th}$ 行。[**矩阵向量积 $\mathbf{A}\mathbf{x}$ 是一个长度为 $m$ 的列向量,其 $i^\mathrm{th}$ 元素是点积 $\mathbf{a}^\top_i \mathbf{x}$**]: $$ \mathbf{A}\mathbf{x} @@ -637,7 +639,7 @@ $$\mathbf{C} = \mathbf{AB} = \begin{bmatrix} \end{bmatrix}. 
$$ -我们可以将矩阵-矩阵乘法 $\mathbf{AB}$ 看作是简单地执行 $m$次矩阵-向量积,并将结果拼接在一起,形成一个 $n \times m$ 矩阵。在下面的代码中,我们在 `A` 和 `B` 上执行矩阵乘法。这里的`A` 是一个5行4列的矩阵,`B`是一个4行3列的矩阵。相乘后,我们得到了一个5行3列的矩阵。 +[**我们可以将矩阵-矩阵乘法 $\mathbf{AB}$ 看作是简单地执行 $m$次矩阵-向量积,并将结果拼接在一起,形成一个 $n \times m$ 矩阵**]。在下面的代码中,我们在 `A` 和 `B` 上执行矩阵乘法。这里的`A` 是一个5行4列的矩阵,`B`是一个4行3列的矩阵。相乘后,我们得到了一个5行3列的矩阵。 ```{.python .input} B = np.ones(shape=(4, 3)) @@ -682,9 +684,9 @@ $$f(\mathbf{x}) \geq 0.$$ $$\forall i, [\mathbf{x}]_i = 0 \Leftrightarrow f(\mathbf{x})=0.$$ 你可能会注意到,范数听起来很像距离的度量。如果你还记得小学时的欧几里得距离(想想毕达哥拉斯定理),那么非负性的概念和三角不等式可能会给你一些启发。 -事实上,欧几里得距离是一个范数:具体而言,它是 $L_2$ 范数。假设$n$-维向量$\mathbf{x}$中的元素是$x_1, \ldots, x_n$ 的 $L_2$ *范数* 是向量元素平方和的平方根: +事实上,欧几里得距离是一个范数:具体而言,它是 $L_2$ 范数。假设$n$-维向量$\mathbf{x}$中的元素是$x_1, \ldots, x_n$ 的 [**$L_2$ *范数* 是向量元素平方和的平方根:**] -$$\|\mathbf{x}\|_2 = \sqrt{\sum_{i=1}^n x_i^2},$$ +(**$$\|\mathbf{x}\|_2 = \sqrt{\sum_{i=1}^n x_i^2},$$**) 其中,在 $L_2$ 范数中常常省略下标 $2$,也就是说,$\|\mathbf{x}\|$ 等同于 $\|\mathbf{x}\|_2$。在代码中,我们可以按如下方式计算向量的 $L_2$ 范数。 @@ -705,9 +707,9 @@ u = tf.constant([3.0, -4.0]) tf.norm(u) ``` -在深度学习中,我们更经常地使用平方 $L_2$ 范数。你还会经常遇到 $L_1$ 范数,它表示为向量元素的绝对值之和: +在深度学习中,我们更经常地使用平方 $L_2$ 范数。你还会经常遇到 [**$L_1$ 范数,它表示为向量元素的绝对值之和:**] -$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$ +(**$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$**) 与 $L_2$ 范数相比,$L_1$ 范数受异常值的影响较小。为了计算 $L_1$ 范数,我们将绝对值函数和逐元素求和组合起来。 @@ -729,9 +731,9 @@ $L_2$ 范数和 $L_1$ 范数都是更一般的$L_p$范数的特例: $$\|\mathbf{x}\|_p = \left(\sum_{i=1}^n \left|x_i \right|^p \right)^{1/p}.$$ -类似于向量的$L_2$ 范数,矩阵 $\mathbf{X} \in \mathbb{R}^{m \times n}$ 的 *弗罗贝尼乌斯范数*(Frobenius norm) 是矩阵元素的平方和的平方根: +类似于向量的$L_2$ 范数,[**矩阵**] $\mathbf{X} \in \mathbb{R}^{m \times n}$ (**的 *弗罗贝尼乌斯范数*(Frobenius norm) 是矩阵元素的平方和的平方根:**) -$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$ +(**$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$**) 弗罗贝尼乌斯范数满足向量范数的所有性质。它的行为就好像它是矩阵形向量的 $L_2$ 范数。调用以下函数将计算矩阵的弗罗贝尼乌斯范数。 diff --git a/chapter_preliminaries/linear-algebra_origin.md b/chapter_preliminaries/linear-algebra_origin.md index cd8be46e2..1804f6c19 100644 --- a/chapter_preliminaries/linear-algebra_origin.md +++ b/chapter_preliminaries/linear-algebra_origin.md @@ -1,11 +1,7 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/linear-algebra.md -commit: 9e55a9c ---- - # Linear Algebra :label:`sec_linear-algebra` + Now that you can store and manipulate data, let us briefly review the subset of basic linear algebra that you will need to understand and implement @@ -15,7 +11,6 @@ and operations in linear algebra, expressing each of them through mathematical notation and the corresponding implementation in code. - ## Scalars If you never studied linear algebra or machine learning, @@ -49,7 +44,7 @@ Analogously, we could write $x, y \in \{0, 1\}$ to state that $x$ and $y$ are numbers whose value can only be $0$ or $1$. -A scalar is represented by a tensor with just one element. +(**A scalar is represented by a tensor with just one element.**) In the next snippet, we instantiate two scalars and perform some familiar arithmetic operations with them, namely addition, multiplication, division, and exponentiation. @@ -86,7 +81,7 @@ x + y, x * y, x / y, x**y ## Vectors -You can think of a vector as simply a list of scalar values. +[**You can think of a vector as simply a list of scalar values.**] We call these values the *elements* (*entries* or *components*) of the vector. When our vectors represent examples from our dataset, their values hold some real-world significance. 
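To make this concrete (an illustrative sketch with hypothetical values), a single data example, say a loan applicant described by income, years of employment, and number of previous defaults, can be stored as one vector:

```python
# Hypothetical applicant encoded as a vector whose components correspond to
# income, years of employment, and number of previous defaults.
import torch

applicant = torch.tensor([52000.0, 3.0, 1.0])
print(applicant, applicant.shape)
```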
@@ -136,7 +131,8 @@ $$\mathbf{x} =\begin{bmatrix}x_{1} \\x_{2} \\ \vdots \\x_{n}\end{bmatrix},$$ where $x_1, \ldots, x_n$ are elements of the vector. -In code, we access any element by indexing into the tensor. +In code, +we (**access any element by indexing into the tensor.**) ```{.python .input} x[3] @@ -162,7 +158,8 @@ consists of $n$ real-valued scalars, we can express this as $\mathbf{x} \in \mathbb{R}^n$. The length of a vector is commonly called the *dimension* of the vector. -As with an ordinary Python array, we can access the length of a tensor +As with an ordinary Python array, +we [**can access the length of a tensor**] by calling Python's built-in `len()` function. ```{.python .input} @@ -183,7 +180,7 @@ When a tensor represents a vector (with precisely one axis), we can also access its length via the `.shape` attribute. The shape is a tuple that lists the length (dimensionality) along each axis of the tensor. -For tensors with just one axis, the shape has just one element. +(**For tensors with just one axis, the shape has just one element.**) ```{.python .input} x.shape @@ -231,7 +228,7 @@ is ($m$, $n$) or $m \times n$. Specifically, when a matrix has the same number of rows and columns, its shape becomes a square; thus, it is called a *square matrix*. -We can create an $m \times n$ matrix +We can [**create an $m \times n$ matrix**] by specifying a shape with two components $m$ and $n$ when calling any of our favorite functions for instantiating a tensor. @@ -280,7 +277,7 @@ $$ \end{bmatrix}. $$ -Now we access a matrix's transpose in code. +Now we access a (**matrix's transpose**) in code. ```{.python .input} A.T @@ -297,8 +294,8 @@ tf.transpose(A) ``` As a special type of the square matrix, -a *symmetric matrix* $\mathbf{A}$ is equal to its transpose: -$\mathbf{A} = \mathbf{A}^\top$. +[**a *symmetric matrix* $\mathbf{A}$ is equal to its transpose: +$\mathbf{A} = \mathbf{A}^\top$.**] Here we define a symmetric matrix `B`. ```{.python .input} @@ -320,6 +317,7 @@ B Now we compare `B` with its transpose. + ```{.python .input} B == B.T ``` @@ -336,28 +334,34 @@ B == tf.transpose(B) Matrices are useful data structures: they allow us to organize data that have different modalities of variation. -For example, rows in our matrix might correspond to different houses (data points), +For example, rows in our matrix might correspond to different houses (data examples), while columns might correspond to different attributes. This should sound familiar if you have ever used spreadsheet software or have read :numref:`sec_pandas`. Thus, although the default orientation of a single vector is a column vector, in a matrix that represents a tabular dataset, -it is more conventional to treat each data point as a row vector in the matrix. +it is more conventional to treat each data example as a row vector in the matrix. And, as we will see in later chapters, this convention will enable common deep learning practices. For example, along the outermost axis of a tensor, -we can access or enumerate minibatches of data points, -or just data points if no minibatch exists. +we can access or enumerate minibatches of data examples, +or just data examples if no minibatch exists. ## Tensors -Just as vectors generalize scalars, and matrices generalize vectors, we can build data structures with even more axes. Tensors ("tensors" in this subsection refer to algebraic objects) give us a generic way of describing $n$-dimensional arrays with an arbitrary number of axes. 
Vectors, for example, are first-order tensors, and matrices are second-order tensors. +Just as vectors generalize scalars, and matrices generalize vectors, we can build data structures with even more axes. +[**Tensors**] +("tensors" in this subsection refer to algebraic objects) +(**give us a generic way of describing $n$-dimensional arrays with an arbitrary number of axes.**) +Vectors, for example, are first-order tensors, and matrices are second-order tensors. Tensors are denoted with capital letters of a special font face (e.g., $\mathsf{X}$, $\mathsf{Y}$, and $\mathsf{Z}$) and their indexing mechanism (e.g., $x_{ijk}$ and $[\mathsf{X}]_{1, 2i-1, 3}$) is similar to that of matrices. -Tensors will become more important when we start working with images, which arrive as $n$-dimensional arrays with 3 axes corresponding to the height, width, and a *channel* axis for stacking the color channels (red, green, and blue). For now, we will skip over higher order tensors and focus on the basics. +Tensors will become more important when we start working with images, + which arrive as $n$-dimensional arrays with 3 axes corresponding to the height, width, and a *channel* axis for stacking the color channels (red, green, and blue). For now, we will skip over higher order tensors and focus on the basics. + ```{.python .input} X = np.arange(24).reshape(2, 3, 4) @@ -384,9 +388,10 @@ have some nice properties that often come in handy. For example, you might have noticed from the definition of an elementwise operation that any elementwise unary operation does not change the shape of its operand. -Similarly, given any two tensors with the same shape, +Similarly, +[**given any two tensors with the same shape, the result of any binary elementwise operation -will be a tensor of that same shape. +will be a tensor of that same shape.**] For example, adding two matrices of the same shape performs elementwise addition over these two matrices. @@ -410,7 +415,9 @@ B = A # No cloning of `A` to `B` by allocating new memory A, A + B ``` -Specifically, elementwise multiplication of two matrices is called their *Hadamard product* (math notation $\odot$). +Specifically, +[**elementwise multiplication of two matrices is called their *Hadamard product***] +(math notation $\odot$). Consider matrix $\mathbf{B} \in \mathbb{R}^{m \times n}$ whose element of row $i$ and column $j$ is $b_{ij}$. The Hadamard product of matrices $\mathbf{A}$ (defined in :eqref:`eq_matrix_def`) and $\mathbf{B}$ $$ @@ -437,7 +444,7 @@ A * B A * B ``` -Multiplying or adding a tensor by a scalar also does not change the shape of the tensor, +[**Multiplying or adding a tensor by a scalar**] also does not change the shape of the tensor, where each element of the operand tensor will be added or multiplied by the scalar. ```{.python .input} @@ -464,7 +471,8 @@ a + X, (a * X).shape :label:`subseq_lin-alg-reduction` One useful operation that we can perform with arbitrary tensors -is to calculate the sum of their elements. +is to +calculate [**the sum of their elements.**] In mathematical notation, we express sums using the $\sum$ symbol. To express the sum of the elements in a vector $\mathbf{x}$ of length $d$, we write $\sum_{i=1}^d x_i$. @@ -487,7 +495,7 @@ x = tf.range(4, dtype=tf.float32) x, tf.reduce_sum(x) ``` -We can express sums over the elements of tensors of arbitrary shape. 
+We can express [**sums over the elements of tensors of arbitrary shape.**] For example, the sum of the elements of an $m \times n$ matrix $\mathbf{A}$ could be written $\sum_{i=1}^{m} \sum_{j=1}^{n} a_{ij}$. ```{.python .input} @@ -506,7 +514,7 @@ A.shape, tf.reduce_sum(A) By default, invoking the function for calculating the sum *reduces* a tensor along all its axes to a scalar. -We can also specify the axes along which the tensor is reduced via summation. +We can also [**specify the axes along which the tensor is reduced via summation.**] Take matrices as an example. To reduce the row dimension (axis 0) by summing up elements of all the rows, we specify `axis=0` when invoking the function. @@ -530,7 +538,8 @@ A_sum_axis0 = tf.reduce_sum(A, axis=0) A_sum_axis0, A_sum_axis0.shape ``` -Specifying `axis=1` will reduce the column dimension (axis 1) by summing up elements of all the columns. +Specifying +`axis=1` will reduce the column dimension (axis 1) by summing up elements of all the columns. Thus, the dimension of axis 1 of the input is lost in the output shape. ```{.python .input} @@ -567,7 +576,7 @@ A.sum(axis=[0, 1]) # Same as `A.sum()` tf.reduce_sum(A, axis=[0, 1]) # Same as `tf.reduce_sum(A)` ``` -A related quantity is the *mean*, which is also called the *average*. +[**A related quantity is the *mean*, which is also called the *average*.**] We calculate the mean by dividing the sum by the total number of elements. In code, we could just call the function for calculating the mean on tensors of arbitrary shape. @@ -605,7 +614,8 @@ tf.reduce_mean(A, axis=0), tf.reduce_sum(A, axis=0) / A.shape[0] ### Non-Reduction Sum :label:`subseq_lin-alg-non-reduction` -However, sometimes it can be useful to keep the number of axes unchanged +However, +sometimes it can be useful to [**keep the number of axes unchanged**] when invoking the function for calculating the sum or mean. ```{.python .input} @@ -625,7 +635,8 @@ sum_A = tf.reduce_sum(A, axis=1, keepdims=True) sum_A ``` -For instance, since `sum_A` still keeps its two axes after summing each row, we can divide `A` by `sum_A` with broadcasting. +For instance, +since `sum_A` still keeps its two axes after summing each row, we can (**divide `A` by `sum_A` with broadcasting.**) ```{.python .input} A / sum_A @@ -641,7 +652,7 @@ A / sum_A A / sum_A ``` -If we want to calculate the cumulative sum of elements of `A` along some axis, say `axis=0` (row by row), +If we want to calculate [**the cumulative sum of elements of `A` along some axis**], say `axis=0` (row by row), we can call the `cumsum` function. This function will not reduce the input tensor along any axis. ```{.python .input} @@ -660,7 +671,10 @@ tf.cumsum(A, axis=0) ## Dot Products -So far, we have only performed elementwise operations, sums, and averages. And if this was all we could do, linear algebra probably would not deserve its own section. However, one of the most fundamental operations is the dot product. Given two vectors $\mathbf{x}, \mathbf{y} \in \mathbb{R}^d$, their *dot product* $\mathbf{x}^\top \mathbf{y}$ (or $\langle \mathbf{x}, \mathbf{y} \rangle$) is a sum over the products of the elements at the same position: $\mathbf{x}^\top \mathbf{y} = \sum_{i=1}^{d} x_i y_i$. +So far, we have only performed elementwise operations, sums, and averages. And if this was all we could do, linear algebra probably would not deserve its own section. However, one of the most fundamental operations is the dot product. 
+Given two vectors $\mathbf{x}, \mathbf{y} \in \mathbb{R}^d$, their *dot product* $\mathbf{x}^\top \mathbf{y}$ (or $\langle \mathbf{x}, \mathbf{y} \rangle$) is a sum over the products of the elements at the same position: $\mathbf{x}^\top \mathbf{y} = \sum_{i=1}^{d} x_i y_i$. + +[~~The *dot product* of two vectors is a sum over the products of the elements at the same position~~] ```{.python .input} y = np.ones(4) @@ -679,7 +693,8 @@ y = tf.ones(4, dtype=tf.float32) x, y, tf.tensordot(x, y, axes=1) ``` -Note that we can express the dot product of two vectors equivalently by performing an elementwise multiplication and then a sum: +Note that +(**we can express the dot product of two vectors equivalently by performing an elementwise multiplication and then a sum:**) ```{.python .input} np.sum(x * y) @@ -729,9 +744,10 @@ $$\mathbf{A}= where each $\mathbf{a}^\top_{i} \in \mathbb{R}^n$ is a row vector representing the $i^\mathrm{th}$ row of the matrix $\mathbf{A}$. -The matrix-vector product $\mathbf{A}\mathbf{x}$ + +[**The matrix-vector product $\mathbf{A}\mathbf{x}$ is simply a column vector of length $m$, -whose $i^\mathrm{th}$ element is the dot product $\mathbf{a}^\top_i \mathbf{x}$: +whose $i^\mathrm{th}$ element is the dot product $\mathbf{a}^\top_i \mathbf{x}$:**] $$ \mathbf{A}\mathbf{x} @@ -841,7 +857,7 @@ $$\mathbf{C} = \mathbf{AB} = \begin{bmatrix} \end{bmatrix}. $$ -We can think of the matrix-matrix multiplication $\mathbf{AB}$ as simply performing $m$ matrix-vector products and stitching the results together to form an $n \times m$ matrix. +[**We can think of the matrix-matrix multiplication $\mathbf{AB}$ as simply performing $m$ matrix-vector products and stitching the results together to form an $n \times m$ matrix.**] In the following snippet, we perform matrix multiplication on `A` and `B`. Here, `A` is a matrix with 5 rows and 4 columns, and `B` is a matrix with 4 rows and 3 columns. @@ -911,11 +927,14 @@ In fact, the Euclidean distance is a norm: specifically it is the $L_2$ norm. Suppose that the elements in the $n$-dimensional vector $\mathbf{x}$ are $x_1, \ldots, x_n$. -The $L_2$ *norm* of $\mathbf{x}$ is the square root of the sum of the squares of the vector elements: -$$\|\mathbf{x}\|_2 = \sqrt{\sum_{i=1}^n x_i^2},$$ +[**The $L_2$ *norm* of $\mathbf{x}$ is the square root of the sum of the squares of the vector elements:**] + +(**$$\|\mathbf{x}\|_2 = \sqrt{\sum_{i=1}^n x_i^2},$$**) + -where the subscript $2$ is often omitted in $L_2$ norms, i.e., $\|\mathbf{x}\|$ is equivalent to $\|\mathbf{x}\|_2$. In code, we can calculate the $L_2$ norm of a vector as follows. +where the subscript $2$ is often omitted in $L_2$ norms, i.e., $\|\mathbf{x}\|$ is equivalent to $\|\mathbf{x}\|_2$. In code, +we can calculate the $L_2$ norm of a vector as follows. ```{.python .input} u = np.array([3, -4]) @@ -936,10 +955,12 @@ tf.norm(u) In deep learning, we work more often with the squared $L_2$ norm. -You will also frequently encounter the $L_1$ *norm*, + +You will also frequently encounter [**the $L_1$ *norm***], which is expressed as the sum of the absolute values of the vector elements: -$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$ +(**$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$**) + As compared with the $L_2$ norm, it is less influenced by outliers. 
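The claim about outliers is easy to check numerically (an illustrative sketch with made-up values, using the same `torch.abs(...).sum()` and `torch.norm` calls that appear in the surrounding code cells):

```python
# Illustrative check: a single large outlier enters the L1 norm linearly but
# dominates the L2 norm, since the L2 norm squares each element.
import torch

u = torch.tensor([1.0, 1.0, 1.0, 1.0])
v = torch.tensor([1.0, 1.0, 1.0, 100.0])  # same vector plus one outlier

print(torch.abs(u).sum(), torch.abs(v).sum())  # L1 norm: 4.0 vs. 103.0
print(torch.norm(u), torch.norm(v))            # L2 norm: 2.0 vs. ~100.01
```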
@@ -960,16 +981,17 @@ torch.abs(u).sum() tf.reduce_sum(tf.abs(u)) ``` + Both the $L_2$ norm and the $L_1$ norm are special cases of the more general $L_p$ *norm*: $$\|\mathbf{x}\|_p = \left(\sum_{i=1}^n \left|x_i \right|^p \right)^{1/p}.$$ Analogous to $L_2$ norms of vectors, -the *Frobenius norm* of a matrix $\mathbf{X} \in \mathbb{R}^{m \times n}$ +[**the *Frobenius norm* of a matrix $\mathbf{X} \in \mathbb{R}^{m \times n}$**] is the square root of the sum of the squares of the matrix elements: -$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$ +[**$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$**] The Frobenius norm satisfies all the properties of vector norms. It behaves as if it were an $L_2$ norm of a matrix-shaped vector. diff --git a/chapter_preliminaries/lookup-api.md b/chapter_preliminaries/lookup-api.md index 01e21c79e..63fab7efa 100644 --- a/chapter_preliminaries/lookup-api.md +++ b/chapter_preliminaries/lookup-api.md @@ -13,7 +13,7 @@ ## 查找模块中的所有函数和类 -为了知道模块中可以调用哪些函数和类,我们调用 `dir` 函数。例如,我们可以查询随机数生成模块中的所有属性: +为了知道模块中可以调用哪些函数和类,我们调用 `dir` 函数。例如,我们可以(**查询随机数生成模块中的所有属性:**) ```{.python .input n=1} from mxnet import np @@ -36,7 +36,7 @@ print(dir(tf.random)) ## 查找特定函数和类的用法 -有关如何使用给定函数或类的更具体说明,我们可以调用 `help` 函数。例如,我们来查看张量 `ones` 函数的用法。 +有关如何使用给定函数或类的更具体说明,我们可以调用 `help` 函数。例如,我们来[**查看张量 `ones` 函数的用法。**] ```{.python .input} help(np.ones) @@ -52,7 +52,7 @@ help(torch.ones) help(tf.ones) ``` -从文档中,我们可以看到 `ones` 函数创建一个具有指定形状的新张量,并将所有元素值设置为 1。让我们来运行一个快速测试来确认这一解释: +从文档中,我们可以看到 `ones` 函数创建一个具有指定形状的新张量,并将所有元素值设置为 1。让我们来[**运行一个快速测试**]来确认这一解释: ```{.python .input} np.ones(4) diff --git a/chapter_preliminaries/lookup-api_origin.md b/chapter_preliminaries/lookup-api_origin.md index 1af9d8625..c70806636 100644 --- a/chapter_preliminaries/lookup-api_origin.md +++ b/chapter_preliminaries/lookup-api_origin.md @@ -1,17 +1,22 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/lookup-api.md -commit: 9e55a9c ---- - # Documentation - +:begin_tab:`mxnet` Due to constraints on the length of this book, we cannot possibly introduce every single MXNet function and class (and you probably would not want us to). The API documentation and additional tutorials and examples provide plenty of documentation beyond the book. In this section we provide you with some guidance to exploring the MXNet API. +:end_tab: + +:begin_tab:`pytorch` +Due to constraints on the length of this book, we cannot possibly introduce every single PyTorch function and class (and you probably would not want us to). The API documentation and additional tutorials and examples provide plenty of documentation beyond the book. In this section we provide you with some guidance to exploring the PyTorch API. +:end_tab: + +:begin_tab:`tensorflow` +Due to constraints on the length of this book, we cannot possibly introduce every single TensorFlow function and class (and you probably would not want us to). The API documentation and additional tutorials and examples provide plenty of documentation beyond the book. In this section we provide you with some guidance to exploring the TensorFlow API. +:end_tab: + ## Finding All the Functions and Classes in a Module In order to know which functions and classes can be called in a module, we -invoke the `dir` function. For instance, we can query all properties in the -module for generating random numbers: +invoke the `dir` function. 
For instance, we can (**query all properties in the +module for generating random numbers**): ```{.python .input n=1} from mxnet import np @@ -34,7 +39,7 @@ Generally, we can ignore functions that start and end with `__` (special objects ## Finding the Usage of Specific Functions and Classes -For more specific instructions on how to use a given function or class, we can invoke the `help` function. As an example, let us explore the usage instructions for tensors' `ones` function. +For more specific instructions on how to use a given function or class, we can invoke the `help` function. As an example, let us [**explore the usage instructions for tensors' `ones` function**]. ```{.python .input} help(np.ones) @@ -50,7 +55,7 @@ help(torch.ones) help(tf.ones) ``` -From the documentation, we can see that the `ones` function creates a new tensor with the specified shape and sets all the elements to the value of 1. Whenever possible, you should run a quick test to confirm your interpretation: +From the documentation, we can see that the `ones` function creates a new tensor with the specified shape and sets all the elements to the value of 1. Whenever possible, you should (**run a quick test**) to confirm your interpretation: ```{.python .input} np.ones(4) diff --git a/chapter_preliminaries/ndarray.md b/chapter_preliminaries/ndarray.md index 509cb471f..205969441 100644 --- a/chapter_preliminaries/ndarray.md +++ b/chapter_preliminaries/ndarray.md @@ -1,6 +1,4 @@ -:begin_slide:`keep` # 数据操作 -:end_slide:`keep` :label:`sec_ndarray` 为了能够完成各种操作,我们需要某种方法来存储和操作数据。一般来说,我们需要进行两件重要的事情:(1)获取数据;(2)在数据进入计算机后对其进行处理。如果没有某种方法来存储数据,那么获取数据是没有意义的。我们先尝试一个合成数据。首先,我们介绍$n$维数组,也称为 *张量*(tensor)。 @@ -17,9 +15,7 @@ :end_tab: :begin_tab:`pytorch` -:begin_slide:`keep, cont` -首先,我们导入 `torch`。请注意,虽然它被称为PyTorch,但我们应该导入 `torch` 而不是 `pytorch`。 -:end_slide: +[**首先,我们导入 `torch`。请注意,虽然它被称为PyTorch,但我们应该导入 `torch` 而不是 `pytorch`。**] :end_tab: :begin_tab:`tensorflow` @@ -41,14 +37,11 @@ import torch import tensorflow as tf ``` -张量表示一个数值组成的数组,这个数组可能有多个维度。具有一个轴的张量对应于数学上的 *向量*(vector)。具有两个轴的张量对应于数学上的 *矩阵*(matrix)。具有两个轴以上的张量没有特殊的数学名称。 +[**张量表示一个数值组成的数组,这个数组可能有多个维度**]。具有一个轴的张量对应于数学上的 *向量*(vector)。具有两个轴的张量对应于数学上的 *矩阵*(matrix)。具有两个轴以上的张量没有特殊的数学名称。 首先,我们可以使用 `arange` 创建一个行向量 `x`。这个行向量包含以0开始的前12个整数,它们默认创建为浮点数。张量中的每个值都称为张量的 *元素*(element)。例如,张量 `x` 中有 12 个元素。除非额外指定,新的张量将存储在内存中,并采用基于CPU的计算。 -:begin_slide: -张量表示一个数值数组(可能是多维的)。我们可以得到张量的形状。 -:end_slide: ```{.python .input} x = np.arange(12) @@ -67,7 +60,7 @@ x = tf.range(12) x ``` -我们可以通过张量的 `shape` 属性来访问张量的 *形状*(沿每个轴的长度)。 +[**我们可以通过张量的 `shape` 属性来访问张量的 *形状***] (~~和张量中元素的总数~~)(沿每个轴的长度)。 ```{.python .input} #@tab all @@ -91,12 +84,9 @@ x.numel() tf.size(x) ``` -要改变一个张量的形状而不改变元素数量和元素值,我们可以调用 `reshape` 函数。 +[**要改变一个张量的形状而不改变元素数量和元素值,我们可以调用 `reshape` 函数。**] 例如,我们可以把张量 `x` 从形状为 (12,) 的行向量转换为形状 (3, 4) 的矩阵。这个新的张量包含与转换前相同的值,但是把它们看成一个三行四列的矩阵。要重点说明一下,虽然形状发生了改变,但元素值没有变。注意,通过改变张量的形状,张量的大小不会改变。 -:begin_slide: -我们可以将数组变形,例如通过: -:end_slide: ```{.python .input} #@tab mxnet, pytorch @@ -113,15 +103,9 @@ X 不需要通过手动指定每个维度来改变形状。 如果我们的目标形状是 (高度, 宽度) ,那么在知道宽度后,高度应当会隐式得出,我们不必自己做除法。在上面的例子中,要获得一个有3行的矩阵,我们手动指定了它有3行和4列。幸运的是,张量在给出其他部分后可以自动计算出一个维度。我们可以通过将希望张量自动推断的维度放置 `-1` 来调用此功能。在上面的例子中,我们可以用 `x.reshape(-1, 4)` 或 `x.reshape(3, -1)`来取代`x.reshape(3, 4)`。 -:begin_slide:`cont` -我们可以通过填写`-1`自动让系统推断形状,例如`x.reshape(-1, 4)` 或 `x.reshape(3, -1)`。 -:end_slide: -有时,我们希望使用全0、全1、其他常量或者从特定分布中随机采样的数字,来初始化矩阵。我们可以创建一个形状为 (2, 3, 4) 的张量,其中所有元素都设置为0。代码如下: +有时,我们希望[**使用全0、全1、其他常量或者从特定分布中随机采样的数字**],来初始化矩阵。我们可以创建一个形状为 (2, 3, 4) 的张量,其中所有元素都设置为0。代码如下: 
-:begin_slide: -要显式的初始化张量,例如将其初始化为$0$或$1$,我们可以使用: -:end_slide: ```{.python .input} np.zeros((2, 3, 4)) @@ -155,9 +139,6 @@ tf.ones((2, 3, 4)) 有时我们想从某个概率分布中随机采样来得到张量中每个元素的值。例如,当我们构造数组来作为神经网络中的参数时,我们通常会随机初始化参数的值。以下代码创建一个形状为(3, 4)的张量。其中的每个元素都从均值为0,标准差为1的标准高斯(正态)分布中随机采样。 -:begin_slide: -要生成带有高斯分布的随机矩阵,我们可以使用`randn`。 -:end_slide: ```{.python .input} np.random.normal(0, 1, size=(3, 4)) @@ -173,11 +154,8 @@ torch.randn(3, 4) tf.random.normal(shape=[3, 4]) ``` -我们还可以通过提供包含数值的 Python 列表(或嵌套列表)来为所需张量中的每个元素赋予确定值。在这里,最外层的列表对应于轴 0,内层的列表对应于轴 1。 +我们还可以[**通过提供包含数值的 Python 列表(或嵌套列表)来为所需张量中的每个元素赋予确定值**]。在这里,最外层的列表对应于轴 0,内层的列表对应于轴 1。 -:begin_slide: -我们也可以显式生成张量。 -:end_slide: ```{.python .input} np.array([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) @@ -199,15 +177,8 @@ tf.constant([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) 在数学表示法中,我们将通过符号 $f: \mathbb{R} \rightarrow \mathbb{R}$ 来表示 *一元* 标量运算符(只接收一个输入)。这意味着该函数从任何实数($\mathbb{R}$)映射到另一个实数。同样,我们通过符号 $f: \mathbb{R}, \mathbb{R} \rightarrow \mathbb{R}$ 表示 *二元* 标量运算符,这意味着该函数接收两个输入,并产生一个输出。给定同一形状的任意两个向量$\mathbf{u}$和$\mathbf{v}$ 和二元运算符 $f$,我们可以得到向量$\mathbf{c} = F(\mathbf{u},\mathbf{v})$。具体计算方法是$c_i \gets f(u_i, v_i)$ ,其中 $c_i, u_i$ 和 $v_i$ 分别是向量$\mathbf{c}, \mathbf{u}$和 $\mathbf{v}$中的元素。在这里,我们通过将标量函数升级为逐元素向量运算来生成向量值 $F: \mathbb{R}^d, \mathbb{R}^d \rightarrow \mathbb{R}^d$。 -对于任意具有相同形状的张量,常见的标准算术运算符(`+`、`-`、`*`、`/` 和 `**`)都可以被升级为逐元素运算。我们可以在同一形状的任意两个张量上调用逐元素操作。在下面的例子中,我们使用逗号来表示一个具有5个元素的元组,其中每个元素都是逐元素操作的结果。 +对于任意具有相同形状的张量,[**常见的标准算术运算符(`+`、`-`、`*`、`/` 和 `**`)都可以被升级为逐元素运算**]。我们可以在同一形状的任意两个张量上调用逐元素操作。在下面的例子中,我们使用逗号来表示一个具有5个元素的元组,其中每个元素都是逐元素操作的结果。 -:begin_slide: -### 运算符 - -常用标准算术运算符 -(`+`、`-`、`*`、 `/`和 `**`) -都被“提升”到逐元素操作。 -:end_slide: ```{.python .input} @@ -230,11 +201,7 @@ y = tf.constant([2.0, 2, 2, 2]) x + y, x - y, x * y, x / y, x ** y # **运算符是求幂运算 ``` -可以按逐元素方式应用更多的计算,包括像求幂这样的一元运算符。 - -:begin_slide: -更多的操作可以按元素应用,例如`exp`。 -:end_slide: +可以(**按逐元素方式应用更多的计算**),包括像求幂这样的一元运算符。 ```{.python .input} np.exp(x) @@ -252,9 +219,7 @@ tf.exp(x) 除了逐元素计算外,我们还可以执行线性代数运算,包括向量点积和矩阵乘法。我们将在 :numref:`sec_linear-algebra` 中解释线性代数的重点内容(不需要先修知识)。 -:begin_slide:`keep` -我们也可以把多个张量连结在一起,把它们端对端地叠起来形成一个更大的张量。 -:end_slide: +[**我们也可以把多个张量连结在一起**],把它们端对端地叠起来形成一个更大的张量。 我们也可以 *连结*(concatenate) 多个张量在一起,将它们端到端堆叠以形成更大的张量。我们只需要提供张量列表,并给出沿哪个轴连结。下面的例子分别演示了当我们沿行(轴-0,形状的第一个元素)和按列(轴-1,形状的第二个元素)连结两个矩阵时会发生什么情况。我们可以看到,第一个输出张量的轴-0长度 ($6$) 是两个输入张量轴-0长度的总和 ($3 + 3$);第二个输出张量的轴-1长度 ($8$) 是两个输入张量轴-1长度的总和 ($4 + 4$)。 ```{.python .input} @@ -277,9 +242,7 @@ Y = tf.constant([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) tf.concat([X, Y], axis=0), tf.concat([X, Y], axis=1) ``` -:begin_slide:`keep` -有时,我们想通过 *逻辑运算符* 构建二元张量。以 `X == Y` 为例子。 -:end_slide: +有时,我们想[**通过 *逻辑运算符* 构建二元张量**]。以 `X == Y` 为例子。 对于每个位置,如果 `X` 和 `Y` 在该位置相等,则新张量中相应项的值为1,这意味着逻辑语句 `X == Y` 在该位置处为真,否则该位置为 0。 ```{.python .input} @@ -287,9 +250,7 @@ tf.concat([X, Y], axis=0), tf.concat([X, Y], axis=1) X == Y ``` -:begin_slide:`cont,keep` -对张量中的所有元素进行求和会产生一个只有一个元素的张量。 -:end_slide: +[**对张量中的所有元素进行求和会产生一个只有一个元素的张量。**] ```{.python .input} #@tab mxnet, pytorch @@ -304,14 +265,10 @@ tf.reduce_sum(X) ## 广播机制 :label:`subsec_broadcasting` -在上面的部分中,我们看到了如何在相同形状的两个张量上执行逐元素操作。在某些情况下,即使形状不同,我们仍然可以通过调用 *广播机制*(broadcasting mechanism) 来执行逐元素操作。这种机制的工作方式如下:首先,通过适当复制元素来扩展一个或两个数组,以便在转换之后,两个张量具有相同的形状。其次,对生成的数组执行逐元素操作。 +在上面的部分中,我们看到了如何在相同形状的两个张量上执行逐元素操作。在某些情况下,[**即使形状不同,我们仍然可以通过调用 *广播机制*(broadcasting mechanism) 来执行逐元素操作**]。这种机制的工作方式如下:首先,通过适当复制元素来扩展一个或两个数组,以便在转换之后,两个张量具有相同的形状。其次,对生成的数组执行逐元素操作。 在大多数情况下,我们将沿着数组中长度为1的轴进行广播,如下样本: -:begin_slide: -### 广播机制 -即使形状不同,我们仍然可以通过调用*广播机制*执行逐元素操作。 
-:end_slide: ```{.python .input} a = np.arange(3).reshape(3, 1) @@ -335,10 +292,6 @@ a, b 由于 `a` 和 `b` 分别是 $3\times1$ 和 $1\times2$ 矩阵,如果我们让它们相加,它们的形状不匹配。我们将两个矩阵*广播*为一个更大的 $3\times2$ 矩阵,如下所示:矩阵 `a`将复制列,矩阵 `b`将复制行,然后再逐元素相加。 -:begin_slide:`cont` -由于`a`和`b`分别是$3\times1$和$1\times2$矩阵,如果我们让它们相加,它们的形状不匹配。尽管如此,我们还是可以让它们相加。 -:end_slide: - ```{.python .input} #@tab all a + b @@ -348,12 +301,7 @@ a + b 就像在任何其他 Python 数组中一样,张量中的元素可以通过索引访问。与任何 Python 数组一样:第一个元素的索引是 0;可以指定范围以包含第一个元素和最后一个之前的元素。与标准 Python 列表一样,我们可以通过使用负索引根据元素到列表尾部的相对位置访问元素。 -因此,我可以用`[-1]` 选择最后一个元素,可以用`[1:3]` 选择第二个元素和第三个元素,如下所示: - -:begin_slide: -### 索引和切片 -`[-1]`选择最后一个元素,`[1:3]`选择第二个和第三个元素。 -:end_slide: +因此,我[**可以用`[-1]` 选择最后一个元素,可以用`[1:3]` 选择第二个元素和第三个元素**],如下所示: ```{.python .input} #@tab all @@ -361,9 +309,7 @@ X[-1], X[1:3] ``` :begin_tab:`mxnet, pytorch` -:begin_slide:`keep,cont` -除读取之外,我们还可以通过指定索引来将元素写入矩阵。 -:end_slide: +[**除读取之外,我们还可以通过指定索引来将元素写入矩阵。**] :end_tab: :begin_tab:`tensorflow` @@ -384,9 +330,8 @@ X_var = tf.Variable(X) X_var[1, 2].assign(9) X_var ``` -:begin_slide:`keep` -如果我们想为多个元素赋值相同的值,我们只需要索引所有元素,然后为它们赋值。 -:end_slide: + +如果我们想[**为多个元素赋值相同的值,我们只需要索引所有元素,然后为它们赋值。**] 例如,`[0:2, :]` 访问第1行和第2行,其中 `:` 代表沿轴 1(列)的所有元素。虽然我们讨论的是矩阵的索引,但这也适用于向量和超过2个维度的张量。 ```{.python .input} @@ -402,11 +347,10 @@ X_var[0:2, :].assign(tf.ones(X_var[0:2,:].shape, dtype = tf.float32) * 12) X_var ``` -:begin_slide:`keep` ## 节省内存 -运行一些操作可能会导致为新结果分配内存。例如,如果我们用 `Y = X + Y`,我们将取消引用 `Y` 指向的张量,而是指向新分配的内存处的张量。 -:end_slide: +[**运行一些操作可能会导致为新结果分配内存**]。例如,如果我们用 `Y = X + Y`,我们将取消引用 `Y` 指向的张量,而是指向新分配的内存处的张量。 + 在下面的例子中,我们用 Python 的 `id()` 函数演示了这一点,它给我们提供了内存中引用对象的确切地址。运行 `Y = Y + X` 后,我们会发现 `id(Y)` 指向另一个位置。这是因为 Python 首先计算 `Y + X`,为结果分配新的内存,然后使 `Y` 指向内存中的这个新位置。 ```{.python .input} @@ -419,17 +363,13 @@ id(Y) == before 这可能是不可取的,原因有两个。首先,我们不想总是不必要地分配内存。在机器学习中,我们可能有数百兆的参数,并且在一秒内多次更新所有参数。通常情况下,我们希望原地执行这些更新。其次,我们可能通过多个变量指向相同参数。如果我们不原地更新,其他引用仍然会指向旧的内存位置,这样我们的某些代码可能会无意中引用旧的参数。 :begin_tab:`mxnet, pytorch` -幸运的是,执行原地操作非常简单。我们可以使用切片表示法将操作的结果分配给先前分配的数组,例如 `Y[:] = `。为了说明这个概念,我们首先创建一个新的矩阵 `Z`,其形状与另一个 `Y` 相同,使用 `zeros_like` 来分配一个全$0$的块。 +幸运的是,(**执行原地操作**)非常简单。我们可以使用切片表示法将操作的结果分配给先前分配的数组,例如 `Y[:] = `。为了说明这个概念,我们首先创建一个新的矩阵 `Z`,其形状与另一个 `Y` 相同,使用 `zeros_like` 来分配一个全$0$的块。 :end_tab: :begin_tab:`tensorflow` `Variables` 是TensorFlow中的可变容器。它们提供了一种存储模型参数的方法。我们可以通过`assign`将一个操作的结果分配给一个 `Variable`。为了说明这个概念,我们创建了一个与另一个张量 `Y` 相同的形状的 `Z`,使用 `zeros_like` 来分配一个全$0$的块。 :end_tab: -:begin_slide:`cont` -如果我们想尽可能的节省内存,可以通过`Y[:] = <表达式>`来实现。 -:end_slide: - ```{.python .input} Z = np.zeros_like(Y) print('id(Z):', id(Z)) @@ -454,9 +394,7 @@ print('id(Z):', id(Z)) ``` :begin_tab:`mxnet, pytorch` -:begin_slide:`keep` -如果在后续计算中没有重复使用 `X`,我们也可以使用 `X[:] = X + Y` 或 `X += Y` 来减少操作的内存开销。 -:end_slide: +[**如果在后续计算中没有重复使用 `X`,我们也可以使用 `X[:] = X + Y` 或 `X += Y` 来减少操作的内存开销。**] :end_tab: :begin_tab:`tensorflow` @@ -487,16 +425,11 @@ def computation(X, Y): computation(X, Y) ``` -:begin_slide:`keep` ## 转换为其他 Python 对象 -转换为 NumPy 张量很容易,反之也很容易。转换后的结果不共享内存。 -:end_slide: +[**转换为 NumPy 张量**]很容易,反之也很容易。转换后的结果不共享内存。 这个小的不便实际上是非常重要的:当等待Python的NumPy包希望使用相同的内存块执行其他操作时,且在 CPU 或 GPU 上执行操作时,你不希望停止计算。 -:begin_slide:`cont` -警惕——当你在CPU或GPU执行操作,你不想停止计算。 -:end_slide: ```{.python .input} A = X.asnumpy() @@ -517,9 +450,7 @@ A = X.numpy() B = tf.constant(A) type(A), type(B) ``` -:begin_slide:`cont,keep` -要将大小为1的张量转换为 Python 标量,我们可以调用 `item` 函数或 Python 的内置函数。 -:end_slide: +要(**将大小为1的张量转换为 Python 标量**),我们可以调用 `item` 函数或 Python 的内置函数。 ```{.python .input} a = np.array([3.5]) diff --git a/chapter_preliminaries/ndarray_origin.md 
b/chapter_preliminaries/ndarray_origin.md index 6968f0e5a..212aab47b 100644 --- a/chapter_preliminaries/ndarray_origin.md +++ b/chapter_preliminaries/ndarray_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/ndarray.md -commit: 9e55a9c ---- - # Data Manipulation :label:`sec_ndarray` @@ -51,8 +46,8 @@ this is for compatibility of tensor processing by other components of MXNet. :end_tab: :begin_tab:`pytorch` -To start, we import `torch`. Note that though it's called PyTorch, we should -import `torch` instead of `pytorch`. +(**To start, we import `torch`. Note that though it's called PyTorch, we should +import `torch` instead of `pytorch`.**) :end_tab: :begin_tab:`tensorflow` @@ -75,7 +70,7 @@ import torch import tensorflow as tf ``` -A tensor represents a (possibly multi-dimensional) array of numerical values. +[**A tensor represents a (possibly multi-dimensional) array of numerical values.**] With one axis, a tensor corresponds (in math) to a *vector*. With two axes, a tensor corresponds to a *matrix*. Tensors with more than two axes do not have special @@ -89,6 +84,7 @@ For instance, there are 12 elements in the tensor `x`. Unless otherwise specified, a new tensor will be stored in main memory and designated for CPU-based computation. + ```{.python .input} x = np.arange(12) x @@ -106,7 +102,7 @@ x = tf.range(12) x ``` -We can access a tensor's *shape* (the length along each axis) +(**We can access a tensor's *shape***) (~~and the total number of elements~~) (the length along each axis) by inspecting its `shape` property. ```{.python .input} @@ -134,27 +130,28 @@ x.numel() tf.size(x) ``` -To change the shape of a tensor without altering -either the number of elements or their values, +To [**change the shape of a tensor without altering +either the number of elements or their values**], we can invoke the `reshape` function. For example, we can transform our tensor, `x`, from a row vector with shape (12,) to a matrix with shape (3, 4). This new tensor contains the exact same values, but views them as a matrix organized as 3 rows and 4 columns. To reiterate, although the shape has changed, -the elements in `x` have not. +the elements have not. Note that the size is unaltered by reshaping. + ```{.python .input} #@tab mxnet, pytorch -x = x.reshape(3, 4) -x +X = x.reshape(3, 4) +X ``` ```{.python .input} #@tab tensorflow -x = tf.reshape(x, (3, 4)) -x +X = tf.reshape(x, (3, 4)) +X ``` Reshaping by manually specifying every dimension is unnecessary. @@ -172,8 +169,10 @@ we could have equivalently called `x.reshape(-1, 4)` or `x.reshape(3, -1)`. Typically, we will want our matrices initialized either with zeros, ones, some other constants, or numbers randomly sampled from a specific distribution. -We can create a tensor representing a tensor with all elements -set to 0 and a shape of (2, 3, 4) as follows: +[**We can create a tensor representing a tensor with all elements +set to 0**] (~~or 1~~) +and a shape of (2, 3, 4) as follows: + ```{.python .input} np.zeros((2, 3, 4)) @@ -181,7 +180,7 @@ np.zeros((2, 3, 4)) ```{.python .input} #@tab pytorch -torch.zeros(2, 3, 4) +torch.zeros((2, 3, 4)) ``` ```{.python .input} @@ -205,8 +204,8 @@ torch.ones((2, 3, 4)) tf.ones((2, 3, 4)) ``` -Often, we want to randomly sample the values -for each element in a tensor +Often, we want to [**randomly sample the values +for each element in a tensor**] from some probability distribution. 
For example, when we construct arrays to serve as parameters in a neural network, we will @@ -216,6 +215,7 @@ Each of its elements is randomly sampled from a standard Gaussian (normal) distribution with a mean of 0 and a standard deviation of 1. + ```{.python .input} np.random.normal(0, 1, size=(3, 4)) ``` @@ -230,10 +230,11 @@ torch.randn(3, 4) tf.random.normal(shape=[3, 4]) ``` -We can also specify the exact values for each element in the desired tensor +We can also [**specify the exact values for each element**] in the desired tensor by supplying a Python list (or list of lists) containing the numerical values. Here, the outermost list corresponds to axis 0, and the inner list to axis 1. + ```{.python .input} np.array([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) ``` @@ -290,6 +291,12 @@ We can call elementwise operations on any two tensors of the same shape. In the following example, we use commas to formulate a 5-element tuple, where each element is the result of an elementwise operation. +### Operations + +[**The common standard arithmetic operators +(`+`, `-`, `*`, `/`, and `**`) +have all been *lifted* to elementwise operations.**] + ```{.python .input} x = np.array([1, 2, 4, 8]) y = np.array([2, 2, 2, 2]) @@ -310,7 +317,7 @@ y = tf.constant([2.0, 2, 2, 2]) x + y, x - y, x * y, x / y, x ** y # The ** operator is exponentiation ``` -Many more operations can be applied elementwise, +Many (**more operations can be applied elementwise**), including unary operators like exponentiation. ```{.python .input} @@ -333,7 +340,7 @@ including vector dot products and matrix multiplication. We will explain the crucial bits of linear algebra (with no assumed prior knowledge) in :numref:`sec_linear-algebra`. -We can also *concatenate* multiple tensors together, +We can also [***concatenate* multiple tensors together,**] stacking them end-to-end to form a larger tensor. We just need to provide a list of tensors and tell the system along which axis to concatenate. @@ -346,47 +353,48 @@ while the second output tensor's axis-1 length ($8$) is the sum of the two input tensors' axis-1 lengths ($4 + 4$). ```{.python .input} -x = np.arange(12).reshape(3, 4) -y = np.array([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) -np.concatenate([x, y], axis=0), np.concatenate([x, y], axis=1) +X = np.arange(12).reshape(3, 4) +Y = np.array([[2, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) +np.concatenate([X, Y], axis=0), np.concatenate([X, Y], axis=1) ``` ```{.python .input} #@tab pytorch -x = torch.arange(12, dtype=torch.float32).reshape((3,4)) -y = torch.tensor([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) -torch.cat((x, y), dim=0), torch.cat((x, y), dim=1) +X = torch.arange(12, dtype=torch.float32).reshape((3,4)) +Y = torch.tensor([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) +torch.cat((X, Y), dim=0), torch.cat((X, Y), dim=1) ``` ```{.python .input} #@tab tensorflow -x = tf.reshape(tf.range(12, dtype=tf.float32), (3, 4)) -y = tf.constant([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) -tf.concat([x, y], axis=0), tf.concat([x, y], axis=1) +X = tf.reshape(tf.range(12, dtype=tf.float32), (3, 4)) +Y = tf.constant([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]]) +tf.concat([X, Y], axis=0), tf.concat([X, Y], axis=1) ``` -Sometimes, we want to construct a binary tensor via *logical statements*. -Take `x == y` as an example. -For each position, if `x` and `y` are equal at that position, + +Sometimes, we want to [**construct a binary tensor via *logical statements*.**] +Take `X == Y` as an example. 
+For each position, if `X` and `Y` are equal at that position, the corresponding entry in the new tensor takes a value of 1, -meaning that the logical statement `x == y` is true at that position; +meaning that the logical statement `X == Y` is true at that position; otherwise that position takes 0. ```{.python .input} #@tab all -x == y +X == Y ``` -Summing all the elements in the tensor yields a tensor with only one element. +[**Summing all the elements in the tensor**] yields a tensor with only one element. ```{.python .input} #@tab mxnet, pytorch -x.sum() +X.sum() ``` ```{.python .input} #@tab tensorflow -tf.reduce_sum(x) +tf.reduce_sum(X) ``` ## Broadcasting Mechanism @@ -394,8 +402,8 @@ tf.reduce_sum(x) In the above section, we saw how to perform elementwise operations on two tensors of the same shape. Under certain conditions, -even when shapes differ, we can still perform elementwise operations -by invoking the *broadcasting mechanism*. +even when shapes differ, we can still [**perform elementwise operations +by invoking the *broadcasting mechanism*.**] This mechanism works in the following way: First, expand one or both arrays by copying elements appropriately @@ -407,6 +415,7 @@ on the resulting arrays. In most cases, we broadcast along an axis where an array initially only has length 1, such as in the following example: + ```{.python .input} a = np.arange(3).reshape(3, 1) b = np.arange(2).reshape(1, 2) @@ -434,6 +443,7 @@ for matrix `a` it replicates the columns and for matrix `b` it replicates the rows before adding up both elementwise. + ```{.python .input} #@tab all a + b @@ -448,16 +458,17 @@ As in standard Python lists, we can access elements according to their relative position to the end of the list by using negative indices. -Thus, `[-1]` selects the last element and `[1:3]` -selects the second and the third elements as follows: +Thus, [**`[-1]` selects the last element and `[1:3]` +selects the second and the third elements**] as follows: + ```{.python .input} #@tab all -x[-1], x[1:3] +X[-1], X[1:3] ``` :begin_tab:`mxnet, pytorch` -Beyond reading, we can also write elements of a matrix by specifying indices. +Beyond reading, (**we can also write elements of a matrix by specifying indices.**) :end_tab: :begin_tab:`tensorflow` @@ -472,19 +483,20 @@ Beyond assigning a value to the entire `Variable`, we can write elements of a ```{.python .input} #@tab mxnet, pytorch -x[1, 2] = 9 -x +X[1, 2] = 9 +X ``` ```{.python .input} #@tab tensorflow -x_var = tf.Variable(x) -x_var[1, 2].assign(9) -x_var +X_var = tf.Variable(X) +X_var[1, 2].assign(9) +X_var ``` -If we want to assign multiple elements the same value, -we simply index all of them and then assign them the value. + +If we want [**to assign multiple elements the same value, +we simply index all of them and then assign them the value.**] For instance, `[0:2, :]` accesses the first and second rows, where `:` takes all the elements along axis 1 (column). While we discussed indexing for matrices, @@ -493,36 +505,36 @@ and for tensors of more than 2 dimensions. ```{.python .input} #@tab mxnet, pytorch -x[0:2, :] = 12 -x +X[0:2, :] = 12 +X ``` ```{.python .input} #@tab tensorflow -x_var = tf.Variable(x) -x_var[0:2,:].assign(tf.ones(x_var[0:2,:].shape, dtype = tf.float32)*12) -x_var +X_var = tf.Variable(X) +X_var[0:2, :].assign(tf.ones(X_var[0:2,:].shape, dtype = tf.float32) * 12) +X_var ``` ## Saving Memory -Running operations can cause new memory to be -allocated to host results. 
-For example, if we write `y = x + y`, -we will dereference the tensor that `y` used to point to -and instead point `y` at the newly allocated memory. +[**Running operations can cause new memory to be +allocated to host results.**] +For example, if we write `Y = X + Y`, +we will dereference the tensor that `Y` used to point to +and instead point `Y` at the newly allocated memory. In the following example, we demonstrate this with Python's `id()` function, which gives us the exact address of the referenced object in memory. -After running `y = y + x`, we will find that `id(y)` points to a different location. -That is because Python first evaluates `y + x`, -allocating new memory for the result and then makes `y` +After running `Y = Y + X`, we will find that `id(Y)` points to a different location. +That is because Python first evaluates `Y + X`, +allocating new memory for the result and then makes `Y` point to this new location in memory. ```{.python .input} #@tab all -before = id(y) -y = y + x -id(y) == before +before = id(Y) +Y = Y + X +id(Y) == before ``` This might be undesirable for two reasons. @@ -538,12 +550,12 @@ the old memory location, making it possible for parts of our code to inadvertently reference stale parameters. :begin_tab:`mxnet, pytorch` -Fortunately, performing in-place operations is easy. +Fortunately, (**performing in-place operations**) is easy. We can assign the result of an operation to a previously allocated array with slice notation, -e.g., `y[:] = `. -To illustrate this concept, we first create a new matrix `z` -with the same shape as another `y`, +e.g., `Y[:] = `. +To illustrate this concept, we first create a new matrix `Z` +with the same shape as another `Y`, using `zeros_like` to allocate a block of $0$ entries. :end_tab: @@ -552,38 +564,38 @@ using `zeros_like` to allocate a block of $0$ entries. a way to store your model parameters. We can assign the result of an operation to a `Variable` with `assign`. -To illustrate this concept, we create a `Variable` `z` -with the same shape as another tensor `y`, +To illustrate this concept, we create a `Variable` `Z` +with the same shape as another tensor `Y`, using `zeros_like` to allocate a block of $0$ entries. :end_tab: ```{.python .input} -z = np.zeros_like(y) -print('id(z):', id(z)) -z[:] = x + y -print('id(z):', id(z)) +Z = np.zeros_like(Y) +print('id(Z):', id(Z)) +Z[:] = X + Y +print('id(Z):', id(Z)) ``` ```{.python .input} #@tab pytorch -z = torch.zeros_like(y) -print('id(z):', id(z)) -z[:] = x + y -print('id(z):', id(z)) +Z = torch.zeros_like(Y) +print('id(Z):', id(Z)) +Z[:] = X + Y +print('id(Z):', id(Z)) ``` ```{.python .input} #@tab tensorflow -z = tf.Variable(tf.zeros_like(y)) -print('id(z):', id(z)) -z.assign(x + y) -print('id(z):', id(z)) +Z = tf.Variable(tf.zeros_like(Y)) +print('id(Z):', id(Z)) +Z.assign(X + Y) +print('id(Z):', id(Z)) ``` :begin_tab:`mxnet, pytorch` -If the value of `x` is not reused in subsequent computations, -we can also use `x[:] = x + y` or `x += y` -to reduce the memory overhead of the operation. +[**If the value of `X` is not reused in subsequent computations, +we can also use `X[:] = X + Y` or `X += Y` +to reduce the memory overhead of the operation.**] :end_tab: :begin_tab:`tensorflow` @@ -604,27 +616,28 @@ overhead of TensorFlow computations. 
```{.python .input} #@tab mxnet, pytorch -before = id(x) -x += y -id(x) == before +before = id(X) +X += Y +id(X) == before ``` ```{.python .input} #@tab tensorflow @tf.function -def computation(x, y): - z = tf.zeros_like(y) # This unused value will be pruned out. - a = x + y # Allocations will be re-used when no longer needed. - b = a + y - c = b + y - return c + y +def computation(X, Y): + Z = tf.zeros_like(Y) # This unused value will be pruned out + A = X + Y # Allocations will be re-used when no longer needed + B = A + Y + C = B + Y + return C + Y -computation(x, y) +computation(X, Y) ``` + ## Conversion to Other Python Objects -Converting to a NumPy tensor, or vice versa, is easy. +[**Converting to a NumPy tensor**], or vice versa, is easy. The converted result does not share memory. This minor inconvenience is actually quite important: when you perform operations on the CPU or on GPUs, @@ -632,29 +645,31 @@ you do not want to halt computation, waiting to see whether the NumPy package of Python might want to be doing something else with the same chunk of memory. + ```{.python .input} -a = x.asnumpy() -b = np.array(a) -type(a), type(b) +A = X.asnumpy() +B = np.array(A) +type(A), type(B) ``` ```{.python .input} #@tab pytorch -a = x.numpy() -b = torch.tensor(a) -type(a), type(b) +A = X.numpy() +B = torch.tensor(A) +type(A), type(B) ``` ```{.python .input} #@tab tensorflow -a = x.numpy() -b = tf.constant(a) -type(a), type(b) +A = X.numpy() +B = tf.constant(A) +type(A), type(B) ``` -To convert a size-1 tensor to a Python scalar, +To (**convert a size-1 tensor to a Python scalar**), we can invoke the `item` function or Python's built-in functions. + ```{.python .input} a = np.array([3.5]) a, a.item(), float(a), int(a) @@ -679,7 +694,7 @@ a, a.item(), float(a), int(a) ## Exercises -1. Run the code in this section. Change the conditional statement `x == y` in this section to `x < y` or `x > y`, and then see what kind of tensor you can get. +1. Run the code in this section. Change the conditional statement `X == Y` in this section to `X < Y` or `X > Y`, and then see what kind of tensor you can get. 1. Replace the two tensors that operate by element in the broadcasting mechanism with other shapes, e.g., 3-dimensional tensors. Is the result the same as expected? 
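As a minimal sketch of the two exercises above (PyTorch tab only; `X`, `Y`, `a`, and `b` are re-created locally here rather than taken from the notebook state, and the MXNet and TensorFlow tabs behave analogously):

```python
import torch

# Exercise 1: other comparison operators also work elementwise and
# return a boolean tensor of the same shape as the operands.
X = torch.arange(12, dtype=torch.float32).reshape((3, 4))
Y = torch.tensor([[2.0, 1, 4, 3], [1, 2, 3, 4], [4, 3, 2, 1]])
print(X < Y)   # True where an entry of X is smaller than the matching entry of Y
print(X > Y)   # True where an entry of X is larger

# Exercise 2: broadcasting extends to tensors with more than 2 axes,
# as long as each axis either matches in length or has length 1.
a = torch.arange(6).reshape((3, 1, 2))
b = torch.arange(2).reshape((1, 2, 1))
print((a + b).shape)   # torch.Size([3, 2, 2])
```

The comparison result keeps the shape of its inputs, and the broadcast sum takes the larger length along every axis, which is exactly what the broadcasting rules above predict.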
:begin_tab:`mxnet` diff --git a/chapter_preliminaries/pandas.md b/chapter_preliminaries/pandas.md index 1290edbf5..982783dea 100644 --- a/chapter_preliminaries/pandas.md +++ b/chapter_preliminaries/pandas.md @@ -5,7 +5,7 @@ ## 读取数据集 -举一个例子,我们首先创建一个人工数据集,并存储在csv(逗号分隔值)文件 `../data/house_tiny.csv` 中。以其他格式存储的数据也可以通过类似的方式进行处理。下面的`mkdir_if_not_exist` 函数可确保目录 `../data` 存在。注意,注释 `#@save`是一个特殊的标记,该标记下面的函数、类或语句将保存在 `d2l` 软件包中,以便以后可以直接调用它们(例如 `d2l.mkdir_if_not_exist(path)`)而无需重新定义。 +举一个例子,我们首先(**创建一个人工数据集,并存储在csv(逗号分隔值)文件**) `../data/house_tiny.csv` 中。以其他格式存储的数据也可以通过类似的方式进行处理。下面的`mkdir_if_not_exist` 函数可确保目录 `../data` 存在。注意,注释 `#@save`是一个特殊的标记,该标记下面的函数、类或语句将保存在 `d2l` 软件包中,以便以后可以直接调用它们(例如 `d2l.mkdir_if_not_exist(path)`)而无需重新定义。 下面我们将数据集按行写入 csv 文件中。 @@ -23,7 +23,7 @@ with open(data_file, 'w') as f: f.write('NA,NA,140000\n') ``` -要从创建的 csv 文件中加载原始数据集,我们导入 `pandas` 包并调用 `read_csv` 函数。该数据集有四行三列。其中每行描述了房间数量(“NumRooms”)、巷子类型(“Alley”)和房屋价格(“Price”)。 +要[**从创建的 csv 文件中加载原始数据集**],我们导入 `pandas` 包并调用 `read_csv` 函数。该数据集有四行三列。其中每行描述了房间数量(“NumRooms”)、巷子类型(“Alley”)和房屋价格(“Price”)。 ```{.python .input} #@tab all @@ -37,7 +37,7 @@ print(data) ## 处理缺失值 -注意,“NaN” 项代表缺失值。为了处理缺失的数据,典型的方法包括 *插值* 和 *删除*,其中插值用替代值代替缺失值。而删除则忽略缺失值。在这里,我们将考虑插值。 +注意,“NaN” 项代表缺失值。[**为了处理缺失的数据,典型的方法包括 *插值* 和 *删除*,**]其中插值用替代值代替缺失值。而删除则忽略缺失值。在(**这里,我们将考虑插值**)。 通过位置索引`iloc`,我们将 `data` 分成 `inputs` 和 `outputs`,其中前者为 `data`的前两列,而后者为 `data`的最后一列。对于 `inputs` 中缺少的的数值,我们用同一列的均值替换 “NaN” 项。 @@ -48,7 +48,7 @@ inputs = inputs.fillna(inputs.mean()) print(inputs) ``` -对于 `inputs` 中的类别值或离散值,我们将 “NaN” 视为一个类别。由于 “巷子”(“Alley”)列只接受两种类型的类别值 “Alley” 和 “NaN”,`pandas` 可以自动将此列转换为两列 “Alley_Pave” 和 “Alley_nan”。巷子类型为 “Pave” 的行会将“Alley_Pave”的值设置为1,“Alley_nan”的值设置为0。缺少巷子类型的行会将“Alley_Pave”和“Alley_nan”分别设置为0和1。 +[**对于 `inputs` 中的类别值或离散值,我们将 “NaN” 视为一个类别。**]由于 “巷子”(“Alley”)列只接受两种类型的类别值 “Alley” 和 “NaN”,`pandas` 可以自动将此列转换为两列 “Alley_Pave” 和 “Alley_nan”。巷子类型为 “Pave” 的行会将“Alley_Pave”的值设置为1,“Alley_nan”的值设置为0。缺少巷子类型的行会将“Alley_Pave”和“Alley_nan”分别设置为0和1。 ```{.python .input} #@tab all @@ -58,7 +58,7 @@ print(inputs) ## 转换为张量格式 -现在 `inputs` 和 `outputs` 中的所有条目都是数值类型,它们可以转换为张量格式。当数据采用张量格式后,可以通过在 :numref:`sec_ndarray` 中引入的那些张量函数来进一步操作。 +[**现在 `inputs` 和 `outputs` 中的所有条目都是数值类型,它们可以转换为张量格式。**]当数据采用张量格式后,可以通过在 :numref:`sec_ndarray` 中引入的那些张量函数来进一步操作。 ```{.python .input} from mxnet import np diff --git a/chapter_preliminaries/pandas_origin.md b/chapter_preliminaries/pandas_origin.md index 63857c3fd..1e6cdb4c8 100644 --- a/chapter_preliminaries/pandas_origin.md +++ b/chapter_preliminaries/pandas_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/pandas.md -commit: 9e55a9c ---- - # Data Preprocessing :label:`sec_pandas` @@ -18,42 +13,29 @@ We will cover more data preprocessing techniques in later chapters. ## Reading the Dataset -As an example, we begin by creating an artificial dataset that is stored in a -csv (comma-separated values) file `../data/house_tiny.csv`. Data stored in other +As an example, +we begin by (**creating an artificial dataset that is stored in a +csv (comma-separated values) file**) +`../data/house_tiny.csv`. Data stored in other formats may be processed in similar ways. -The following `mkdir_if_not_exist` -function ensures that the directory `../data` exists. -Note that the comment `#@save` is a special mark where the following function, -class, or statements are saved in the `d2l` package -so later they can be directly invoked (e.g., `d2l.mkdir_if_not_exist(path)`) without being redefined. 
- -```{.python .input} -#@tab all -import os - -def mkdir_if_not_exist(path): #@save - """Make a directory if it does not exist.""" - if not isinstance(path, str): - path = os.path.join(*path) - if not os.path.exists(path): - os.makedirs(path) -``` Below we write the dataset row by row into a csv file. ```{.python .input} #@tab all -data_file = '../data/house_tiny.csv' -mkdir_if_not_exist('../data') +import os + +os.makedirs(os.path.join('..', 'data'), exist_ok=True) +data_file = os.path.join('..', 'data', 'house_tiny.csv') with open(data_file, 'w') as f: f.write('NumRooms,Alley,Price\n') # Column names - f.write('NA,Pave,127500\n') # Each row represents a data point + f.write('NA,Pave,127500\n') # Each row represents a data example f.write('2,NA,106000\n') f.write('4,NA,178100\n') f.write('NA,NA,140000\n') ``` -To load the raw dataset from the created csv file, +To [**load the raw dataset from the created csv file**], we import the `pandas` package and invoke the `read_csv` function. This dataset has four rows and three columns, where each row describes the number of rooms ("NumRooms"), the alley type ("Alley"), and the price ("Price") of a house. @@ -76,7 +58,8 @@ while deletion ignores missing values. Here we will consider imputation. By integer-location based indexing (`iloc`), we split `data` into `inputs` and `outputs`, where the former takes the first two columns while the latter only keeps the last column. -For numerical values in `inputs` that are missing, we replace the "NaN" entries with the mean value of the same column. +For numerical values in `inputs` that are missing, +we [**replace the "NaN" entries with the mean value of the same column.**] ```{.python .input} #@tab all @@ -85,7 +68,7 @@ inputs = inputs.fillna(inputs.mean()) print(inputs) ``` -For categorical or discrete values in `inputs`, we consider "NaN" as a category. +[**For categorical or discrete values in `inputs`, we consider "NaN" as a category.**] Since the "Alley" column only takes two types of categorical values "Pave" and "NaN", `pandas` can automatically convert this column to two columns "Alley_Pave" and "Alley_nan". A row whose alley type is "Pave" will set values of "Alley_Pave" and "Alley_nan" to 1 and 0. @@ -99,7 +82,7 @@ print(inputs) ## Conversion to the Tensor Format -Now that all the entries in `inputs` and `outputs` are numerical, they can be converted to the tensor format. +Now that [**all the entries in `inputs` and `outputs` are numerical, they can be converted to the tensor format.**] Once data are in this format, they can be further manipulated with those tensor functionalities that we have introduced in :numref:`sec_ndarray`. ```{.python .input} @@ -138,7 +121,6 @@ Create a raw dataset with more rows and columns. 1. Delete the column with the most missing values. 2. Convert the preprocessed dataset to the tensor format. - :begin_tab:`mxnet` [Discussions](https://discuss.d2l.ai/t/28) :end_tab: diff --git a/chapter_preliminaries/probability_origin.md b/chapter_preliminaries/probability_origin.md index bdfb834ea..d3fa2d97f 100644 --- a/chapter_preliminaries/probability_origin.md +++ b/chapter_preliminaries/probability_origin.md @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/chapter_preliminaries/probability.md -commit: 9e55a9c ---- - # Probability :label:`sec_prob` @@ -307,7 +302,7 @@ if and only if $P(A, B \mid C) = P(A \mid C)P(B \mid C)$. This is expressed as $ ### Application :label:`subsec_probability_hiv_app` -Let us put our skills to the test. 
Assume that a doctor administers an AIDS test to a patient. This test is fairly accurate and it fails only with 1% probability if the patient is healthy but reporting him as diseased. Moreover, +Let us put our skills to the test. Assume that a doctor administers an HIV test to a patient. This test is fairly accurate and it fails only with 1% probability if the patient is healthy but reporting him as diseased. Moreover, it never fails to detect HIV if the patient actually has it. We use $D_1$ to indicate the diagnosis ($1$ if positive and $0$ if negative) and $H$ to denote the HIV status ($1$ if positive and $0$ if negative). :numref:`conditional_prob_D1` lists such conditional probabilities. @@ -319,7 +314,7 @@ it never fails to detect HIV if the patient actually has it. We use $D_1$ to ind |$P(D_1 = 0 \mid H)$| 0 | 0.99 | :label:`conditional_prob_D1` -Note that the column sums are all 1 (but the row sums are not), since the conditional probability needs to sum up to 1, just like the probability. Let us work out the probability of the patient having AIDS if the test comes back positive, i.e., $P(H = 1 \mid D_1 = 1)$. Obviously this is going to depend on how common the disease is, since it affects the number of false alarms. Assume that the population is quite healthy, e.g., $P(H=1) = 0.0015$. To apply Bayes' theorem, we need to apply marginalization and the multiplication rule to determine +Note that the column sums are all 1 (but the row sums are not), since the conditional probability needs to sum up to 1, just like the probability. Let us work out the probability of the patient having HIV if the test comes back positive, i.e., $P(H = 1 \mid D_1 = 1)$. Obviously this is going to depend on how common the disease is, since it affects the number of false alarms. Assume that the population is quite healthy, e.g., $P(H=1) = 0.0015$. To apply Bayes' theorem, we need to apply marginalization and the multiplication rule to determine $$\begin{aligned} &P(D_1 = 1) \\ @@ -335,7 +330,7 @@ $$\begin{aligned} &P(H = 1 \mid D_1 = 1)\\ =& \frac{P(D_1=1 \mid H=1) P(H=1)}{P(D_1=1)} \\ =& 0.1306 \end{aligned}.$$ In other words, there is only a 13.06% chance that the patient -actually has AIDS, despite using a very accurate test. +actually has HIV, despite using a very accurate test. As we can see, probability can be counterintuitive. What should a patient do upon receiving such terrifying news? Likely, the patient @@ -379,7 +374,7 @@ $$\begin{aligned} \end{aligned} $$ -In the end, the probability of the patient having AIDS given both positive tests is +In the end, the probability of the patient having HIV given both positive tests is $$\begin{aligned} &P(H = 1 \mid D_1 = 1, D_2 = 1)\\ @@ -431,7 +426,7 @@ $$\mathrm{Var}[f(x)] = E\left[\left(f(x) - E[f(x)]\right)^2\right].$$ 1. We conducted $m=500$ groups of experiments where each group draws $n=10$ samples. Vary $m$ and $n$. Observe and analyze the experimental results. 1. Given two events with probability $P(\mathcal{A})$ and $P(\mathcal{B})$, compute upper and lower bounds on $P(\mathcal{A} \cup \mathcal{B})$ and $P(\mathcal{A} \cap \mathcal{B})$. (Hint: display the situation using a [Venn Diagram](https://en.wikipedia.org/wiki/Venn_diagram).) 1. Assume that we have a sequence of random variables, say $A$, $B$, and $C$, where $B$ only depends on $A$, and $C$ only depends on $B$, can you simplify the joint probability $P(A, B, C)$? (Hint: this is a [Markov Chain](https://en.wikipedia.org/wiki/Markov_chain).) -1. 
In :numref:`subsec_probability_hiv_app`, the first test is more accurate. Why not just run the first test a second time? +1. In :numref:`subsec_probability_hiv_app`, the first test is more accurate. Why not run the first test twice rather than run both the first and second tests? :begin_tab:`mxnet` diff --git a/config.ini b/config.ini index 6b90c8e2a..6b058a026 100644 --- a/config.ini +++ b/config.ini @@ -166,3 +166,8 @@ reverse_alias = google_analytics_tracking_id = UA-96378503-2 +[slides] + +top_right = + +github_repo = pytorch, d2l-ai/d2l-zh-pytorch-slides diff --git a/config_origin.ini b/config_origin.ini index 7984a84e1..5d4697faa 100644 --- a/config_origin.ini +++ b/config_origin.ini @@ -1,8 +1,3 @@ ---- -source: https://github.com/d2l-ai/d2l-en/blob/master/config.ini -commit: 9bf95b1 ---- - [project] # The project name, used as the filename of the package and the PDF file. For @@ -15,9 +10,9 @@ title = Dive into Deep Learning author = Aston Zhang, Zachary C. Lipton, Mu Li, and Alexander J. Smola -copyright = 2020, All authors. Licensed under CC-BY-SA-4.0 and MIT-0. +copyright = 2021, All authors. Licensed under CC-BY-SA-4.0 and MIT-0. -release = 0.14.0 +release = 0.16.1 [build] @@ -46,7 +41,7 @@ tabs = mxnet, pytorch, tensorflow header_links = Courses, https://courses.d2l.ai, fas fa-user-graduate, PDF, https://d2l.ai/d2l-en.pdf, fas fa-file-pdf, All Notebooks, https://d2l.ai/d2l-en.zip, fas fa-download, - Discuss, https://discuss.d2l.ai, fab fa-discourse, + Discuss, https://discuss.d2l.ai/c/5, fab fa-discourse, GitHub, https://github.com/d2l-ai/d2l-en, fab fa-github, 中文版, https://zh.d2l.ai, fas fa-external-link-alt @@ -78,7 +73,8 @@ lib_name = np # Map from d2l.xx to np.xx simple_alias = ones, zeros, arange, meshgrid, sin, sinh, cos, cosh, tanh, linspace, exp, log, tensor -> array, normal -> random.normal, - matmul -> dot, int32, float32, concat -> concatenate, stack, abs + rand -> random.rand, matmul -> dot, int32, float32, + concat -> concatenate, stack, abs, eye # Map from d2l.xx(a, *args, **kwargs) to a.xx(*args, **kwargs) fluent_alias = numpy -> asnumpy, reshape, to -> as_in_context, reduce_sum -> sum, @@ -98,8 +94,8 @@ lib_file = d2l/torch.py lib_name = torch simple_alias = ones, zeros, tensor, arange, meshgrid, sin, sinh, cos, cosh, - tanh, linspace, exp, log, normal, matmul, int32, float32, - concat -> cat, stack, abs + tanh, linspace, exp, log, normal, rand, matmul, int32, float32, + concat -> cat, stack, abs, eye fluent_alias = numpy -> detach().numpy, size -> numel, reshape, to, reduce_sum -> sum, argmax, astype -> type, transpose -> t @@ -113,9 +109,10 @@ lib_file = d2l/tensorflow.py lib_name = tf simple_alias = reshape, ones, zeros, meshgrid, sin, sinh, cos, cosh, tanh, - linspace, exp, matmul, reduce_sum, argmax, tensor -> constant, + linspace, exp, normal -> random.normal, rand -> random.uniform, + matmul, reduce_sum, argmax, tensor -> constant, arange -> range, astype -> cast, int32, float32, transpose, - concat, stack, normal -> random.normal, abs + concat, stack, abs, eye fluent_alias = numpy, @@ -141,7 +138,7 @@ github_repo = mxnet, d2l-ai/d2l-en-colab replace_svg_url = img, http://d2l.ai/_images -libs = mxnet, mxnet, -U mxnet-cu101mkl==1.6.0 # updating mxnet to at least v1.6 +libs = mxnet, mxnet, -U mxnet-cu101==1.7.0 mxnet, d2l, d2l==RELEASE pytorch, d2l, d2l==RELEASE tensorflow, d2l, d2l==RELEASE @@ -157,7 +154,13 @@ kernel = mxnet, conda_mxnet_p36 pytorch, conda_pytorch_p36 tensorflow, conda_tensorflow_p36 -libs = mxnet, mxnet, -U mxnet-cu101mkl==1.6.0 # 
updating mxnet to at least v1.6 +libs = mxnet, mxnet, -U mxnet-cu101==1.7.0 mxnet, d2l, .. # installing d2l pytorch, d2l, .. # installing d2l tensorflow, d2l, .. # installing d2l + +[slides] + +top_right = + +github_repo = pytorch, d2l-ai/d2l-pytorch-slides
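For readers less familiar with the `tab, repository` convention used throughout `config.ini` (the `[colab]` and `[sagemaker]` sections above follow the same pattern), here is a minimal sketch of how the new `[slides]` section parses with Python's standard `configparser`. This is only an illustration of the INI layout, not d2lbook's actual configuration loader, and the variable names are made up for the example:

```python
import configparser

# Parse a copy of the [slides] section added in this change.
config = configparser.ConfigParser()
config.read_string("""
[slides]
top_right =
github_repo = pytorch, d2l-ai/d2l-pytorch-slides
""")

slides = config["slides"]
print(repr(slides["top_right"]))             # '' -- intentionally left empty
tab, repo = (s.strip() for s in slides["github_repo"].split(","))
print(tab, repo)                             # pytorch d2l-ai/d2l-pytorch-slides
```

In other words, `github_repo` pairs a tab name (`pytorch`) with a target repository, presumably the one that receives the generated slide notebooks, by analogy with how `github_repo` is used in the `[colab]` and `[sagemaker]` sections.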