mirror of https://github.com/NotXia/unibo-ai-notes.git

Fix typos <noupdate>
@@ -292,7 +292,7 @@ Network with bottleneck-block-inspired inception modules.
Global average pooling to obtain a channel-wise vector.

\item[Excitation]
-Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them.
+Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them. A final sigmoid gives the channel weights.
\end{descriptionlist}
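
A minimal PyTorch-style sketch of such a squeeze-and-excitation block (class, layer, and variable names are illustrative, not taken from the notes):

import torch
import torch.nn as nn

class SqueezeExcitation(nn.Module):
    """Squeeze: global average pooling. Excitation: compress channels by r, restore them, sigmoid."""
    def __init__(self, channels: int, r: int = 16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // r)   # compress by ratio r
        self.fc2 = nn.Linear(channels // r, channels)   # restore the original number of channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:         # x: (B, C, H, W)
        s = x.mean(dim=(2, 3))                                   # squeeze: channel-wise vector (B, C)
        w = torch.sigmoid(self.fc2(torch.relu(self.fc1(s))))     # excitation: channel weights in [0, 1]
        return x * w[:, :, None, None]                           # re-weight each channel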
@@ -341,7 +341,7 @@ Network with bottleneck-block-inspired inception modules.

\begin{description}
\item[Inverted residual block] \marginnote{Inverted residual block}
-Modified bottleneck block defined as follows:
+Modified bottleneck block composed of:
\begin{enumerate}
\item A $1 \times 1$ convolution to expand the input channels by a factor of $t$.
\item A $3 \times 3$ depth-wise convolution.
@@ -370,25 +370,25 @@ Network with bottleneck-block-inspired inception modules.

\begin{table}[H]
\centering
-\caption{\parbox[t]{0.6\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
+\caption{\parbox[t]{0.77\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
\small
\begin{tabular}{cccccc}
\toprule
\textbf{Input} & \textbf{Operator} & $t$ & $c$ & $n$ & $s$ \\
\midrule
-$2242 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
+$224^2 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
\midrule
-$1122 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
-$1122 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
-$562 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
-$282 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
-$142 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
-$142 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
-$72 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
+$112^2 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
+$112^2 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
+$56^2 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
+$28^2 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
+$14^2 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
+$14^2 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
+$7^2 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
\midrule
-$72 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
+$7^2 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
\midrule
-$72 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
+$7^2 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
$1 \times 1 \times 1280$ & \texttt{conv2d $1\times1$} & - & k & - & 1 \\
\bottomrule
\end{tabular}
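
A rough PyTorch-style sketch of the inverted residual (bottleneck) block used in the table above, assuming the usual MobileNetV2 choices (batch normalization, ReLU6, linear $1 \times 1$ projection, skip connection only when input and output shapes match); names are illustrative:

import torch
import torch.nn as nn

class InvertedResidual(nn.Module):
    """1x1 expansion by t -> 3x3 depth-wise convolution -> 1x1 linear projection."""
    def __init__(self, c_in: int, c_out: int, t: int = 6, stride: int = 1):
        super().__init__()
        c_mid = c_in * t
        self.block = nn.Sequential(
            nn.Conv2d(c_in, c_mid, kernel_size=1, bias=False),            # expand channels by t
            nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_mid, kernel_size=3, stride=stride,
                      padding=1, groups=c_mid, bias=False),               # 3x3 depth-wise convolution
            nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_out, kernel_size=1, bias=False),           # linear 1x1 projection
            nn.BatchNorm2d(c_out),
        )
        self.use_skip = (stride == 1 and c_in == c_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.block(x)
        return x + out if self.use_skip else out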
@@ -401,7 +401,7 @@ Network with bottleneck-block-inspired inception modules.

\begin{description}
\item[Single dimension scaling]
-Scaling a baseline model by width, depth, or resolution. It generally always improve the accuracy.
+Scaling a baseline model by width, depth, or resolution generally improves the accuracy.

\begin{description}
\item[Width scaling] \marginnote{Width scaling}
@@ -443,7 +443,7 @@ Network with bottleneck-block-inspired inception modules.
In practice, $\alpha$, $\beta$, and $\gamma$ are determined through grid search.

\begin{remark}
-The constraint is formulated in this way as FLOPS scales linearly with depth but quadratically with width and resolution.
+The constraint is formulated in this way as FLOPS scales linearly by depth but quadratically by width and resolution.
\end{remark}
\end{description}
\end{description}
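
A small numeric illustration of the remark: since FLOPS grow linearly with depth and quadratically with width and resolution, scaling a baseline by $(\alpha^\phi, \beta^\phi, \gamma^\phi)$ multiplies its FLOPS by roughly $(\alpha \beta^2 \gamma^2)^\phi$. The coefficient values below are the ones reported by the EfficientNet grid search and are assumed here only for illustration:

def flops_multiplier(alpha: float, beta: float, gamma: float, phi: int) -> float:
    # FLOPS scale linearly with depth (alpha) and quadratically with width (beta) and resolution (gamma).
    return (alpha * beta ** 2 * gamma ** 2) ** phi

alpha, beta, gamma = 1.2, 1.1, 1.15                  # assumed grid-search values
print(alpha * beta ** 2 * gamma ** 2)                # ~1.92, close to the constraint value of 2
print(flops_multiplier(alpha, beta, gamma, phi=3))   # ~7.1, i.e., roughly 2^3 times the baseline FLOPS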
@@ -105,7 +105,7 @@ Methods that also consider the second-order derivatives when determining the ste
\]
where $\mu \in [0, 1[$ is the momentum coefficient.

-In other words, $v^{(t+1)}$ represents a weighted average of the updates steps done up until time $t$.
+In other words, $v^{(t+1)}$ represents a weighted average of the update steps done up until time $t$.

\begin{remark}
Momentum helps to counteract a poor conditioning of the Hessian matrix when working with canyons.
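
A minimal NumPy sketch of one momentum update in this spirit (one common sign convention; the exact formulation used in the notes may differ slightly):

import numpy as np

def momentum_step(theta, v, grad, lr=0.01, mu=0.9):
    v = mu * v - lr * grad(theta)        # weighted average of past update steps plus the current one
    return theta + v, v

grad = lambda th: 2 * th                 # gradient of f(theta) = theta^2
theta, v = np.array([5.0]), np.zeros(1)
for _ in range(100):
    theta, v = momentum_step(theta, v, grad)   # converges towards the minimum at 0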
@@ -134,7 +134,7 @@ Methods that also consider the second-order derivatives when determining the ste
\]

\begin{remark}
-The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as $\matr{\theta}^{(t)}$ has been partially updated.
+The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as it has been partially updated.
\end{remark}

\begin{remark}
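
A sketch of the corresponding Nesterov-style update, where the gradient is evaluated at the look-ahead point $\matr{\theta}^{(t)} + \mu v^{(t)}$ (again one common formulation, assumed here for illustration):

def nesterov_step(theta, v, grad, lr=0.01, mu=0.9):
    v = mu * v - lr * grad(theta + mu * v)   # gradient at the partially updated (look-ahead) parameters
    return theta + v, v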
@@ -228,7 +228,7 @@ Methods that also consider the second-order derivatives when determining the ste
where $\beta \in [0, 1]$ (typically $0.9$ or higher) makes $s^{(t)}$ an exponential moving average.

\begin{remark}
-RMSProp is faster than SGD at the beginning and slows down reaching similar performances as SGD.
+RMSProp is faster than SGD at the beginning before slowing down and reaching performance similar to that of SGD.
\end{remark}

\begin{figure}[H]
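
A minimal NumPy sketch of an RMSProp step using the exponential moving average $s^{(t)}$ described above (hyperparameter values are illustrative):

import numpy as np

def rmsprop_step(theta, s, grad, lr=0.001, beta=0.9, eps=1e-8):
    g = grad(theta)
    s = beta * s + (1 - beta) * g ** 2               # moving average of squared gradients
    return theta - lr * g / (np.sqrt(s) + eps), s    # step rescaled component-wise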
@@ -254,11 +254,11 @@ Methods that also consider the second-order derivatives when determining the ste

Moreover, as $\vec{g}^{(0)} = 0$, $\vec{s}^{(0)} = 0$, and $\beta_1$, $\beta_2$ are typically large (i.e., past history weighs more), Adam starts by taking small steps (e.g., $\vec{g}^{(1)} = (1-\beta_1) \nabla\mathcal{L}(\vec{\theta}^{(0)})$ is simply rescaling the gradient for no reason). To cope with this, a debiased formulation of $\vec{g}$ and $\vec{s}$ is used:
\[
-\vec{g}^{(t)}_{\text{debiased}} = \frac{g^{(t+1)}}{1-\beta_1^{t+1}}
+\vec{g}^{(t)}_{\text{debiased}} = \frac{\vec{g}^{(t+1)}}{1-\beta_1^{t+1}}
\qquad
-\vec{s}^{(t)}_{\text{debiased}} = \frac{s^{(t+1)}}{1-\beta_2^{t+1}}
+\vec{s}^{(t)}_{\text{debiased}} = \frac{\vec{s}^{(t+1)}}{1-\beta_2^{t+1}}
\]
-where the denominator $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.
+where the denominators $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.

Finally, the update is defined as:
\[
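
A NumPy sketch of a full Adam step with the debiasing above; the last line applies the standard Adam update rule, which is assumed here since the hunk cuts off before the notes' own definition:

import numpy as np

def adam_step(theta, g_avg, s_avg, t, grad, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    g = grad(theta)
    g_avg = beta1 * g_avg + (1 - beta1) * g          # momentum-like average of gradients
    s_avg = beta2 * s_avg + (1 - beta2) * g ** 2     # RMSProp-like average of squared gradients
    g_hat = g_avg / (1 - beta1 ** (t + 1))           # debiased: counteracts the zero initialization
    s_hat = s_avg / (1 - beta2 ** (t + 1))
    return theta - lr * g_hat / (np.sqrt(s_hat) + eps), g_avg, s_avg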
@@ -296,7 +296,7 @@ Methods that also consider the second-order derivatives when determining the ste
\end{remark}

\begin{remark}
-Momentum based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape" small basins.
+Momentum based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape'' smaller ones.

\begin{figure}[H]
\centering
@@ -70,7 +70,7 @@
With the projection $\matr{W}_K \in \mathbb{R}^{d_Y \times d_K}$ such that $\mathbb{R}^{M \times d_K} \ni \matr{K} = \matr{Y} \matr{W}_K$, where $d_K$ is the dimension of the keys.

\item[Query]
-With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \matr{Y} \matr{W}_X$, where $d_X$ is the length of $\vec{x}_1$ that is no longer required to be $d_Y$ as there is a projection.
+With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \vec{x}_1 \matr{W}_Q$, where $d_X$ is the length of $\vec{x}_1$ that is no longer required to be $d_Y$ as there is a projection.

\item[Values]
With the projection $\matr{W}_V \in \mathbb{R}^{d_Y \times d_V}$ such that $\mathbb{R}^{M \times d_V} \ni \matr{V} = \matr{Y} \matr{W}_V$, where $d_V$ is the dimension of the values.
@@ -88,7 +88,7 @@
% \caption{Steps of scaled dot-product attention}
\end{figure}

-Finally, due to the linear projections, instead of a single vector there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.
+Finally, due to the linear projections, instead of a single vector, there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.

The overall attention mechanism can be defined as:
\[ \matr{A} = \texttt{softmax}_\texttt{row-wise}\left( \frac{\matr{Q}\matr{K}^T}{\sqrt{d_K}} \right) \matr{V} \]
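
A compact NumPy sketch of this mechanism, with $\matr{Q} = \matr{X}\matr{W}_Q$, $\matr{K} = \matr{Y}\matr{W}_K$, and $\matr{V} = \matr{Y}\matr{W}_V$ (function and argument names are illustrative):

import numpy as np

def attention(X, Y, W_Q, W_K, W_V):
    Q, K, V = X @ W_Q, Y @ W_K, Y @ W_V              # (N, d_K), (M, d_K), (M, d_V)
    scores = Q @ K.T / np.sqrt(K.shape[-1])          # similarity scores (N, M)
    scores -= scores.max(axis=-1, keepdims=True)     # for numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ V                               # output activations (N, d_V)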
@@ -166,7 +166,7 @@
\end{remark}

\begin{remark}
-Layer normalization is easier to distribute on multiple computation units and has the same behavior at both train and inference time.
+Layer normalization is easier to distribute on multiple computation units and has the same behavior at both training and inference time.
\end{remark}

\begin{figure}[H]
@@ -176,7 +176,7 @@
\end{figure}

\item[Feed-forward network (\texttt{FFN})] \marginnote{Feed-forward network}
-MLP with one hidden layer applied to each token independently. ReLU or one of its variants are used as activation function:
+MLP with one hidden layer applied to each token independently. ReLU or one of its variants is used as the activation function:
\[ \texttt{FFN}(\vec{x}) = \texttt{relu}(\vec{x}\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \]

\begin{remark}
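
A minimal PyTorch-style sketch of such a position-wise feed-forward network (dimension names are illustrative):

import torch
import torch.nn as nn

class FFN(nn.Module):
    """FFN(x) = relu(x W1 + b1) W2 + b2, applied to each token independently."""
    def __init__(self, d_model: int, d_hidden: int):
        super().__init__()
        self.lin1 = nn.Linear(d_model, d_hidden)
        self.lin2 = nn.Linear(d_hidden, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (..., tokens, d_model)
        return self.lin2(torch.relu(self.lin1(x)))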
@@ -301,7 +301,7 @@

\begin{description}
\item[Masked self-attention] \marginnote{Masked self-attention}
-Modification to self-attention to prevent tokens to attend at future positions (i.e., at their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., make the attention weights a triangular matrix).
+Modification to self-attention to prevent tokens from attending to future positions (i.e., at their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., making the attention weights a triangular matrix).

\begin{figure}[H]
\centering
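
A small NumPy sketch of the first option: setting the similarity scores of future positions to $-\infty$, so that the row-wise softmax assigns them zero weight and the attention weights become a (lower) triangular matrix:

import numpy as np

def causal_mask(scores):
    # scores: (N, N) self-attention similarity scores.
    future = np.triu(np.ones(scores.shape, dtype=bool), k=1)   # positions to the right of each token
    return np.where(future, -np.inf, scores)                   # softmax later maps -inf to weight 0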
@@ -318,7 +318,7 @@

\begin{figure}[H]
\centering
-\includegraphics[width=0.8\linewidth]{./img/_self_attention_permutation.jpg}
+\includegraphics[width=0.7\linewidth]{./img/_self_attention_permutation.jpg}
\end{figure}
\end{remark}
@@ -372,7 +372,7 @@
\end{remark}

\begin{remark}
-Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them together is not strictly necessary.
+Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them individually is not strictly necessary.
\end{remark}

\begin{description}