diff --git a/src/year2/machine-learning-for-computer-vision/sections/_architectures.tex b/src/year2/machine-learning-for-computer-vision/sections/_architectures.tex
index 0f6e36c..fd49d2c 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_architectures.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_architectures.tex
@@ -292,7 +292,7 @@ Network with bottleneck-block-inspired inception modules.
             Global average pooling to obtain a channel-wise vector.
 
         \item[Excitation]
-            Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them.
+            Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them. A final sigmoid gives the channel weights.
     \end{descriptionlist}
@@ -341,7 +341,7 @@ Network with bottleneck-block-inspired inception modules.
     \begin{description}
         \item[Inverted residual block] \marginnote{Inverted residual block}
-            Modified bottleneck block defined as follows:
+            Modified bottleneck block composed of:
             \begin{enumerate}
                 \item A $1 \times 1$ convolution to expand the input channels by a factor of $t$.
                 \item A $3 \times 3$ depth-wise convolution.
@@ -370,25 +370,25 @@ Network with bottleneck-block-inspired inception modules.
     \begin{table}[H]
         \centering
-        \caption{\parbox[t]{0.6\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
+        \caption{\parbox[t]{0.77\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
         \small
         \begin{tabular}{cccccc}
             \toprule
             \textbf{Input} & \textbf{Operator} & $t$ & $c$ & $n$ & $s$ \\
             \midrule
-            $2242 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
+            $224^2 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
             \midrule
-            $1122 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
-            $1122 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
-            $562 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
-            $282 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
-            $142 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
-            $142 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
-            $72 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
+            $112^2 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
+            $112^2 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
+            $56^2 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
+            $28^2 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
+            $14^2 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
+            $14^2 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
+            $7^2 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
             \midrule
-            $72 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
+            $7^2 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
             \midrule
-            $72 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
+            $7^2 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
             $1 \times 1 \times 1280$ & \texttt{conv2d $1\times1$} & - & k & - & 1 \\
             \bottomrule
         \end{tabular}
@@ -401,7 +401,7 @@ Network with bottleneck-block-inspired inception modules.
     \begin{description}
         \item[Single dimension scaling]
-            Scaling a baseline model by width, depth, or resolution. It generally always improve the accuracy.
+            Scaling a baseline model by width, depth, or resolution generally improves the accuracy.
             \begin{description}
                 \item[Width scaling] \marginnote{Width scaling}
@@ -443,7 +443,7 @@ Network with bottleneck-block-inspired inception modules.
             In practice, $\alpha$, $\beta$, and $\gamma$ are determined through grid search.
 
             \begin{remark}
-                The constraint is formulated in this way as FLOPS scales linearly with depth but quadratically with width and resolution.
+                The constraint is formulated in this way because FLOPS scales linearly with depth but quadratically with width and resolution.
            \end{remark}
        \end{description}
    \end{description}
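The hunks above touch the squeeze-and-excitation description and the inverted residual block. For reference, a minimal PyTorch-style sketch of the two blocks as described in the notes (assumes the torch package; class names are illustrative, and the ReLU6 activations and the linear 1x1 projection follow the usual MobileNetV2 convention rather than being spelled out in the hunks):

    import torch
    import torch.nn as nn

    class SEBlock(nn.Module):
        """Squeeze-and-excitation: pool to a channel vector, compress by r, restore, sigmoid-gate."""
        def __init__(self, channels: int, r: int = 16):
            super().__init__()
            self.squeeze = nn.AdaptiveAvgPool2d(1)              # squeeze: global average pooling
            self.excite = nn.Sequential(                        # excitation: bottleneck MLP
                nn.Linear(channels, channels // r),
                nn.ReLU(inplace=True),
                nn.Linear(channels // r, channels),
                nn.Sigmoid(),                                   # final sigmoid gives channel weights
            )

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            b, c, _, _ = x.shape
            w = self.excite(self.squeeze(x).view(b, c)).view(b, c, 1, 1)
            return x * w                                        # rescale each channel

    class InvertedResidual(nn.Module):
        """1x1 expansion by t, 3x3 depth-wise convolution, 1x1 linear projection, optional skip."""
        def __init__(self, c_in: int, c_out: int, t: int = 6, stride: int = 1):
            super().__init__()
            hidden = c_in * t
            self.use_skip = stride == 1 and c_in == c_out
            self.block = nn.Sequential(
                nn.Conv2d(c_in, hidden, 1, bias=False),                               # expand by t
                nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
                nn.Conv2d(hidden, hidden, 3, stride, 1, groups=hidden, bias=False),   # depth-wise
                nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
                nn.Conv2d(hidden, c_out, 1, bias=False),                              # linear projection
                nn.BatchNorm2d(c_out),
            )

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            y = self.block(x)
            return x + y if self.use_skip else y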
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex b/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex
index d078220..5c2786b 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex
@@ -105,7 +105,7 @@ Methods that also consider the second-order derivatives when determining the ste
        \]
        where $\mu \in [0, 1[$ is the momentum coefficient.
 
-        In other words, $v^{(t+1)}$ represents a weighted average of the updates steps done up until time $t$.
+        In other words, $v^{(t+1)}$ represents a weighted average of the update steps done up until time $t$.
 
        \begin{remark}
            Momentum helps to counteract a poor conditioning of the Hessian matrix when working with canyons.
@@ -134,7 +134,7 @@ Methods that also consider the second-order derivatives when determining the ste
        \]
 
        \begin{remark}
-            The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as $\matr{\theta}^{(t)}$ has been partially updated.
+            The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as it has been partially updated.
        \end{remark}
 
        \begin{remark}
@@ -228,7 +228,7 @@ Methods that also consider the second-order derivatives when determining the ste
        where $\beta \in [0, 1]$ (typically $0.9$ or higher) makes $s^{(t)}$ an exponential moving average.
 
        \begin{remark}
-            RMSProp is faster than SGD at the beginning and slows down reaching similar performances as SGD.
+            RMSProp is faster than SGD at the beginning before slowing down and reaching performance similar to SGD.
        \end{remark}
 
        \begin{figure}[H]
            \centering
@@ -254,11 +254,11 @@ Methods that also consider the second-order derivatives when determining the ste
        Moreover, as $\vec{g}^{(0)} = 0$, $\vec{s}^{(0)} = 0$, and $\beta_1$, $\beta_2$ are typically large (i.e., past history weighs more), Adam starts by taking small steps (e.g., $\vec{g}^{(1)} = (1-\beta_1) \nabla\mathcal{L}(\vec{\theta}^{(0)})$ is simply rescaling the gradient for no reason). To cope with this, a debiased formulation of $\vec{g}$ and $\vec{s}$ is used:
        \[
-            \vec{g}^{(t)}_{\text{debiased}} = \frac{g^{(t+1)}}{1-\beta_1^{t+1}}
+            \vec{g}^{(t)}_{\text{debiased}} = \frac{\vec{g}^{(t+1)}}{1-\beta_1^{t+1}}
            \qquad
-            \vec{s}^{(t)}_{\text{debiased}} = \frac{s^{(t+1)}}{1-\beta_2^{t+1}}
+            \vec{s}^{(t)}_{\text{debiased}} = \frac{\vec{s}^{(t+1)}}{1-\beta_2^{t+1}}
        \]
-        where the denominator $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.
+        where the denominators $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.
 
        Finally, the update is defined as:
        \[
@@ -296,7 +296,7 @@ Methods that also consider the second-order derivatives when determining the ste
        \end{remark}
 
        \begin{remark}
-            Momentum based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape" small basins.
+            Momentum-based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape'' smaller ones.
 
            \begin{figure}[H]
                \centering
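Since the Adam hunk above touches the debiasing formulas, here is a small NumPy sketch of one Adam step using those bias-corrected averages (a sketch only; the function name and hyper-parameter defaults are common conventions, not values taken from the notes):

    import numpy as np

    def adam_step(theta, grad, g, s, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
        """One Adam update; g and s are the running averages, initialised to zero before t = 0."""
        g = beta1 * g + (1 - beta1) * grad              # momentum-like average of gradients
        s = beta2 * s + (1 - beta2) * grad ** 2         # RMSProp-like average of squared gradients
        g_hat = g / (1 - beta1 ** (t + 1))              # debiased: denominator -> 1 as t grows
        s_hat = s / (1 - beta2 ** (t + 1))
        theta = theta - lr * g_hat / (np.sqrt(s_hat) + eps)
        return theta, g, s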
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_transformers.tex b/src/year2/machine-learning-for-computer-vision/sections/_transformers.tex
index e5040b3..b3b00c4 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_transformers.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_transformers.tex
@@ -70,7 +70,7 @@
            With the projection $\matr{W}_K \in \mathbb{R}^{d_Y \times d_K}$ such that $\mathbb{R}^{M \times d_K} \ni \matr{K} = \matr{Y} \matr{W}_K$, where $d_K$ is the dimension of the keys.
 
        \item[Query]
-            With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \matr{Y} \matr{W}_X$, where $d_X$ is the length of $\vec{x}_1$ that is no longer required to be $d_Y$ as there is a projection.
+            With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \vec{x}_1 \matr{W}_Q$, where $d_X$ is the length of $\vec{x}_1$, which is no longer required to be $d_Y$ as there is a projection.
 
        \item[Values]
            With the projection $\matr{W}_V \in \mathbb{R}^{d_Y \times d_V}$ such that $\mathbb{R}^{M \times d_V} \ni \matr{V} = \matr{Y} \matr{W}_K$, where $d_V$ is the dimension of the values.
@@ -88,7 +88,7 @@
        % \caption{Steps of scaled dot-product attention}
    \end{figure}
 
-    Finally, due to the linear projections, instead of a single vector there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.
+    Finally, due to the linear projections, instead of a single vector, there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.
    The overall attention mechanism can be defined as:
    \[ \matr{A} = \texttt{softmax}_\texttt{row-wise}\left( \frac{\matr{Q}\matr{K}^T}{\sqrt{d_K}} \right) \matr{V} \]
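To make the attention formula in the hunk above concrete, a NumPy sketch of scaled dot-product attention for $N$ queries against $M$ key/value pairs (illustrative only; the function name is a placeholder and the array shapes follow the notation of the notes):

    import numpy as np

    def scaled_dot_product_attention(Q, K, V):
        """A = softmax_row-wise(Q K^T / sqrt(d_K)) V with Q: (N, d_K), K: (M, d_K), V: (M, d_V)."""
        d_k = K.shape[-1]
        scores = Q @ K.T / np.sqrt(d_k)                 # similarity scores, shape (N, M)
        scores -= scores.max(axis=-1, keepdims=True)    # shift for numerical stability
        weights = np.exp(scores)
        weights /= weights.sum(axis=-1, keepdims=True)  # row-wise softmax
        return weights @ V                              # output activations, shape (N, d_V)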
@@ -166,7 +166,7 @@
    \end{remark}
 
    \begin{remark}
-        Layer normalization is easier to distribute on multiple computation units and has the same behavior at both train and inference time.
+        Layer normalization is easier to distribute on multiple computation units and has the same behavior at both training and inference time.
    \end{remark}
 
    \begin{figure}[H]
        \centering
@@ -176,7 +176,7 @@
    \end{figure}
 
    \item[Feed-forward network (\texttt{FFN})] \marginnote{Feed-forward network}
-        MLP with one hidden layer applied to each token independently. ReLU or one of its variants are used as activation function:
+        MLP with one hidden layer applied to each token independently. ReLU or one of its variants is used as the activation function:
        \[ \texttt{FFN}(\vec{x}) = \texttt{relu}(\vec{x}\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \]
        \begin{remark}
@@ -301,7 +301,7 @@
    \begin{description}
        \item[Masked self-attention] \marginnote{Masked self-attention}
-            Modification to self-attention to prevent tokens to attend at future positions (i.e., at their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., make the attention weights a triangular matrix).
+            Modification to self-attention to prevent tokens from attending to future positions (i.e., to their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., making the attention weights a triangular matrix).
 
            \begin{figure}[H]
                \centering
@@ -318,7 +318,7 @@
            \begin{figure}[H]
                \centering
-                \includegraphics[width=0.8\linewidth]{./img/_self_attention_permutation.jpg}
+                \includegraphics[width=0.7\linewidth]{./img/_self_attention_permutation.jpg}
            \end{figure}
        \end{remark}
@@ -372,7 +372,7 @@
    \end{remark}
 
    \begin{remark}
-        Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them together is not strictly necessary.
+        Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them individually is not strictly necessary.
    \end{remark}
 
    \begin{description}
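As a companion to the masked self-attention hunk, a NumPy sketch of the $-\infty$-masking variant (a sketch under the same shape conventions as above; the function name and the projection matrices W_q, W_k, W_v are placeholders):

    import numpy as np

    def masked_self_attention(X, W_q, W_k, W_v):
        """Causal self-attention: token i only attends to positions j <= i."""
        Q, K, V = X @ W_q, X @ W_k, X @ W_v
        d_k = K.shape[-1]
        scores = Q @ K.T / np.sqrt(d_k)
        n = scores.shape[0]
        future = np.triu(np.ones((n, n), dtype=bool), k=1)    # strictly upper triangle = future positions
        scores = np.where(future, -np.inf, scores)            # similarity with future tokens set to -inf
        scores -= scores.max(axis=-1, keepdims=True)
        weights = np.exp(scores)                              # exp(-inf) = 0, so future tokens get weight 0
        weights /= weights.sum(axis=-1, keepdims=True)        # attention weights form a lower-triangular matrix
        return weights @ V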