From 0969f767e9410f2959dca79b2f1e055a1afc3614 Mon Sep 17 00:00:00 2001
From: NotXia <35894453+NotXia@users.noreply.github.com>
Date: Thu, 20 Jun 2024 10:48:32 +0200
Subject: [PATCH] Fix typos

---
 .../module2/sections/_architectures.tex   |  9 +++++----
 .../module2/sections/_classification.tex  | 10 +++++-----
 .../module2/sections/_image_formation.tex | 11 ++++++-----
 .../module2/sections/_training.tex        |  2 +-
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_architectures.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_architectures.tex
index 2d56ca0..2e1cd01 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_architectures.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_architectures.tex
@@ -5,7 +5,7 @@
 \begin{description}
     \item[Stem layer] \marginnote{Stem layer}
-    First convolutional layer of a CNN that aims to reduce the spatial size of the activations for memory and computational purposes
+    First convolutional layer(s) of a CNN that aims to reduce the spatial size of the activations for memory and computational purposes
     but also to rapidly increase the receptive field.
 
     \item[Model parallelism] \marginnote{Model parallelism}
@@ -299,6 +299,7 @@ The authors constrained the layers to:
             On the other hand, two activations are computed and both need to be stored for backpropagation.
 
+        \indenttbox
         \begin{example} \phantom{}
             \begin{center}
@@ -451,7 +452,7 @@ Network that aims to optimize computing resources.
 \begin{description}
     \item[Stem layers]
         Down-sample the image from a shape of 224 to 28.
-        As in ZFNet, multiple layers are used (5) and the largest convolution is of shape $7 \times 7$ and stride $2$.
+        As in ZFNet, multiple layers are used (5) and the largest convolution is of shape $7 \times 7$ with stride $2$.
 
     \item[Inception module] \marginnote{Inception module}
         Main component of Inception-v1 that computes multiple convolutions on the input.
@@ -621,7 +622,7 @@ A larger version of Inception v3 with more complicated stem layers.
 \begin{description}
     \item[Standard residual block] \marginnote{Standard residual block}
-        Block that allows to easily learn the identity function through skip connections.
+        Block that allows the network to easily learn the identity function through a skip connection.
         The output of a residual block with input $x$ and a series of convolutional layers $F$ is:
         \[ F(x; \matr{\theta}) + x \]
@@ -779,7 +780,7 @@ It has the following properties:
     \item The majority of the possible paths have a length of $\sim 30$.
     \item The gradient magnitude is significant at the first layers (i.e. in shorter paths).
 \end{itemize}
-By multiplying values of two points above, results show that the total gradient magnitude is significant only up until paths of length $\sim 20$.
+By multiplying the values of the two points above, results show that the total gradient magnitude is significant only up until paths of length $\sim 20$.
 
 \begin{figure}[H]
     \centering
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
index e219add..4685c1d 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
@@ -291,7 +291,7 @@ The prediction is obtained as the index of the maximum score.
 \begin{figure}[H]
     \centering
-    \includegraphics[width=0.45\linewidth]{./img/data_representation_linear.png}
+    \includegraphics[width=0.40\linewidth]{./img/data_representation_linear.png}
     \caption{
         \parbox[t]{0.6\linewidth}{
             Example of non-linearly separable data points that become linearly separable in polar coordinates
@@ -386,12 +386,12 @@ The prediction is obtained as the index of the maximum score.
 
 \begin{figure}[H]
     \centering
-    \begin{subfigure}{0.55\linewidth}
+    \begin{subfigure}{0.6\linewidth}
         \centering
         \includegraphics[width=\linewidth]{./img/relu_separability_1.png}
     \end{subfigure}
 
-    \begin{subfigure}{0.55\linewidth}
+    \begin{subfigure}{0.6\linewidth}
         \centering
         \includegraphics[width=\linewidth]{./img/relu_separability_2.png}
     \end{subfigure}
@@ -442,7 +442,7 @@ Image filtering can be implemented through:
 
 Given an image of size $H \times W$, a convolution requires:
 \begin{itemize}
-    \item $2$ parameters.
+    \item $2$ parameters (in the case of edge detection).
     \item $3 (H \cdot (W-1)) \approx 3HW$ FLOPs.
 \end{itemize}
 
@@ -469,7 +469,7 @@ Image filtering can be implemented through:
 \begin{description}
     \item[Multi-channel convolution] \marginnote{Multi-channel convolution}
-        On inputs with multiple channels (i.e. 3D inputs), different 2D convolutions are applied across the different channels.
+        On inputs with multiple channels (e.g. RGB images), different 2D convolutions are applied across the different channels.
 
         Given a $C_\text{in} \times H_\text{in} \times W_\text{in}$ image $I$,
         a convolution kernel $K$ will have shape $C_\text{in} \times H_K \times W_K$
         and the output activation at each pixel is computed as:
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex
index 964afed..a1a87ba 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex
@@ -84,7 +84,7 @@ is done in two steps:
 \marginnote{Roto-translation}
 
 The conversion from the world reference system to the camera reference system
-is done through a roto-translation wrt the optical center.
+is done through a roto-translation w.r.t. the optical center.
 
 Given:
 \begin{itemize}
@@ -111,7 +111,7 @@ the coordinates $\vec{M}_C$ in CRF corresponding to $\vec{M}_W$ are given by:
 \]
 
 \begin{remark}
-    The coordinates $\vec{C}_W$ of the optical center $\vec{C}$ are obtained as:
+    The coordinates $\vec{C}_W$ of the optical center $\vec{C} = \nullvec$ are obtained as:
     \[
         \nullvec = \matr{R}\vec{C}_W + \vec{t}
         \iff
         (\nullvec - \vec{t}) = \matr{R}\vec{C}_W
@@ -378,7 +378,7 @@ where:
     \]
     where $p_1$ and $p_2$ are additional intrinsic parameters.
     \begin{remark}
-        This approximation has empirically been shown to work.
+        This approximation has been empirically shown to work.
     \end{remark}
 \end{itemize}
 
@@ -620,7 +620,7 @@ Therefore, the complete workflow for image formation becomes the following:
     \item[Homographies non-linear refinement]
         The homographies $\matr{H}_i$ estimated at the previous step are obtained using a linear method
         and need to be refined as, for each image $i$,
-        the IRF coordinates $\matr{H}_i\vec{w}_j = (\frac{h_{i, 1}^T \tilde{\vec{w}}_j}{h_{i, 3}^T \tilde{\vec{w}}_j}, \frac{h_{i, 2}^T \tilde{\vec{w}}_j}{h_{i, 3}^T \tilde{\vec{w}}_j})$
+        the IRF coordinates $\matr{H}_i\vec{w}_j = \left( \frac{h_{i, 1}^T \tilde{\vec{w}}_j}{h_{i, 3}^T \tilde{\vec{w}}_j}, \frac{h_{i, 2}^T \tilde{\vec{w}}_j}{h_{i, 3}^T \tilde{\vec{w}}_j} \right)$
         of the world point $\vec{w}_j$ are still not matching the known IRF coordinates $\vec{m}_{i,j}$ of the $j$-corner in the $i$-image.
 
 \begin{figure}[H]
     \centering
@@ -896,7 +896,7 @@ The computed input coordinates might be continuous. Possible discretization stra
 
 \subsection{Undistort warping}
 
-Once a camera has been calibrated, the lens distortion parameters can be used to obtain the undistorted image through backward warping.
+Once a camera has been calibrated using Zhang's method, the lens distortion parameters can be used to obtain the undistorted image through backward warping.
 \[
     \begin{split}
         w_u &= u_\text{undist} + (k_1 r^2 + k_2 r^4)(u_\text{undist} - u_0) \\
         w_v &= v_\text{undist} + (k_1 r^2 + k_2 r^4)(v_\text{undist} - v_0)
     \end{split}
 \]
@@ -1018,6 +1018,7 @@ Undistorted images enjoy some properties:
     Finally, the homography $\matr{A}\matr{R}_\text{pitch}\matr{A}^{-1}$ relates the pitched image to the ideal image.
 
+    \indenttbox
     \begin{remark}
         The same procedure can be done for the yaw.
     \end{remark}
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_training.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_training.tex
index 3a4cc8f..529d023 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_training.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_training.tex
@@ -297,7 +297,7 @@ Given $C$ classes, labels can be smoothed by assuming a small uniform noise $\va
 Dropout and batch normalization show a general pattern for regularization:
 \begin{itemize}
     \item At train time, some randomness is added.
-    Ad test time, inference is done by averaging or approximating the output of the network.
+    At test time, inference is done by averaging or approximating the output of the network.
 \end{itemize}
 \end{remark}
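
Reviewer note (not part of the diff): the undistort-warping hunk in _image_formation.tex can be sanity-checked with a minimal sketch of the backward warping it describes. Everything below is illustrative rather than code from the notes: the function and parameter names are made up, only the radial coefficients k1, k2 are modelled (no tangential terms), r is assumed to be the radius of the normalized image coordinates as in Zhang's model, and nearest-neighbour rounding is used as the discretization strategy.

import numpy as np

def undistort(img, fx, fy, u0, v0, k1, k2):
    """Remove radial lens distortion through backward warping.

    For every pixel (u, v) of the undistorted output, compute the
    continuous source coordinates (w_u, w_v) in the distorted input
    with w = undist + (k1 r^2 + k2 r^4)(undist - c), then sample the
    input image there.
    """
    H, W = img.shape  # grayscale image for simplicity
    out = np.zeros_like(img)
    for v in range(H):
        for u in range(W):
            # Normalized coordinates of the undistorted pixel.
            x = (u - u0) / fx
            y = (v - v0) / fy
            r2 = x * x + y * y
            factor = k1 * r2 + k2 * r2 * r2
            # Distorted (source) coordinates, still continuous.
            w_u = u + factor * (u - u0)
            w_v = v + factor * (v - v0)
            # Nearest-neighbour discretization of the source coordinates.
            su, sv = round(w_u), round(w_v)
            if 0 <= su < W and 0 <= sv < H:
                out[v, u] = img[sv, su]
    return out

Iterating over the output pixels and sampling the input (backward warping) assigns a value to every output pixel, avoiding the holes that forward warping can leave.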