Add IPCV2 training recipes

This commit is contained in:
2024-06-03 22:20:09 +02:00
parent 0a378adcbf
commit d915f2c40a
14 changed files with 371 additions and 0 deletions

View File

@@ -15,5 +15,7 @@
\input{./sections/_image_formation.tex}
\input{./sections/_classification.tex}
\input{./sections/_architectures.tex}
\input{./sections/_training.tex}
\eoc
\end{document}

View File

@@ -0,0 +1,369 @@
\chapter{Training recipes}
\begin{description}
\item[Model capacity] \marginnote{Model capacity}
Capability of a network to fit the train set.
It depends on the architecture of the model.
\begin{remark}
By varying the architecture of a model, the resulting network might overfit or underfit.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_model_capacity.pdf}
\end{figure}
\item[Effective capacity] \marginnote{Effective capacity}
Actual capacity of a model that depends on the training hyperparameters (e.g. learning rate, epochs, \dots).
\begin{remark}
In practice, a model with a large theoretical capacity is used and its effective capacity is tuned through the training hyperparameters.
\end{remark}
\end{description}
\section{Learning rate schedule}
\marginnote{Learning rate schedule}
Mixture of high and low learning rates to find a good compromise between speed and accuracy.
\begin{remark}
If the learning rate is too high, after the first iteration, updates might get stuck in a valley.
\end{remark}
\begin{remark}
Intuitively, learning rate schedulers make it possible to find wide minima (i.e. skip narrow minima and reach a basin whose width is ``compatible'' with the step size).
Moreover, the test loss is usually a shifted and distorted version of the train loss. Therefore, a wider minimum results in a more robust model.
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/_lr_schedule_steps.pdf}
\end{figure}
\end{remark}
\begin{minipage}{0.6\linewidth}
\begin{description}
\item[Step]
Start with a high learning rate and divide it by 10 when reaching a plateau.
\end{description}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/lr_schedule_step.png}
\end{figure}
\end{minipage}
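As a sketch, a step schedule can be implemented with PyTorch's \texttt{MultiStepLR} (the milestones and the dummy model below are example values); \texttt{ReduceLROnPlateau} is the variant that divides the learning rate when a monitored validation metric stops improving:
\begin{verbatim}
import torch

model = torch.nn.Linear(10, 2)  # dummy model, placeholder for a real network
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Divide the learning rate by 10 at epochs 30, 60 and 90 (example milestones).
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[30, 60, 90], gamma=0.1)

for epoch in range(100):
    # ... one training epoch using `optimizer` would go here ...
    scheduler.step()  # update the learning rate at the end of the epoch
\end{verbatim}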
\begin{minipage}{0.6\linewidth}
\begin{description}
\item[Cosine]
Continuous decay of the learning rate that follows the cosine function.
Given the number of training epochs $E$, the starting learning rate $\texttt{lr}_0$ and ending learning rate $\texttt{lr}_E$, the learning rate at epoch $e$ is given by:
\[ \texttt{lr}_e = \texttt{lr}_E + \frac{1}{2}(\texttt{lr}_0 - \texttt{lr}_E) \left( 1 + \cos\left( \frac{e\pi}{E} \right) \right) \]
\begin{remark}
Compared to the step scheduler, it only requires two hyperparameters (starting and ending learning rate).
\end{remark}
\end{description}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/lr_schedule_cosine.png}
\end{figure}
\end{minipage}
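The formula above translates directly into Python (PyTorch's \texttt{CosineAnnealingLR} implements the same rule, with \texttt{eta\_min} playing the role of the ending learning rate):
\begin{verbatim}
import math

def cosine_lr(e: int, E: int, lr_0: float, lr_E: float) -> float:
    """Cosine decay from lr_0 (at epoch 0) to lr_E (at epoch E)."""
    return lr_E + 0.5 * (lr_0 - lr_E) * (1 + math.cos(e * math.pi / E))

# Example: 100 epochs, from 0.1 down to 1e-4.
lrs = [cosine_lr(e, 100, 0.1, 1e-4) for e in range(101)]
\end{verbatim}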
\begin{minipage}{0.6\linewidth}
\begin{description}
\item[Linear]
Continuous decay of the learning rate that follows a linear function.
Given the number of training epochs $E$, the starting learning rate $\texttt{lr}_0$ and ending learning rate $\texttt{lr}_E$, the learning rate at epoch $e$ is given by:
\[ \texttt{lr}_e = \texttt{lr}_E + (\texttt{lr}_0 - \texttt{lr}_E) \left( 1 - \frac{e}{E} \right) \]
\end{description}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/lr_schedule_linear.png}
\end{figure}
\end{minipage}
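The same formula in Python, with a quick sanity check of the endpoints (example values):
\begin{verbatim}
def linear_lr(e: int, E: int, lr_0: float, lr_E: float) -> float:
    """Linear decay from lr_0 (at epoch 0) to lr_E (at epoch E)."""
    return lr_E + (lr_0 - lr_E) * (1 - e / E)

assert linear_lr(0, 100, 0.1, 1e-4) == 0.1                 # starts at lr_0
assert abs(linear_lr(100, 100, 0.1, 1e-4) - 1e-4) < 1e-12  # ends at lr_E
\end{verbatim}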
\begin{minipage}{0.6\linewidth}
\begin{description}
\item[Warm-up]
Start with a small learning rate for a few steps (epochs or batch steps), then increase it to its nominal value before progressively decaying it.
\begin{remark}
This is useful for large networks where poor initialization and high learning rates at the beginning might slow down convergence.
\end{remark}
\end{description}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/lr_schedule_warmup.png}
\end{figure}
\end{minipage}
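One common (but not unique) way to combine warm-up with a decaying schedule is a linear ramp followed by a cosine decay; a sketch with hypothetical hyperparameters:
\begin{verbatim}
import math

def warmup_cosine_lr(e, E, warmup, lr_0, lr_peak, lr_E):
    """Linear warm-up from lr_0 to lr_peak over `warmup` epochs,
    then cosine decay from lr_peak to lr_E."""
    if e < warmup:
        return lr_0 + (lr_peak - lr_0) * e / warmup
    t = (e - warmup) / (E - warmup)
    return lr_E + 0.5 * (lr_peak - lr_E) * (1 + math.cos(math.pi * t))
\end{verbatim}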
\begin{description}
\item[One cycle]
Scheduler defined over batch steps: the learning rate starts small, grows until a maximum is reached, and then decays.
\begin{minipage}{0.6\linewidth}
Given:
\begin{itemize}
\item The number of training iterations $I$,
\item The starting learning rate $\texttt{lr}_0$,
\item The peak learning rate $\texttt{lr}_{\max}$,
\item The ending learning rate $\texttt{lr}_{\min}$,
\item The percentage of iterations $p$ for which the learning rate should increase,
\end{itemize}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/lr_schedule_1cycle.png}
\end{figure}
\end{minipage}
The learning rate at iteration $i$ is given by:
\[
\texttt{lr}_i =
\begin{cases}
\texttt{lr}_{\max} + (\texttt{lr}_0 - \texttt{lr}_{\max}) \left( 1 - \frac{i}{pI} \right) & \text{if $i < pI$} \\
\texttt{lr}_{\min} + (\texttt{lr}_{\max} - \texttt{lr}_{\min}) \left( 1 - \frac{i - pI}{I - pI} \right) & \text{if $i \geq pI$} \\
\end{cases}
\]
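A direct transcription of the piecewise formula (PyTorch provides \texttt{OneCycleLR}, which by default anneals with a cosine rather than a linear law):
\begin{verbatim}
def one_cycle_lr(i, I, p, lr_0, lr_max, lr_min):
    """Linear increase lr_0 -> lr_max for the first p*I iterations,
    then linear decrease lr_max -> lr_min for the remaining ones."""
    if i < p * I:
        return lr_max + (lr_0 - lr_max) * (1 - i / (p * I))
    return lr_min + (lr_max - lr_min) * (1 - (i - p * I) / (I - p * I))
\end{verbatim}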
\end{description}
\section{Regularization}
\begin{description}
\item[Regularization] \marginnote{Regularization}
Modifications to the model or the training procedure that aim to improve its generalization capability by reducing overfitting.
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/regularization.png}
\end{figure}
\end{description}
\subsection{Parameter norm penalties}
Add a regularization term to the loss:
\[
\mathcal{L}(\matr{\theta}; \mathcal{D}^\text{(train)}) =
\mathcal{L}^\text{(task)}(\matr{\theta}; \mathcal{D}^\text{(train)}) +
\lambda \mathcal{L}^\text{(reg)}(\matr{\theta})
\]
\begin{remark}
Intuitively, the penalty forces parameters to be small, thereby limiting the effective capacity of the model and improving generalization.
\end{remark}
\begin{description}
\item[L1 regularization]
The regularization term is:
\[ \mathcal{L}^\text{(reg)}(\matr{\theta}) = \sum_{i} \vert \matr{\theta}_i \vert = \Vert \matr{\theta} \Vert_1 \]
\item[L2 regularization]
The regularization term is:
\[ \mathcal{L}^\text{(reg)}(\matr{\theta}) = \sum_{i} \matr{\theta}_i^2 = \Vert \matr{\theta} \Vert_2^2 \]
\begin{remark}
When using plain SGD, L2 regularization is also called weight decay.
In fact, given the loss:
\[
\mathcal{L}(\matr{\theta}; \mathcal{D}^\text{(train)}) =
\mathcal{L}^\text{(task)}(\matr{\theta}; \mathcal{D}^\text{(train)}) +
\frac{\lambda}{2} \Vert \matr{\theta} \Vert_2^2
\]
The gradient update is:
\[
\begin{split}
\matr{\theta}^{(i+1)} &= \matr{\theta}^{(i)} - \texttt{lr} \nabla_\matr{\theta} \mathcal{L}(\matr{\theta}^{(i)}; \mathcal{D}^\text{(train)}) \\
&= \matr{\theta}^{(i)} - \texttt{lr} \nabla_\matr{\theta}\left[
\mathcal{L}^\text{(task)}(\matr{\theta}^{(i)}; \mathcal{D}^\text{(train)}) +
\frac{\lambda}{2} \Vert \matr{\theta}^{(i)} \Vert_2^2
\right] \\
&= \matr{\theta}^{(i)} - \texttt{lr} \left[
\nabla_\matr{\theta} \mathcal{L}^\text{(task)}(\matr{\theta}^{(i)}; \mathcal{D}^\text{(train)}) +
\lambda \matr{\theta}^{(i)}
\right] \\
&=
\underbrace{(1 - \texttt{lr} \lambda) \matr{\theta}^{(i)}}_{\mathclap{\text{Decayed parameter vector \phantom{aaaa}}}} -
\underbrace{\texttt{lr} \nabla_\matr{\theta} \mathcal{L}^\text{(task)}(\matr{\theta}^{(i)}; \mathcal{D}^\text{(train)})}_{\mathclap{\text{\phantom{aaaa} Standard gradient descent step}}}
\end{split}
\]
\end{remark}
\begin{remark}
The \texttt{weight\_decay} argument of more advanced optimizers (e.g. Adam) does not always correspond to L2 regularization.
In PyTorch, \texttt{Adam} adds the L2 term to the gradient, while \texttt{AdamW} implements decoupled weight decay (the decay is applied directly to the parameters, outside the adaptive update).
\end{remark}
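A minimal sketch of this equivalence on a dummy model: for plain SGD, adding the $\frac{\lambda}{2} \Vert \matr{\theta} \Vert_2^2$ penalty to the loss yields the same update as the \texttt{weight\_decay} argument of \texttt{torch.optim.SGD}:
\begin{verbatim}
import torch

torch.manual_seed(0)
model = torch.nn.Linear(5, 1)                # dummy model
x, y = torch.randn(8, 5), torch.randn(8, 1)  # dummy data
lam, lr = 1e-2, 0.1

# (a) Explicit L2 penalty added to the task loss.
opt = torch.optim.SGD(model.parameters(), lr=lr)
loss = torch.nn.functional.mse_loss(model(x), y)
loss = loss + 0.5 * lam * sum((p ** 2).sum() for p in model.parameters())
opt.zero_grad()
loss.backward()
opt.step()

# (b) Equivalent for plain SGD: the optimizer adds lam * theta to the gradient.
# opt = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=lam)
\end{verbatim}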
\end{description}
\subsection{Early stopping}
\marginnote{Early stopping}
Monitor performance on the validation set and either:
\begin{itemize}
\item Select the checkpoint with the best validation performance after a maximum number of epochs.
\item Set a hyperparameter (patience) that stops training if, after a certain number of steps, validation performance does not improve.
\end{itemize}
\begin{remark}
If possible, the first option is preferable as validation metrics might improve again after a few epochs of stagnation or decrease.
\end{remark}
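A sketch of a patience-based loop that also keeps the best checkpoint (\texttt{model}, the loaders, \texttt{max\_epochs} and the helpers \texttt{train\_one\_epoch} and \texttt{evaluate} are hypothetical placeholders):
\begin{verbatim}
import copy

patience, bad_epochs = 5, 0
best_metric, best_state = float("inf"), None

for epoch in range(max_epochs):
    train_one_epoch(model, train_loader)   # assumed helper
    metric = evaluate(model, val_loader)   # e.g. validation loss
    if metric < best_metric:
        best_metric, bad_epochs = metric, 0
        best_state = copy.deepcopy(model.state_dict())  # best checkpoint so far
    else:
        bad_epochs += 1
        if bad_epochs >= patience:         # no improvement for `patience` epochs
            break

model.load_state_dict(best_state)          # restore the best checkpoint
\end{verbatim}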
\subsection{Label smoothing}
\marginnote{Label smoothing}
When using cross-entropy with softmax, the model is pushed to drive the correct logit towards $+\infty$ and the others towards $-\infty$ so that the softmax outputs $1$ for the correct label. This is unnecessary and might cause overfitting.
Given $C$ classes, labels can be smoothed by assuming a small uniform noise $\varepsilon$:
\[
y_i = \begin{cases}
1 - \frac{\varepsilon(C-1)}{C} & \text{if $i$ is the correct label} \\
\frac{\varepsilon}{C} & \text{otherwise} \\
\end{cases}
\]
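A sketch that builds the smoothed targets exactly as above (note that $1 - \frac{\varepsilon(C-1)}{C} = (1-\varepsilon) + \frac{\varepsilon}{C}$, which is the convention used by the \texttt{label\_smoothing} argument of PyTorch's \texttt{CrossEntropyLoss}):
\begin{verbatim}
import torch

def smooth_targets(labels: torch.Tensor, C: int, eps: float) -> torch.Tensor:
    """Targets with eps/C on wrong classes and 1 - eps*(C-1)/C on the true one."""
    y = torch.full((labels.shape[0], C), eps / C)
    y[torch.arange(labels.shape[0]), labels] = 1 - eps * (C - 1) / C
    return y

# Built-in equivalent (PyTorch >= 1.10):
# criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
\end{verbatim}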
\subsection{Dropout}
\begin{description}
\item[Train time] \marginnote{Dropout}
For each batch step, generate a mask that sets some activations (usually $10\% - 50\%$) to zero during the forward pass.
\begin{remark}
Intuitively, the activations that are not dropped must learn not to rely on specific other neurons, thus avoiding a focus on overly selective information.
\end{remark}
\begin{remark}
Dropout can be seen as an ensemble of models where each mask is a new model.
\end{remark}
\item[Test time]
With dropout, the output of the network is stochastic, so a method to make inference deterministic is needed:
\begin{description}
\item[Naive approach]
A naive workaround is to sample some random masks and average the outputs of the network on these masks:
\[ f(\vec{x}; \matr{\theta}) = \mathbb{E}_\matr{m} [ f(\vec{x}; \matr{\theta}, \matr{m}) ] = \sum_\matr{m} p(\matr{m}) f(\vec{x}; \matr{\theta}, \matr{m}) \]
Although more accurate, this method is computationally expensive.
\item[Weight scaling]
Without loss of generality, consider an activation $a$ obtained as a linear combination of two neurons $x_1$ and $x_2$.
During test time, the activation is:
\[ a^\text{(test)} = w_1 x_1 + w_2 x_2 \]
Assuming dropout where each neuron is retained with probability $p=0.5$ (i.e. it has a $50\%$ probability of being dropped), the expected value of the activation $a^\text{(train)}$ at train time is:
\[
\begin{split}
\mathbb{E}_\matr{m} [ a^\text{(train)} ] &= \frac{1}{4}(w_1 x_1 + w_2 x_2) + \frac{1}{4}(w_1 x_1 + 0) + \frac{1}{4}(0 + w_2 x_2) + \frac{1}{4}(0 + 0) \\
&= \frac{1}{2}(w_1 x_1 + w_2 x_2) = p a^\text{(test)}
\end{split}
\]
There is, therefore, a $p$ factor of discrepancy between $a^\text{(train)}$ and $a^\text{(test)}$ that might disrupt the distribution of the activations. Two approaches can be taken:
\begin{itemize}
\item Rescale the value at test time:
\[ p a^\text{(test)} = \mathbb{E}_\matr{m} \left[ a^\text{(train)} \right] \]
\item Rescale the value at train time (inverted dropout):
\[ a^\text{(test)} = \mathbb{E}_\matr{m} \left[ \frac{a^\text{(train)}}{p} \right] \]
This approach is preferred as it leaves test time unchanged.
\end{itemize}
\begin{remark}
Weight scaling is exact only for linear layers. Still, with a non-linear activation, this method is a fast approximation of the output of the network with dropout.
\end{remark}
\end{description}
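A sketch of inverted dropout; note that the derivation above uses $p$ as the retention probability, while \texttt{torch.nn.Dropout} (which already implements inverted dropout) takes the drop probability as argument:
\begin{verbatim}
import torch

def inverted_dropout(x: torch.Tensor, p_drop: float, train: bool) -> torch.Tensor:
    """Train time: drop each activation with probability p_drop and divide the
    surviving ones by the keep probability. Test time: identity."""
    if not train or p_drop == 0.0:
        return x
    keep = 1.0 - p_drop
    mask = (torch.rand_like(x) < keep).float()
    return x * mask / keep
\end{verbatim}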
\end{description}
\begin{remark}
Dropout on convolutional layers is usually not necessary as they already have a strong inductive bias.
\end{remark}
\begin{remark}
Dropout and batch normalization show a general pattern for regularization:
\begin{itemize}
\item At train time, some randomness is added.
\item At test time, inference is done by averaging or approximating the output of the network.
\end{itemize}
\end{remark}
\subsection{Data augmentation}
\marginnote{Data augmentation}
Increase the size of a dataset by manipulating (e.g. flipping) the existing examples.
\begin{remark}
More data reduces the variance of the model and is therefore always beneficial.
\end{remark}
\begin{remark}
Transformations should be label-preserving (e.g. a $180^\circ$ rotation should not be applied on a $6$ or $9$).
\end{remark}
\begin{description}
\item[Multi-scale training] \marginnote{Multi-scale training}
Sample random crops and scales:
\begin{minipage}{0.7\linewidth}
\begin{enumerate}
\item Choose a random size $S$ in range $[S_{\min}, S_{\max}]$.
\item Isotropically (i.e. preserving the aspect ratio) rescale the training image so that its shorter side has size $S$.
\item Sample random patches of a given size.
\end{enumerate}
\end{minipage}
\begin{minipage}{0.2\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/multi_scale_augmentation.png}
\end{figure}
\end{minipage}
\begin{remark}
For $S$ close to the patch size, crops will capture whole-image statistics.
For $S$ bigger than the patch size, crops will cover small portions of the image (with the risk of getting areas where the target content is not present).
\end{remark}
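A sketch of the procedure with torchvision transforms (an integer passed to \texttt{Resize} rescales the shorter side while keeping the aspect ratio; the sizes are example values):
\begin{verbatim}
import random
import torchvision.transforms as T

def multi_scale_transform(s_min=256, s_max=480, patch=224):
    """Sample a random scale S and build the corresponding crop pipeline.
    In practice S is re-sampled for every image (or every batch)."""
    S = random.randint(s_min, s_max)
    return T.Compose([
        T.Resize(S),          # isotropic rescale: shorter side -> S
        T.RandomCrop(patch),  # random patch of fixed size
        T.ToTensor(),
    ])
\end{verbatim}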
\item[FixRes] \marginnote{FixRes}
At test time, the image usually shows the target at the center (photographer bias) and at a specific scale.
However, with data augmentation at train time, the apparent size of the target follows a distribution that is different from the one of the test set, as training usually involves smaller crops of the input image.
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/fixres1.png}
\end{figure}
Therefore, there is a discrepancy between train and test images as, during training, the network sees objects that are bigger due to the rescaling of the input image.
It has been shown that a possible solution to close this discrepancy is to train the model using images with a lower resolution than the test set. The possible alternatives are:
\begin{itemize}
\item Reduce train-time resolution.
\item Increase test-time resolution.
\end{itemize}
\begin{figure}[H]
\centering
\begin{subfigure}{0.6\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/fixres2.png}
\end{subfigure}
\begin{subfigure}{0.38\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/fixres3.png}
\end{subfigure}
\end{figure}
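A sketch of the first alternative (training at a lower resolution than the one used at test time), using torchvision transforms; the resolutions are example values:
\begin{verbatim}
import torchvision.transforms as T

train_res, test_res = 160, 224   # example: train at a lower resolution than test

train_tf = T.Compose([
    T.RandomResizedCrop(train_res),  # random crops and scales (augmentation)
    T.RandomHorizontalFlip(),
    T.ToTensor(),
])

test_tf = T.Compose([
    T.Resize(int(test_res * 1.15)),  # resize the shorter side...
    T.CenterCrop(test_res),          # ...then center crop (photographer bias)
    T.ToTensor(),
])
\end{verbatim}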
\end{description}