Add ML4CV Adam + ResNeXt

This commit is contained in:
2024-09-26 22:04:37 +02:00
parent 0be5931691
commit 4c887f7bc2
13 changed files with 372 additions and 2 deletions


View File

@ -9,5 +9,6 @@
\makenotesfront
\input{./sections/_optimizers.tex}
\input{./sections/_architectures.tex}
\end{document}

View File

@ -0,0 +1,282 @@
\chapter{Architectures}
\graphicspath{{../../year1/image-processing-and-computer-vision/module2/}}
\section{Inception-v1 (GoogLeNet)\protect\footnote{Excerpt from \href{https://raw.githubusercontent.com/NotXia/unibo-ai-notes/pdfs/year1/image-processing-and-computer-vision/module2/ipcv2.pdf}{IPCV2}}}
\marginnote{Inception-v1 (GoogLeNet)}
Network that aims to optimize computing resources (i.e., few parameters and FLOPs).
\begin{description}
\item[Stem layers]
Down-sample the image from a spatial resolution of $224 \times 224$ to $28 \times 28$.
As in ZFNet, multiple layers are used (5) and the largest convolution is of shape $7 \times 7$ with stride $2$.
\item[Inception module] \marginnote{Inception module}
Main component of Inception-v1 that computes multiple convolutions on the input.
Given the input activation, the output is the concatenation of:
\begin{itemize}
\item A $1 \times 1$ convolution (stride $1$) followed by a $5 \times 5$ convolution (stride $1$, padding $2$).
\item A $1 \times 1$ convolution (stride $1$) followed by a $3 \times 3$ convolution (stride $1$, padding $1$).
\item A $1 \times 1$ convolution (stride $1$, padding $0$).
\item A $3 \times 3$ max-pooling (stride $1$, padding $1$) followed by a $1 \times 1$ convolution (stride $1$).
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_actual_inception.pdf}
\caption{Inception module on the output of the stem layers}
\end{figure}
\begin{remark}
The multiple parallel convolutions of an inception module can be seen as decision components: the network can learn which filter size (i.e., receptive field) is most useful instead of committing to a single one.
\end{remark}
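As an illustration, a minimal PyTorch-style sketch of an inception module is given below (a hypothetical example: channel counts are illustrative and activations/batch normalization are omitted for brevity).
\begin{verbatim}
import torch
from torch import nn

class InceptionModule(nn.Module):
    # Four parallel branches whose outputs are concatenated channel-wise.
    def __init__(self, c_in, c1, c3, c5, c_pool):
        super().__init__()
        self.b1 = nn.Conv2d(c_in, c1, kernel_size=1)
        self.b3 = nn.Sequential(nn.Conv2d(c_in, c3, kernel_size=1),
                                nn.Conv2d(c3, c3, kernel_size=3, padding=1))
        self.b5 = nn.Sequential(nn.Conv2d(c_in, c5, kernel_size=1),
                                nn.Conv2d(c5, c5, kernel_size=5, padding=2))
        self.bpool = nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1),
                                   nn.Conv2d(c_in, c_pool, kernel_size=1))

    def forward(self, x):
        return torch.cat([self.b1(x), self.b3(x),
                          self.b5(x), self.bpool(x)], dim=1)

# e.g., on the 28x28 output of the stem layers
y = InceptionModule(192, 64, 128, 32, 32)(torch.randn(1, 192, 28, 28))
print(y.shape)  # torch.Size([1, 256, 28, 28])
\end{verbatim}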
\item[Auxiliary \texttt{softmax}]
Intermediate \texttt{softmax} classifiers are attached to hidden layers to ensure that the intermediate features are discriminative enough.
They also act as regularizers. During inference, they are discarded.
\item[Global average pooling classifier] \marginnote{Global average pooling classifier}
Instead of flattening between the convolutional and fully connected layers,
global average pooling is used to reduce the number of parameters.
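For instance, a $1024 \times 7 \times 7$ activation is reduced to a $1024$-dimensional vector before the classifier. A minimal sketch (sizes are illustrative):
\begin{verbatim}
import torch
from torch import nn

x = torch.randn(1, 1024, 7, 7)                    # last convolutional activation
features = nn.AdaptiveAvgPool2d(1)(x).flatten(1)  # shape: (1, 1024)
logits = nn.Linear(1024, 1000)(features)          # classifier over 1000 classes
print(features.shape, logits.shape)
\end{verbatim}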
\end{description}
\begin{figure}[H]
\centering
\includegraphics[angle=-90, width=0.7\linewidth]{./img/_inception_v1.pdf}
\caption{Architecture of Inception-v1}
\end{figure}
\section{Residual networks\protect\footnote{Excerpt from \href{https://raw.githubusercontent.com/NotXia/unibo-ai-notes/pdfs/year1/image-processing-and-computer-vision/module2/ipcv2.pdf}{IPCV2}}}
\begin{description}
\item[Standard residual block] \marginnote{Standard residual block}
Block that allows the network to easily learn the identity function through a skip connection.
The output of a residual block with input $x$ and a series of convolutional layers $F$ is:
\[ F(x; \matr{\theta}) + x \]
\begin{minipage}{0.75\linewidth}
\begin{description}
\item[Skip connection] \marginnote{Skip connection}
Connection that skips a certain number of layers (e.g. 2 convolutional blocks).
\end{description}
\begin{remark}
Training starts with small weights so that the block initially behaves approximately as the identity function. Updates can then be seen as perturbations of the identity function.
\end{remark}
\begin{remark}
Batch normalization is heavily used.
\end{remark}
\end{minipage}
\begin{minipage}{0.2\linewidth}
\centering
\includegraphics[width=0.8\linewidth]{./img/skip_conn.png}
\end{minipage}
\begin{remark}
The skip connection is added before the final activation function (ReLU); otherwise, the residual branch could only contribute non-negative values, making the perturbation of the identity function less effective.
\end{remark}
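A minimal PyTorch-style sketch of a standard residual block (assuming the input and output shapes match, so that the identity can be added directly):
\begin{verbatim}
import torch
from torch import nn

class ResidualBlock(nn.Module):
    # Two 3x3 convolutions with batch normalization; the skip connection
    # is added before the final ReLU.
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        out = torch.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return torch.relu(out + x)   # F(x) + x

x = torch.randn(1, 64, 56, 56)
print(ResidualBlock(64)(x).shape)  # torch.Size([1, 64, 56, 56])
\end{verbatim}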
\end{description}
\subsection{ResNet}
\marginnote{ResNet-18}
VGG-inspired network with residual blocks.
It has the following properties:\\
\begin{minipage}{0.48\linewidth}
\begin{itemize}
\item A stage is composed of residual blocks.
\item A residual block is composed of two $3 \times 3$ convolutions, each followed by batch normalization.
\item The first residual block of each stage halves the spatial dimension and doubles the number of channels (there is no pooling).
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}{0.45\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/resnet_18.png}
\caption{Architecture of ResNet-18}
\end{figure}
\end{minipage}
\begin{description}
\item[Bottleneck residual block] \marginnote{Bottleneck residual block}
Variant of the residual block that uses more layers with approximately the same number of parameters and FLOPs as the standard residual block.
Instead of using two $3 \times 3$ convolutions, a bottleneck residual block has the following structure:
\begin{itemize}
\item $1 \times 1$ convolution to compress the channels of the input by a factor of $4$ (and the spatial dimension by $2$ if it is the first block of a stage, as in the standard ResNet).
\item $3 \times 3$ convolution.
\item $1 \times 1$ convolution to match the shape of the skip connection.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/bottleneck_block.png}
\caption{Standard residual block (left) and bottleneck block (right)}
\end{figure}
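A minimal sketch of a bottleneck block (stride $1$ case; batch normalization is omitted for brevity):
\begin{verbatim}
import torch
from torch import nn

class BottleneckBlock(nn.Module):
    # 1x1 compress (channels/4) -> 3x3 -> 1x1 expand back, plus skip connection.
    def __init__(self, channels):
        super().__init__()
        mid = channels // 4
        self.reduce = nn.Conv2d(channels, mid, kernel_size=1)
        self.conv3x3 = nn.Conv2d(mid, mid, kernel_size=3, padding=1)
        self.expand = nn.Conv2d(mid, channels, kernel_size=1)

    def forward(self, x):
        out = torch.relu(self.reduce(x))
        out = torch.relu(self.conv3x3(out))
        return torch.relu(self.expand(out) + x)

x = torch.randn(1, 256, 56, 56)
print(BottleneckBlock(256)(x).shape)  # torch.Size([1, 256, 56, 56])
\end{verbatim}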
\end{description}
\subsection{Inception-ResNet-v4}
Network with bottleneck-block-inspired inception modules.
\begin{descriptionlist}
\item[Inception-ResNet-A] \marginnote{Inception-ResNet-A}
Three $1 \times 1$ convolutions are used to compress the input channels. Each of them leads to a different path:
\begin{itemize}
\item Directly to the final concatenation.
\item To a $3 \times 3$ convolution.
\item To two $3 \times 3$ convolutions (i.e. a factorized $5 \times 5$ convolution).
\end{itemize}
The final concatenation is passed through a $1 \times 1$ convolution to match the skip connection shape.
\item[Inception-ResNet-B] \marginnote{Inception-ResNet-B}
Two $1 \times 1$ convolutions are used to compress the input channels. Each of them leads to a different path:
\begin{itemize}
\item Directly to the final concatenation.
\item To a $1 \times 7$ followed by a $7 \times 1$ convolution (i.e. a factorized $7 \times 7$ convolution).
\end{itemize}
The final concatenation is passed through a $1 \times 1$ convolution to match the skip connection shape.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/inception_resnet.png}
\end{figure}
\graphicspath{}
\section{ResNeXt}
\begin{remark}
Inception and Inception-ResNet modules are multi-branch architectures and can be interpreted as a split-transform-merge paradigm. Moreover, their architectures have been specifically ``hand''-designed.
\end{remark}
\begin{description}
\item[Grouped convolution] \marginnote{Grouped convolution}
Given:
\begin{itemize}
\item The input activation of shape $C_\text{in} \times W_\text{in} \times H_\text{in}$,
\item The desired number of output channels $C_\text{out}$,
\item The number of groups $G$,
\end{itemize}
a grouped convolution splits the input into $G$ chunks of $\frac{C_\text{in}}{G}$ channels and processes each with a dedicated set of kernels of shape $\frac{C_\text{out}}{G} \times \frac{C_\text{in}}{G} \times W_K \times H_K$. The output activation is obtained by stacking the outputs of each group.
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_grouped_conv.pdf}
\end{figure}
By processing the input in smaller chunks, the following savings are obtained:
\begin{itemize}
\item The number of parameters is reduced by a factor of $G$.
\item The number of FLOPs is reduced by a factor of $G$.
\end{itemize}
\begin{remark}
Grouped convolutions are trivially less expressive than convolving on the full input activation. However, as convolutions are expected to build a hierarchy of features, it is reasonable to process the input in chunks as, probably, not all of it is needed.
\end{remark}
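A quick check of these savings using PyTorch's grouped convolution (sizes are illustrative):
\begin{verbatim}
from torch import nn

C_in, C_out, K, G = 256, 256, 3, 8

full = nn.Conv2d(C_in, C_out, K, padding=1, bias=False)
grouped = nn.Conv2d(C_in, C_out, K, padding=1, groups=G, bias=False)

n_full = sum(p.numel() for p in full.parameters())        # C_out * C_in * K * K
n_grouped = sum(p.numel() for p in grouped.parameters())  # C_out * (C_in/G) * K * K

print(n_full, n_grouped, n_full / n_grouped)  # 589824 73728 8.0
\end{verbatim}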
\end{description}
\subsection{Architecture}
\begin{description}
\item[ResNeXt block] \marginnote{ResNeXt block}
Given the number of branches $G$ and the number of intermediate channels $d$, a ResNeXt block decomposes a bottleneck residual block into $G$ parallel branches whose outputs are summed at the end (together with the skip connection).
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/_resnext_block.pdf}
\end{figure}
\begin{remark}
The branching in a ResNeXt block should not be confused with grouped convolutions.
\end{remark}
\begin{remark}
Parametrizing $G$ and $d$ allows obtaining configurations that are FLOP-wise comparable with the original ResNet by fixing $G$ and solving a second-order equation over $d$.
\end{remark}
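A direct multi-branch sketch of a ResNeXt block (sizes are illustrative; batch normalization is omitted):
\begin{verbatim}
import torch
from torch import nn

class ResNeXtBlockBranches(nn.Module):
    # G parallel branches (1x1 -> 3x3 -> 1x1), summed together with the skip.
    def __init__(self, channels, G, d):
        super().__init__()
        self.branches = nn.ModuleList([
            nn.Sequential(nn.Conv2d(channels, d, kernel_size=1),
                          nn.Conv2d(d, d, kernel_size=3, padding=1),
                          nn.Conv2d(d, channels, kernel_size=1))
            for _ in range(G)
        ])

    def forward(self, x):
        return torch.relu(sum(branch(x) for branch in self.branches) + x)

x = torch.randn(1, 256, 14, 14)
print(ResNeXtBlockBranches(256, G=32, d=4)(x).shape)  # torch.Size([1, 256, 14, 14])
\end{verbatim}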
\begin{description}
\item[Equivalent formulation]
Given an input activation $\vec{x}$ of shape $4C \times H \times W$, each layer of the ResNeXt block can be reformulated as follows:
\begin{descriptionlist}
\item[Second $1 \times 1$ convolution]
Without loss of generality, consider a ResNeXt block with $G=2$ branches.
The output $\vec{y}_k$ at each channel $k=1, \dots, 4C$ is obtained as:
\[ \vec{y}_k = \vec{y}_k^{(1)} + \vec{y}_k^{(2)} + \vec{x}_k \]
where the output $\vec{y}_k^{(b)}$ of a branch $b$ is computed as:
\[
\begin{split}
\vec{y}_k^{(b)}(j, i) &= \left[ \vec{w}^{(b)} * \vec{a}^{(b)} \right]_k(j, i) \\
&= \vec{w}^{(b)}_k \cdot \vec{a}^{(b)}(j, i) \\
&= \vec{w}^{(b)}_k(1) \vec{a}^{(b)}(j, i, 1) + \dots + \vec{w}^{(b)}_k(d) \vec{a}^{(b)}(j, i, d)
\end{split}
\]
where:
\begin{itemize}
\item $*$ represents a convolution,
\item $\vec{a}^{(b)}$ is the input activation with $d$ channels from the previous layer.
\item $\vec{w}^{(b)}$ is the convolutional kernel. $\vec{w}^{(b)}_k \in \mathbb{R}^{d}$ is the kernel used to obtain the $k$-th output channel.
\end{itemize}
By putting everything together:
\[
\begin{split}
\vec{y}_k(j, i) &= \vec{w}^{(1)}_k \cdot \vec{a}^{(1)}(j, i) + \vec{w}^{(2)}_k \cdot \vec{a}^{(2)}(j, i) + \vec{x}_k \\
&=
\underbrace{\left[ \vec{w}^{(1)}_k \vec{w}^{(2)}_k \right]}_{\hspace{1cm}\mathllap{\parbox{4cm}{\scriptsize by stacking, this is a $1\times1$ convolution with $2d$ channels}}}
\cdot
\underbrace{\left[ \vec{a}^{(1)}(j, i) \vec{a}^{(2)}(j, i) \right] }_{\hspace{-1cm}\mathrlap{\parbox{4cm}{\scriptsize by stacking depth-wise, this is an activation with $2d$ channels}}}
+\, \vec{x}_k
\end{split}
\]
Therefore, the last ResNeXt layer with $G$ branches is equivalent to a single convolution with $Gd$ input channels that processes the concatenation of the activations of the previous layer.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l3.pdf}
\end{figure}
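A small numerical check of this step (with $G = 2$ hypothetical branches and illustrative sizes): summing the outputs of two $1 \times 1$ convolutions applied to two separate activations equals a single $1 \times 1$ convolution applied to their depth-wise concatenation, whose kernel stacks the two original kernels.
\begin{verbatim}
import torch
from torch import nn

d, C4, H, W = 4, 8, 5, 5   # d intermediate channels, 4C = 8 output channels
a1, a2 = torch.randn(1, d, H, W), torch.randn(1, d, H, W)

conv1 = nn.Conv2d(d, C4, kernel_size=1, bias=False)
conv2 = nn.Conv2d(d, C4, kernel_size=1, bias=False)

merged = nn.Conv2d(2 * d, C4, kernel_size=1, bias=False)
with torch.no_grad():
    merged.weight.copy_(torch.cat([conv1.weight, conv2.weight], dim=1))

y_branches = conv1(a1) + conv2(a2)
y_merged = merged(torch.cat([a1, a2], dim=1))
print(torch.allclose(y_branches, y_merged, atol=1e-6))  # True
\end{verbatim}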
\item[First $1 \times 1$ convolution]
The $G$ $1 \times 1$ convolutions at the first layer of ResNeXt all process the same input $\vec{x}$. Trivially, this can also be represented using a single $1 \times 1$ convolution with $G$ times more output channels that can be split afterwards.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l1.pdf}
\end{figure}
\item[$3 \times 3$ convolution]
By putting together the previous two equivalences, the middle layer has the same definition as a grouped convolution with $G$ groups. Therefore, it can be seen as a single grouped convolution with $G$ groups and $Gd$ input and output channels.
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_resnext_to_resnet_l2.pdf}
\end{figure}
\end{descriptionlist}
\begin{remark}
Therefore, a ResNeXt block is equivalent to a bottleneck block in which the middle $3 \times 3$ convolution is a grouped convolution with $G$ groups.
\end{remark}
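The whole equivalence can be verified numerically by copying the branch weights into the merged formulation (sizes are illustrative; biases and the skip connection are omitted since they are identical on both sides):
\begin{verbatim}
import torch
from torch import nn

G, d, C4, H, W = 4, 8, 64, 7, 7
x = torch.randn(1, C4, H, W)

# Multi-branch form: G branches of 1x1 -> 3x3 -> 1x1, summed at the end.
b1 = [nn.Conv2d(C4, d, 1, bias=False) for _ in range(G)]
b3 = [nn.Conv2d(d, d, 3, padding=1, bias=False) for _ in range(G)]
b1b = [nn.Conv2d(d, C4, 1, bias=False) for _ in range(G)]
y_branches = sum(b1b[g](b3[g](b1[g](x))) for g in range(G))

# Merged form: 1x1 with G*d outputs, grouped 3x3, 1x1 with G*d inputs.
first = nn.Conv2d(C4, G * d, 1, bias=False)
middle = nn.Conv2d(G * d, G * d, 3, padding=1, groups=G, bias=False)
last = nn.Conv2d(G * d, C4, 1, bias=False)
with torch.no_grad():
    first.weight.copy_(torch.cat([c.weight for c in b1], dim=0))
    middle.weight.copy_(torch.cat([c.weight for c in b3], dim=0))
    last.weight.copy_(torch.cat([c.weight for c in b1b], dim=1))
y_merged = last(middle(first(x)))

print(torch.allclose(y_branches, y_merged, atol=1e-4))  # True
\end{verbatim}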
\end{description}
\end{description}
\subsection{Properties}
The following holds:
\begin{itemize}
\item It has been empirically observed that, at the same FLOPs, it is better to have more groups (i.e., wider overall activations).
\end{itemize}

View File

@ -162,7 +162,7 @@ Methods that also consider the second-order derivatives when determining the ste
\item[Adaptive learning rates] \marginnote{Adaptive learning rates}
Methods to define per-parameter adaptive learning rates.
Ideally, assuming that the changes in the curvature of the loss are axis-aligned (e.g., in a canyon), it is reasonable to obtain a faster convergence by:
Ideally, assuming that the changes in the curvature of the loss are axis-aligned (i.e., the parameters are independent), it is reasonable to obtain a faster convergence by:
\begin{itemize}
\item Reducing the learning rate along the dimension where the gradient is large.
\item Increasing the learning rate along the dimension where the gradient is small.
@ -227,9 +227,96 @@ Methods that also consider the second-order derivatives when determining the ste
\]
where $\beta \in [0, 1]$ (typically $0.9$ or higher) makes $s^{(t)}$ an exponential moving average.
\begin{remark}
RMSProp is faster than SGD at the beginning, then slows down and reaches a performance similar to SGD.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/rmsprop.png}
\includegraphics[width=0.35\linewidth]{./img/rmsprop.png}
\caption{SGD vs AdaGrad vs RMSProp}
\end{figure}
\end{description}
\subsection{Adam}
\begin{description}
\item[Adaptive moments (Adam)] \marginnote{Adaptive moments (Adam)}
Extends RMSProp by also keeping a running average of the gradients:
\[
\begin{split}
\vec{g}^{(t+1)} &= \beta_1 \vec{g}^{(t)} + (1-\beta_1) \nabla\mathcal{L}(\vec{\theta}^{(t)}) \\
\vec{s}^{(t+1)} &= \beta_2 \vec{s}^{(t)} + (1-\beta_2) \nabla\mathcal{L}(\vec{\theta}^{(t)}) \odot \nabla\mathcal{L}(\vec{\theta}^{(t)}) \\
\end{split}
\]
where $\beta_1, \beta_2 \in [0, 1]$ (typically $\beta_1 = 0.9$ and $\beta_2 = 0.999$).
Moreover, as $\vec{g}^{(0)} = 0$, $\vec{s}^{(0)} = 0$, and $\beta_1$, $\beta_2$ are typically large (i.e., past history weighs more), Adam starts by taking very small steps (e.g., $\vec{g}^{(1)} = (1-\beta_1) \nabla\mathcal{L}(\vec{\theta}^{(0)})$ is just the gradient rescaled towards zero). To compensate for this bias, debiased estimates of $\vec{g}$ and $\vec{s}$ are used:
\[
\vec{g}^{(t+1)}_{\text{debiased}} = \frac{\vec{g}^{(t+1)}}{1-\beta_1^{t+1}}
\qquad
\vec{s}^{(t+1)}_{\text{debiased}} = \frac{\vec{s}^{(t+1)}}{1-\beta_2^{t+1}}
\]
where the denominator $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.
Finally, the update is defined as:
\[
\vec{\theta}^{(t+1)} = \vec{\theta}^{(t)} - \frac{\texttt{lr}}{\sqrt{\vec{s}^{(t+1)}_{\text{debiased}}} + \varepsilon} \odot \vec{g}^{(t+1)}_{\text{debiased}}
\]
\begin{remark}
It can be shown that $\frac{\vec{g}^{(t+1)}_{\text{debiased}}}{\sqrt{\vec{s}^{(t+1)}_{\text{debiased}}}}$ is bounded in magnitude, making the update step more controlled than in RMSProp.
\end{remark}
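A minimal NumPy sketch of the Adam update on a toy quadratic loss (hyperparameters are the typical values mentioned above; the loss is a hypothetical example):
\begin{verbatim}
import numpy as np

A = np.diag([10.0, 1.0])            # loss: L(theta) = 0.5 * theta^T A theta
grad = lambda theta: A @ theta      # its gradient

lr, beta1, beta2, eps = 0.1, 0.9, 0.999, 1e-8
theta = np.array([1.0, 1.0])
g, s = np.zeros_like(theta), np.zeros_like(theta)

for t in range(100):
    grad_t = grad(theta)
    g = beta1 * g + (1 - beta1) * grad_t        # running average of gradients
    s = beta2 * s + (1 - beta2) * grad_t**2     # running average of squared gradients
    g_hat = g / (1 - beta1**(t + 1))            # bias correction
    s_hat = s / (1 - beta2**(t + 1))
    theta = theta - lr * g_hat / (np.sqrt(s_hat) + eps)

print(theta)  # approaches the minimum at [0, 0]
\end{verbatim}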
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/adam.png}
\caption{SGD vs AdaGrad vs RMSProp vs Adam}
\end{figure}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/adam_noisy.png}
\caption{SGD vs AdaGrad vs RMSProp vs Adam with a smaller batch size}
\end{figure}
\begin{remark}
Adam is based on the assumption of independent parameters (i.e., axis-aligned curvature). If this does not actually hold, it might converge more slowly.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/optimizers_no_align.png}
\end{figure}
\end{remark}
\end{description}
\begin{remark}
Empirically, in computer vision, SGD with Nesterov momentum (properly tuned) works better than Adam.
\end{remark}
\begin{remark}
Momentum-based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape'' small basins.
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/momentum_local_global.png}
\end{figure}
\end{remark}
\subsection{AdamW}
\begin{description}
\item[Adam with weight decay (AdamW)] \marginnote{Adam with weight decay (AdamW)}
Modification of the Adam update to include weight decay:
\[
\vec{\theta}^{(t+1)} = \vec{\theta}^{(t)} - \frac{\texttt{lr}}{\sqrt{\vec{s}^{(t+1)}_{\text{debiased}}} + \varepsilon} \odot \vec{g}^{(t+1)}_{\text{debiased}} - \lambda\vec{\theta}^{(t)}
\]
\begin{remark}
Differently from SGD, for Adam L2 regularization is not equivalent to weight decay. In fact, the L2 regularization term enters the gradient (and therefore the running averages) instead of being applied directly to the update step:
\[ \nabla_{\text{actual}}\mathcal{L}(\vec{\theta}^{(t)}) = \nabla\mathcal{L}(\vec{\theta}^{(t)}) + \lambda\vec{\theta}^{(t)} \]
where $\nabla_{\text{actual}}\mathcal{L}(\vec{\theta}^{(t)})$ is the actual gradient used to compute the running averages $\vec{g}$ and $\vec{s}$.
\end{remark}
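A sketch of a single AdamW step (continuing the NumPy notation above; \texttt{wd} plays the role of $\lambda$), highlighting that the decay acts directly on the parameters and never enters the running averages:
\begin{verbatim}
import numpy as np

def adamw_step(theta, g, s, grad_t, t,
               lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, wd=1e-2):
    # Running averages use the plain gradient (no L2 term added to grad_t).
    g = beta1 * g + (1 - beta1) * grad_t
    s = beta2 * s + (1 - beta2) * grad_t**2
    g_hat = g / (1 - beta1**(t + 1))
    s_hat = s / (1 - beta2**(t + 1))
    # Weight decay is applied directly to the parameters (decoupled from Adam).
    theta = theta - lr * g_hat / (np.sqrt(s_hat) + eps) - wd * theta
    return theta, g, s

theta, g, s = np.ones(2), np.zeros(2), np.zeros(2)
theta, g, s = adamw_step(theta, g, s, grad_t=np.array([0.3, -0.1]), t=0)
print(theta)
\end{verbatim}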
\end{description}