Add IPCV2 AlexNet
@@ -6,7 +6,7 @@
\usepackage{geometry}
\usepackage{graphicx, xcolor}
\usepackage{amsmath, amsfonts, amssymb, amsthm, mathtools, bm, upgreek, cancel, bbm}
\usepackage{amsmath, amsfonts, amssymb, amsthm, mathtools, bm, upgreek, cancel, bbm, siunitx}
\usepackage[bottom]{footmisc}
\usepackage[pdfusetitle]{hyperref}
\usepackage[nameinlink]{cleveref}

@@ -4,11 +4,15 @@
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}

\newcommand*\rot{\rotatebox{90}}


\begin{document}

\makenotesfront

\input{./sections/_image_formation.tex}
\input{./sections/_classification.tex}
\input{./sections/_architectures.tex}

\end{document}

@@ -0,0 +1,209 @@
\chapter{Successful architectures}


\section{Preliminaries}

\begin{description}
\item[Stem layer] \marginnote{Stem layer}
First convolutional layer of a CNN, which aims to reduce the spatial size of the activations for memory and computational reasons,
but also to rapidly increase the receptive field.

\item[Model parallelism] \marginnote{Model parallelism}
Train a model by splitting its weights across multiple computational units, each receiving the same data.

\item[Data parallelism] \marginnote{Data parallelism}
Train a model by distributing the data across multiple computational units, each holding a copy of the weights of the model.

\item[Overlapping pooling] \marginnote{Overlapping pooling}
Pooling layer whose kernel size and stride are chosen in such a way that
some pixels at a step have also been considered at the previous one (e.g. $3 \times 3$ kernel with stride $2$).

\item[Parameters computation] \marginnote{Parameters computation}
\phantom{}
\begin{description}
\item[Input layer]
Given an input image of shape $W_\text{in} \times H_\text{in} \times C_\text{in}$,
the number of activations in the input layer is given by:
\[ \texttt{\#activations} = W_\text{in} \cdot H_\text{in} \cdot C_\text{in} \]

% As data is processed in batches of size $B$, the actual number of activations is:
% \[ \texttt{\#activations\_batch} = B \cdot \texttt{\#activations} \]

\item[Convolutional layer]
Given:
\begin{itemize}
\item An input of shape $W_\text{in} \times H_\text{in} \times C_\text{in}$,
\item A kernel of shape $W_\text{K} \times H_\text{K}$ with padding $P$ and stride $S$,
\item A desired number of output channels $C_\text{out}$,
\end{itemize}
the number of parameters of a convolutional layer (see \Cref{sec:conv_layer}) is given by:
\[ \texttt{\#params} = \big( (W_\text{K} \cdot H_\text{K} \cdot C_\text{in}) + 1 \big) \cdot C_\text{out} \]

The output shape (see \Cref{sec:conv_layer}) and the number of activations are given by:
\[
\begin{gathered}
\texttt{activ\_w} = \left\lfloor \frac{W_\text{in} - W_\text{K} + 2P}{S} \right\rfloor + 1 \hspace{2em}
\texttt{activ\_h} = \left\lfloor \frac{H_\text{in} - H_\text{K} + 2P}{S} \right\rfloor + 1 \\
\texttt{\#activations} = \texttt{activ\_w} \cdot \texttt{activ\_h} \cdot C_\text{out}
\end{gathered}
\]

The number of FLOPs for a single example of the batch (counting a multiplication and an addition for each kernel weight applied to each output activation) is given by:
\[ \texttt{FLOPs} = \texttt{\#activations} \cdot (W_\text{K} \cdot H_\text{K} \cdot C_\text{in}) \cdot 2 \]

\item[Pooling layer]
Given:
\begin{itemize}
\item An input of shape $W_\text{in} \times H_\text{in} \times C_\text{in}$,
\item A kernel of shape $W_\text{K} \times H_\text{K}$ with padding $P$ and stride $S$,
\end{itemize}
the number of activations in a pooling layer is computed as above with $C_\text{in} = C_\text{out}$:
\[ \texttt{\#activations} = \texttt{activ\_w} \cdot \texttt{activ\_h} \cdot C_\text{in} \]

The number of FLOPs for a single example of the batch is given by:
\[ \texttt{FLOPs} = \texttt{\#activations} \cdot (W_\text{K} \cdot H_\text{K}) \]

\item[Fully-connected layer]
Given:
\begin{itemize}
\item An activation of shape $C_\text{in}$,
\item The number of neurons $C_\text{out} = \texttt{\#activations}$,
\end{itemize}
the number of parameters of a fully-connected layer is:
\[ \texttt{\#params} = (C_\text{in} \cdot C_\text{out}) + C_\text{out} \]

The number of FLOPs for a single example of the batch is given by:
\[ \texttt{FLOPs} = 2 \cdot C_\text{in} \cdot C_\text{out} \]

\item[Memory usage]
Given:
\begin{itemize}
\item The batch size $B$,
\item The activation size $\texttt{\#activations}$,
\item The number of parameters $\texttt{\#params}$,
\end{itemize}
to estimate the memory consumption, the following have to be taken into account:
\begin{itemize}
\item To compute the gradient of the loss, every intermediate activation for every example in the batch has to be stored.
\item For every parameter, we have to store its value and the gradient of the loss w.r.t. it.
\item Optimizers with momentum need to store additional values per parameter.
\end{itemize}

It is therefore hard to estimate memory requirements exactly.
A rule of thumb estimates a lower bound as twice the activation size and 3--4 times the number of parameters.
Assuming that \texttt{float32} values (4 bytes each) are used, memory consumption can be estimated as:
\[
\begin{gathered}
\texttt{memory\_activ\_bytes} = 2 \cdot (B \cdot \texttt{\#activations} \cdot 4) \\
\texttt{memory\_params\_bytes} = 3 \cdot (\texttt{\#params} \cdot 4)
\end{gathered}
\]
\end{description}
\end{description}
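
As a sanity check of the formulas above, consider the first convolutional layer of AlexNet (input $227 \times 227 \times 3$, $96$ kernels of shape $11 \times 11$, stride $4$, no padding, batch size $128$; see the AlexNet table later in this chapter):
\[
\begin{gathered}
\texttt{activ\_w} = \texttt{activ\_h} = \left\lfloor \frac{227 - 11 + 0}{4} \right\rfloor + 1 = 55
\hspace{2em}
\texttt{\#activations} = 55 \cdot 55 \cdot 96 = \num{290400} \\
\texttt{\#params} = \big( (11 \cdot 11 \cdot 3) + 1 \big) \cdot 96 = \num{34944} \approx \num{35} \text{ K} \\
\texttt{FLOPs} = \num{290400} \cdot (11 \cdot 11 \cdot 3) \cdot 2 \approx \num{210.8} \text{ M per example} \\
\texttt{memory\_activ\_bytes} = 2 \cdot (128 \cdot \num{290400} \cdot 4) \approx \num{283.6} \text{ MB}
\hspace{2em}
\texttt{memory\_params\_bytes} = 3 \cdot (\num{34944} \cdot 4) \approx \num{0.4} \text{ MB}
\end{gathered}
\]
Over the whole batch, this amounts to $\approx \num{26986}$ MFLOPs, matching the \texttt{conv1} row of the AlexNet table.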


\section{LeNet-5}
\marginnote{LeNet-5}

LeNet-5 is one of the first convolutional neural networks.
The network has the following properties:
\begin{itemize}
\item At each layer, the number of channels increases and the spatial dimension decreases.
\item Convolutions use $5 \times 5$ kernels with no padding, and down-sampling is performed through average pooling.
\item \texttt{Sigmoid} and \texttt{tanh} activation functions are used, with carefully selected amplitudes (i.e. they are scaled).
\item The last layers are fully connected, with a radial basis function as the output activation.
\item There are no residual connections or normalization layers.
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/lenet5.png}
\end{figure}
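
For reference, the following is a minimal PyTorch sketch of a LeNet-5-like network (PyTorch and the class name \texttt{LeNet5} are assumptions of this example, not part of the original notes; the carefully scaled activations are replaced by plain \texttt{tanh} and the RBF output by a linear layer):
\begin{verbatim}
import torch
from torch import nn

class LeNet5(nn.Module):
    # Simplified sketch: plain tanh instead of the scaled activations,
    # linear head instead of the original RBF output units.
    def __init__(self, num_classes: int = 10):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 6, kernel_size=5),     # 1x32x32 -> 6x28x28
            nn.Tanh(),
            nn.AvgPool2d(2, stride=2),          # -> 6x14x14
            nn.Conv2d(6, 16, kernel_size=5),    # -> 16x10x10
            nn.Tanh(),
            nn.AvgPool2d(2, stride=2),          # -> 16x5x5
            nn.Conv2d(16, 120, kernel_size=5),  # -> 120x1x1
            nn.Tanh(),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),                       # -> 120
            nn.Linear(120, 84),
            nn.Tanh(),
            nn.Linear(84, num_classes),         # original: RBF output units
        )

    def forward(self, x):
        return self.classifier(self.features(x))

# LeNet5()(torch.randn(1, 1, 32, 32)).shape == (1, 10)
\end{verbatim}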


\section{AlexNet}
\marginnote{AlexNet}

AlexNet is the first CNN that broke the stagnation of the field, winning the ILSVRC 2012 challenge by a large margin.


\subsection{Architecture}

AlexNet is composed of:
\begin{itemize}
\item 5 convolutional layers (convolution + \texttt{ReLU}, sometimes followed by max-pooling).
\item 3 fully-connected layers.
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/alexnet.png}
\end{figure}
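
A minimal single-GPU PyTorch sketch following the layer table in the Properties subsection is reported below (PyTorch and the class name \texttt{AlexNetSketch} are assumptions of this example; grouped convolutions, local response normalization and dropout are omitted for simplicity):
\begin{verbatim}
import torch
from torch import nn

class AlexNetSketch(nn.Module):
    # Layer hyper-parameters taken from the AlexNet table (input 227x227x3).
    def __init__(self, num_classes: int = 1000):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4),    # conv1: 227 -> 55
            nn.ReLU(),
            nn.MaxPool2d(3, stride=2),                     # pool1: 55 -> 27
            nn.Conv2d(96, 256, kernel_size=5, padding=2),  # conv2: 27 -> 27
            nn.ReLU(),
            nn.MaxPool2d(3, stride=2),                     # pool2: 27 -> 13
            nn.Conv2d(256, 384, kernel_size=3, padding=1), # conv3
            nn.ReLU(),
            nn.Conv2d(384, 384, kernel_size=3, padding=1), # conv4
            nn.ReLU(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1), # conv5
            nn.ReLU(),
            nn.MaxPool2d(3, stride=2),                     # pool3: 13 -> 6
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),                      # 6*6*256 = 9216
            nn.Linear(9216, 4096), nn.ReLU(),  # fc6
            nn.Linear(4096, 4096), nn.ReLU(),  # fc7
            nn.Linear(4096, num_classes),      # fc8
        )

    def forward(self, x):
        return self.classifier(self.features(x))

# AlexNetSketch()(torch.randn(1, 3, 227, 227)).shape == (1, 1000)
\end{verbatim}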


\subsection{Training}

The network was trained on ImageNet-1k (for the ILSVRC 2012 challenge) with a batch size of 128.
Due to GPU memory limitations, training was split into two parallel branches on two GPUs (model parallelism).

\begin{description}
\item[Grouped convolution] \marginnote{Grouped convolution}
A convolution is split into two sets of weights, each processing a subset of the input channels independently on its own computational unit.
At some steps (e.g. \texttt{conv3}), the two GPUs are allowed to communicate.
In those layers, the result of the operation is virtually the same as if it had been computed by a single computational unit (i.e. the operation is done on the full set of weights).
\end{description}
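
As an illustration (again assuming PyTorch, which is not used in the original notes), a grouped convolution corresponds to the \texttt{groups} argument of \texttt{nn.Conv2d}: with two groups, each filter sees only half of the input channels and the layer holds half of the weights, while the output shape is unchanged.
\begin{verbatim}
import torch
from torch import nn

# conv2 of AlexNet as a grouped convolution: each of the 256 filters
# sees only 48 of the 96 input channels (the half on its own GPU).
grouped = nn.Conv2d(96, 256, kernel_size=5, padding=2, groups=2)
full = nn.Conv2d(96, 256, kernel_size=5, padding=2)

x = torch.randn(1, 96, 27, 27)
print(grouped(x).shape)      # torch.Size([1, 256, 27, 27]), same as full(x)
print(grouped.weight.shape)  # torch.Size([256, 48, 5, 5]), half the weights
print(full.weight.shape)     # torch.Size([256, 96, 5, 5])
\end{verbatim}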

\begin{remark}
At the time, training took 5--6 days on two Nvidia GTX 580 GPUs.
\end{remark}


\subsection{Properties}

AlexNet exhibits the following trends:
\begin{itemize}
\item The first convolutional layer is a stem layer.
\item The majority of the parameters are in the fully-connected layers, even though these layers produce small activations (at most 4096 elements).
\item The largest memory consumption for activations is at the first convolutional layer.
\item The largest number of FLOPs is required by the convolutional layers.
\item The total number of parameters and the total number of activations at training time are of comparable magnitude.
\item $2.2$ GFLOPs are required to process an image at training time.
\end{itemize}

\begin{table}[H]
\centering
\caption{Parameters of AlexNet (batch size of 128)}
\small
\begin{tabular}{cccccccccccc}
\toprule
\multirow{2}[20]{*}{\textbf{Layer}}
& \multicolumn{4}{c}{\textbf{Convolution}}
& \multicolumn{3}{c}{\textbf{Single activation}}
& \multirow{2}[20]{*}{\texttt{\#params}}
& \multicolumn{3}{c}{\textbf{Batch requirements}} \\
\cmidrule(lr){2-5} \cmidrule(lr){6-8} \cmidrule(lr){10-12}
& \rot{\textbf{Channels}} & \rot{\textbf{H/W}} & \rot{\textbf{Stride}} & \rot{\textbf{Padding}}
& \rot{\textbf{H/W}} & \rot{\textbf{Channels}} & \rot{\texttt{\#activ.}}
&
& \rot{\textbf{MFLOPs}} & \rot{\textbf{Activ. mem.}} & \rot{\textbf{Params mem.}} \\
\midrule
\texttt{input} & & & & & \num{227} & \num{3} & \num{154587} & \num{0} & -- & \num{75.5} MB & \num{0.0} \\
\texttt{conv1} & \num{96} & \num{11} & \num{4} & \num{0} & \num{55} & \num{96} & \num{290400} & \num{35} K & \num{26986.3} & \num{283.6} MB & \num{0.4} MB \\
\texttt{pool1} & \num{1} & \num{3} & \num{2} & \num{0} & \num{27} & \num{96} & \num{69984} & \num{0} & \num{80.6} & \num{68.3} MB & \num{0.0} \\
\texttt{conv2} & \num{256} & \num{5} & \num{1} & \num{2} & \num{27} & \num{256} & \num{186624} & \num{615} K & \num{114661.8} & \num{182.3} MB & \num{7.0} MB \\
\texttt{pool2} & \num{1} & \num{3} & \num{2} & \num{0} & \num{13} & \num{256} & \num{43264} & \num{0} & \num{49.8} & \num{42.3} MB & \num{0.0} \\
\texttt{conv3} & \num{384} & \num{3} & \num{1} & \num{1} & \num{13} & \num{384} & \num{64896} & \num{885} K & \num{38277.2} & \num{63.4} MB & \num{10.1} MB \\
\texttt{conv4} & \num{384} & \num{3} & \num{1} & \num{1} & \num{13} & \num{384} & \num{64896} & \num{1327} K & \num{57415.8} & \num{63.4} MB & \num{15.2} MB \\
\texttt{conv5} & \num{256} & \num{3} & \num{1} & \num{1} & \num{13} & \num{256} & \num{43264} & \num{885} K & \num{38277.2} & \num{42.3} MB & \num{10.1} MB \\
\texttt{pool3} & \num{1} & \num{3} & \num{2} & \num{0} & \num{6} & \num{256} & \num{9216} & \num{0} & \num{10.6} & \num{9.0} MB & \num{0.0} \\
\texttt{flatten} & \num{0} & \num{0} & \num{0} & \num{0} & \num{1} & \num{9216} & \num{9216} & \num{0} & \num{0.0} & \num{0.0} & \num{0.0} \\
\texttt{fc6} & \num{4096} & \num{1} & \num{1} & \num{0} & \num{1} & \num{4096} & \num{4096} & \num{37758} K & \num{9663.7} & \num{4.0} MB & \num{432.0} MB \\
\texttt{fc7} & \num{4096} & \num{1} & \num{1} & \num{0} & \num{1} & \num{4096} & \num{4096} & \num{16781} K & \num{4295.0} & \num{4.0} MB & \num{192.0} MB \\
\texttt{fc8} & \num{1000} & \num{1} & \num{1} & \num{0} & \num{1} & \num{1000} & \num{1000} & \num{4097} K & \num{1048.6} & \num{1.0} MB & \num{46.9} MB \\
\midrule
&&&&&&& \textbf{Total} & \num{62378} K & \num{290851} & \num{1.406} MB & \num{714} MB \\
\bottomrule
\end{tabular}
\end{table}

@@ -465,7 +465,7 @@ Image filtering can be implemented through:
\end{descriptionlist}


\subsection{Convolutional layer}
\subsection{Convolutional layer} \label{sec:conv_layer}

\begin{description}
\item[Multi-channel convolution] \marginnote{Multi-channel convolution}
@@ -688,7 +688,7 @@ Normalize the output of a layer during training in such a way that it has zero m
The advantages of batch normalization are:
\begin{itemize}
\item It allows the use of a higher learning rate and makes initialization less important.
\item Training becomes non-deterministic, introducing some regularization.
\item Training becomes non-deterministic (i.e. adds noise) acting as some form of regularization.
\item During inference, there is no overhead as it can be merged with the previous layer.
\end{itemize}
The disadvantages are: