\chapter{Image classification}
|
|
|
|
|
|
\section{Supervised datasets}
|
|
|
|
\begin{description}
|
|
\item[Dataset] \marginnote{Dataset}
|
|
Given a set of labeled data, it can be split into:
|
|
\begin{descriptionlist}
|
|
\item[Train set] $D^\text{train} = \{ (\text{x}_\text{train}^{(i)}, y_\text{train}^{(i)}) \mid i = 1, \dots, N \}$.
|
|
\item[Test set] $D^\text{test} = \{ (\text{x}_\text{test}^{(i)}, y_\text{test}^{(i)}) \mid i = 1, \dots, M \}$.
|
|
\end{descriptionlist}
|
|
|
|
It is assumed that the two sets contain i.i.d. samples drawn from the same unknown distribution.
|
|
\end{description}
|
|
|
|
|
|
\subsection{Modified NIST (MNIST)}
|
|
|
|
\begin{minipage}{0.45\linewidth}
|
|
\centering
|
|
\includegraphics[width=0.9\linewidth]{./img/mnist.png}
|
|
\end{minipage}
|
|
\begin{minipage}{0.5\linewidth}
|
|
\begin{descriptionlist}
|
|
\item[Content] Handwritten digits from 0 to 9.
|
|
\item[Number of classes] 10.
|
|
\item[Train set size] 60k.
|
|
\item[Test set size] 10k.
|
|
\item[Image format] $28 \times 28$ grayscale.
|
|
\end{descriptionlist}
|
|
\end{minipage}
|
|
|
|
|
|
\subsection{CIFAR10}
|
|
|
|
\begin{minipage}{0.45\linewidth}
|
|
\centering
|
|
\includegraphics[width=0.9\linewidth]{./img/cifar10.png}
|
|
\end{minipage}
|
|
\begin{minipage}{0.5\linewidth}
|
|
\begin{descriptionlist}
|
|
\item[Content] Objects of various categories.
|
|
\item[Number of classes] 10.
|
|
\item[Train set size] 50k.
|
|
\item[Test set size] 10k.
|
|
\item[Image size] $32 \times 32$ RGB.
|
|
\end{descriptionlist}
|
|
\end{minipage}
|
|
|
|
|
|
\subsection{CIFAR100}
|
|
|
|
\begin{minipage}{0.45\linewidth}
|
|
\centering
|
|
\includegraphics[width=0.7\linewidth]{./img/cifar100.png}
|
|
\end{minipage}
|
|
\begin{minipage}{0.5\linewidth}
|
|
\begin{descriptionlist}
|
|
\item[Content] Objects of various categories.
|
|
\item[Number of classes] 100 (20 super-classes, each with 5 sub-classes).
|
|
\item[Train set size] 50k.
|
|
\item[Test set size] 10k.
|
|
\item[Image size] $32 \times 32$ RGB.
|
|
\end{descriptionlist}
|
|
\end{minipage}
|
|
|
|
|
|
\subsection{ImageNet 21k}
|
|
|
|
\begin{descriptionlist}
|
|
\item[Content] Objects of various categories.
|
|
\item[Number of classes] 21k synsets from WordNet organized hierarchically.
|
|
\item[Dataset size] 14 million images.
|
|
\item[Image size] Variable resolution RGB. Average size of $400 \times 350$.
|
|
\end{descriptionlist}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.85\linewidth]{./img/imagenet21k.png}
|
|
\end{figure}
|
|
|
|
|
|
\subsection{ImageNet 1k}
|
|
|
|
\begin{minipage}{0.45\linewidth}
|
|
\centering
|
|
\includegraphics[width=\linewidth]{./img/imagenet1k.png}
|
|
\end{minipage}
|
|
\begin{minipage}{0.5\linewidth}
|
|
\begin{descriptionlist}
|
|
\item[Content] Objects of various categories.
|
|
\item[Number of classes] 1000.
|
|
\item[Train set size] $1.3$ million.
|
|
\item[Validation set size] 50k.
|
|
\item[Test set size] 100k.
|
|
\item[Image size] Variable resolution RGB. Often resized to $256 \times 256$.
|
|
\end{descriptionlist}
|
|
\end{minipage}
|
|
|
|
\begin{remark}
|
|
Performance is usually measured as top-5 accuracy since a single prediction might be ambiguous: an image can contain multiple objects.
|
|
\end{remark}
|
|
|
|
|
|
|
|
\section{Learning}
|
|
|
|
\begin{description}
|
|
\item[Learning problem] \marginnote{Learning problem}
|
|
Find the best model $h^*$ from the hypothesis space $\mathbb{H}$ that minimizes a loss function $\mathcal{L}$:
|
|
\[ h^* = \arg\min_{h \in \mathbb{H}} \mathcal{L}(h, \matr{D}^\text{train}) \]
|
|
|
|
In machine learning, models are usually parametrized. The problem then becomes finding the best set of parameters $\matr{\theta}^*$ from the parameter space $\Theta$:
|
|
\[ \matr{\theta}^* = \arg\min_{\matr{\theta} \in \Theta} \mathcal{L}(\matr{\theta}, \matr{D}^\text{train}) \]
|
|
\end{description}
|
|
|
|
|
|
\subsection{Loss function}
|
|
|
|
\begin{description}
|
|
\item[Loss function] \marginnote{Loss function}
|
|
An easy-to-optimize function that acts as a proxy for measuring the goodness of a model.
|
|
|
|
The loss on a dataset is usually computed as the average of the per-sample losses:
|
|
\[ \mathcal{L}(\matr{\theta}, \matr{D}^\text{train}) = \frac{1}{N} \sum_{i=1}^{N} \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) \]
|
|
|
|
|
|
\item[0-1 loss] \marginnote{0-1 loss}
|
|
Loss computed, for a single sample, as an indicator of misclassification:
\[ \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) = \begin{cases} 1 & \text{if } f(\vec{x}^{(i)}; \matr{\theta}) \neq y^{(i)} \\ 0 & \text{otherwise} \end{cases} \]
Summed over the dataset, it counts the number of misclassifications.
|
|
|
|
This loss is not ideal as it is insensitive to small (or even large) changes in the parameters.
|
|
Moreover, it does not indicate in which direction the parameters should be modified to reduce the loss.
|
|
|
|
\begin{remark}
|
|
This loss can be minimized using a combinatorial optimization approach but it does not scale well with large datasets.
|
|
\end{remark}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.3\linewidth]{./img/01_loss_spam.png}
|
|
\caption{\parbox[t]{0.7\linewidth}{
|
|
Example of linear classifier for spam detection.
|
|
Small changes on the boundary line do not change the 0-1 loss.
|
|
The loss itself does not indicate in which direction the line should be moved.
|
|
}}
|
|
\end{figure}
|
|
|
|
|
|
\item[Root mean square error] \marginnote{Root mean square error}
|
|
Loss computed as the Euclidean distance between the prediction and the target label:
|
|
\[ \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) = \Vert f(\vec{x}^{(i)}; \matr{\theta}) - y^{(i)} \Vert_2 \]
|
|
Note that $y^{(i)}$ might be encoded (e.g. one-hot).
|
|
|
|
|
|
\item[Cross-entropy loss] \marginnote{Cross-entropy loss}
|
|
Transform the logits of a model into a probability distribution and estimate the parameters through MLE.
|
|
|
|
\begin{descriptionlist}
|
|
\item[Softmax] \marginnote{Softmax}
|
|
Function that converts its input into a probability distribution.
|
|
Given the logits $\vec{s} \in \mathbb{R}^{c}$, the score $\vec{s}_j$ of class $j$ is converted into a probability as follows:
|
|
\[
|
|
\mathcal{P}_\text{model}(Y = j | X = \vec{x}^{(i)}; \matr{\theta}) =
|
|
\texttt{softmax}_j(\vec{s}) =
|
|
\frac{\exp(\vec{s}_j)}{\sum_{k=1}^{c} \exp(\vec{s}_k)}
|
|
\]
|
|
|
|
For numerical stability, \texttt{softmax} is usually computed as:
|
|
\[
|
|
\begin{split}
|
|
\texttt{softmax}_j(\vec{s} - \max\{ \vec{s} \}) &= \frac{\exp(\vec{s}_j - \max\{ \vec{s} \})}{\sum_{k=1}^{c} \exp(\vec{s}_k - \max\{ \vec{s} \})} \\
|
|
&= \frac{\cancel{\exp(- \max\{ \vec{s} \})}\exp(\vec{s}_j)}{\cancel{\exp(- \max\{ \vec{s} \})}\sum_{k=1}^{c} \exp(\vec{s}_k)} = \texttt{softmax}_j(\vec{s})
|
|
\end{split}
|
|
\]
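
A minimal NumPy sketch of the numerically stable \texttt{softmax} (function name and shapes are illustrative):
\begin{verbatim}
import numpy as np

def softmax(s):
    # Subtracting the maximum logit leaves the result unchanged
    # but prevents exp() from overflowing.
    z = s - np.max(s)
    e = np.exp(z)
    return e / np.sum(e)

logits = np.array([2.0, 1.0, 0.1])
print(softmax(logits))  # probabilities summing to 1
\end{verbatim}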
|
|
|
|
\item[Maximum likelihood estimation] \marginnote{Maximum likelihood estimation}
|
|
Use MLE to estimate the parameters on the probability distribution outputted by the \texttt{softmax} function:
|
|
\[
|
|
\begin{split}
|
|
\matr{\theta}^* &= \arg\max_\matr{\theta} \mathcal{P}_\text{model}(y^{(1)}, \dots, y^{(N)} | \vec{x}^{(1)}, \dots, \vec{x}^{(N)}; \matr{\theta}) \\
|
|
&= \arg\max_\matr{\theta} \prod_{i=1}^{N} \mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
|
|
&= \arg\max_\matr{\theta} \sum_{i=1}^{N} \log\mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
|
|
&= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\log\mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
|
|
&= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\log\left( \frac{\exp(\vec{s}_{y^{(i)}})}{\sum_{k=1}^{c} \exp(\vec{s}_k)} \right) \\
|
|
&= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\log\left( \exp(\vec{s}_{y^{(i)}}) \right) + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right) \\
|
|
&= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\vec{s}_{y^{(i)}} + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right) \\
|
|
\end{split}
|
|
\]
|
|
|
|
The second term ($\log\left( \sum_{k=1}^{c} \exp(\vec{s}_k)\right)$) is called \texttt{logsumexp} and approximates the max function.
|
|
Therefore, the loss can be seen as:
|
|
\[
|
|
\mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big)
|
|
= -\vec{s}_{y^{(i)}} + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right)
|
|
\approx -\vec{s}_{y^{(i)}} + \max\{ \vec{s} \}
|
|
\]
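
The per-sample loss $-\vec{s}_{y^{(i)}} + \texttt{logsumexp}(\vec{s})$ can be sketched in NumPy as follows (illustrative code; the shift by the maximum logit keeps \texttt{logsumexp} numerically stable):
\begin{verbatim}
import numpy as np

def cross_entropy(s, y):
    # s: logits of one sample, y: index of the true class.
    m = np.max(s)
    logsumexp = m + np.log(np.sum(np.exp(s - m)))
    return -s[y] + logsumexp

s = np.array([2.0, 1.0, 0.1])
print(cross_entropy(s, y=0))  # equals -log(softmax(s)[0])
\end{verbatim}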
|
|
\end{descriptionlist}
|
|
|
|
\end{description}
|
|
|
|
|
|
\subsection{Gradient descent}
|
|
|
|
\begin{description}
|
|
\item[Gradient descent] \marginnote{Gradient descent}
|
|
An epoch $e$ of gradient descent does the following:
|
|
\begin{enumerate}
|
|
\item Classify all training data to obtain the predictions $\hat{y}^{(i)} = f(\vec{x}^{(i)}; \matr{\theta}^{(e-1)})$
|
|
and the loss $\mathcal{L}(\matr{\theta}^{(e-1)}, \matr{D}^\text{train})$.
|
|
\item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}} (\matr{\theta}^{(e-1)}, \matr{D}^\text{train})$.
|
|
\item Update the parameters $\matr{\theta}^{(e)} = \matr{\theta}^{(e-1)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
|
|
\end{enumerate}
|
|
|
|
\item[Stochastic gradient descent] \marginnote{Stochastic gradient descent}
|
|
Reduce the computational cost of gradient descent by computing the gradient on a single sample at a time.
|
|
An epoch $e$ of SGD does the following:
|
|
\begin{enumerate}
|
|
\item Shuffle the training data $\matr{D}^\text{train}$.
|
|
\item For $i = 0, \dots, N-1$:
|
|
\begin{enumerate}
|
|
\item Classify $\vec{x}^{(i)}$ to obtain the prediction $\hat{y}^{(i)} = f(\vec{x}^{(i)}; \matr{\theta}^{(e*N+i)})$
|
|
and the loss $\mathcal{L}\big( \matr{\theta}^{(e*N+i)}, (\vec{x}^{(i)}, y^{(i)}) \big)$.
|
|
\item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}}\big( \matr{\theta}^{(e*N+i)}, (\vec{x}^{(i)}, y^{(i)}) \big)$.
|
|
\item Update the parameters $\matr{\theta}^{(e*N+i+1)} = \matr{\theta}^{(e*N+i)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
|
|
\end{enumerate}
|
|
\end{enumerate}
|
|
|
|
\item[SGD with mini-batches] \marginnote{SGD with mini-batches}
|
|
Improve the accuracy of SGD updates by computing the gradient on a mini-batch of samples.
|
|
An epoch $e$ of SGD with mini-batches of size $B$ does the following:
|
|
\begin{enumerate}
|
|
\item Shuffle the training data $\matr{D}^\text{train}$.
|
|
\item For $u = 0, \dots, U-1$, with $U = \left\lceil \frac{N}{B} \right\rceil$:
|
|
\begin{enumerate}
|
|
\item Classify the examples $\matr{X}^{(u)} = \{ \vec{x}^{(Bu)}, \dots, \vec{x}^{(B(u+1)-1)} \}$
to obtain the predictions $\hat{Y}^{(u)} = f(\matr{X}^{(u)}; \matr{\theta}^{(e*U+u)})$
and the loss $\mathcal{L}\big( \matr{\theta}^{(e*U+u)}, (\matr{X}^{(u)}, Y^{(u)}) \big)$ computed against the true labels $Y^{(u)}$ of the batch.
|
|
\item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}}\big( \matr{\theta}^{(e*U+u)}, (\matr{X}^{(u)}, Y^{(u)}) \big)$.
|
|
\item Update the parameters $\matr{\theta}^{(e*U+u+1)} = \matr{\theta}^{(e*U+u)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
|
|
\end{enumerate}
|
|
\end{enumerate}
|
|
|
|
The following properties generally hold:
|
|
\begin{itemize}
|
|
\item Larger batches provide a smoother estimate of the gradient and better exploit parallel hardware (up to a certain size, beyond which there is no further gain in time).
|
|
\item Smaller batches require more iterations to train but might have a regularization effect for better generalization.
|
|
\end{itemize}
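
A minimal sketch of the mini-batch SGD loop above, assuming a hypothetical \texttt{loss\_and\_grad} function that returns the loss and its gradient on a batch:
\begin{verbatim}
import numpy as np

def sgd_minibatch(theta, X, Y, loss_and_grad, lr=0.1, batch_size=32, epochs=10):
    # X: (N, d) training inputs, Y: (N,) training labels.
    N = X.shape[0]
    for epoch in range(epochs):
        perm = np.random.permutation(N)           # shuffle the training data
        for start in range(0, N, batch_size):
            idx = perm[start:start + batch_size]  # indices of the mini-batch
            loss, grad = loss_and_grad(theta, X[idx], Y[idx])
            theta = theta - lr * grad             # gradient step
    return theta
\end{verbatim}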
|
|
|
|
\item[Gradient computation] \marginnote{Gradient computation}
|
|
Gradients can be computed:
|
|
\begin{descriptionlist}
|
|
\item[Numerically] Slow and approximate but easy to implement.
|
|
\item[Analytically] Using the chain rule.
|
|
\item[Automatically] Using automatic differentiation (e.g. backpropagation).
|
|
\end{descriptionlist}
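
For instance, a numerical gradient based on central differences can be sketched as follows (slow, but useful to check an analytical or automatic gradient; \texttt{loss} is a hypothetical scalar function of the parameters):
\begin{verbatim}
import numpy as np

def numerical_gradient(loss, theta, eps=1e-6):
    # Central finite differences, one parameter at a time.
    grad = np.zeros_like(theta)
    for i in range(theta.size):
        d = np.zeros_like(theta)
        d.flat[i] = eps
        grad.flat[i] = (loss(theta + d) - loss(theta - d)) / (2 * eps)
    return grad
\end{verbatim}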
|
|
\end{description}
|
|
|
|
|
|
|
|
\section{Linear classifier}
|
|
\marginnote{Linear classifier}
|
|
|
|
Determine the class by computing a linear combination of the input.
|
|
|
|
Given $c$ classes and a flattened image $\vec{x} \in \mathbb{R}^{i}$, a linear classifier $f$ parametrized on $\matr{W} \in \mathbb{R}^{c \times i}$ is defined as:
|
|
\[ f(\vec{x}; \matr{W}) = \matr{W}\vec{x} = \texttt{logits} \]
|
|
where the $\texttt{logits} \in \mathbb{R}^{c}$ vector contains a score for each class.
|
|
|
|
The prediction is obtained as the index of the maximum score.
|
|
|
|
\begin{remark}
|
|
Directly predicting an integer-encoded class is not ideal as it would impose a (probably) nonexistent semantic ordering
|
|
(e.g. if $2$ encodes bird and $3$ encodes cat, $2.5$ should not mean half bird and half cat).
|
|
\end{remark}
|
|
|
|
\begin{remark}
|
|
Linear classifiers can be seen as a template-matching method.
|
|
Each row of $\matr{W} \in \mathbb{R}^{c \times i}$ is a class template that is cross-correlated with the image to obtain a score.
|
|
\end{remark}
|
|
|
|
\begin{remark}
|
|
\marginnote{Affine classifier}
|
|
In practice, a linear classifier is actually an affine classifier parametrized on $\theta = (\matr{W} \in \mathbb{R}^{c \times i}, \vec{b} \in \mathbb{R}^{c})$:
|
|
\[ f(\vec{x}; \theta) = \matr{W}\vec{x} + \vec{b} = \texttt{logits} \]
|
|
\end{remark}
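
A sketch of an affine classifier in NumPy (parameters are randomly initialized here for illustration; in practice they are learned):
\begin{verbatim}
import numpy as np

c, i = 10, 32 * 32 * 3             # number of classes, flattened image size
W = np.random.randn(c, i) * 0.01   # one template per class (to be learned)
b = np.zeros(c)

x = np.random.rand(i)              # a flattened image
logits = W @ x + b                 # one score per class
prediction = np.argmax(logits)     # index of the maximum score
\end{verbatim}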
|
|
|
|
\begin{remark}
|
|
Linear classifiers are limited by the expressiveness of the input representation: raw pixels alone are not a good feature space.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.45\linewidth]{./img/data_representation_linear.png}
|
|
\caption{
|
|
\parbox[t]{0.6\linewidth}{
|
|
Example of non-linearly separable data points that become linearly separable in polar coordinates
|
|
}
|
|
}
|
|
\end{figure}
|
|
\end{remark}
|
|
|
|
|
|
|
|
\section{Bag of visual words}
|
|
|
|
\begin{description}
|
|
\item[Codeword] \marginnote{Codeword}
|
|
Visual feature (e.g. an edge with a particular direction) that appears in an image.
|
|
|
|
\item[Bag of visual words (BOVW)] \marginnote{Bag of visual words (BOVW)}
|
|
Encoding of an image into a histogram of codeword frequencies.
|
|
\end{description}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.6\linewidth]{./img/bovw.png}
|
|
\end{figure}
|
|
|
|
|
|
|
|
\section{Neural networks}
|
|
|
|
\begin{description}
|
|
\item[Shallow neural network] \marginnote{Shallow neural network}
|
|
Two linear transformations with an activation function in between:
|
|
\[
|
|
\begin{split}
|
|
f(\vec{x}, \matr{\theta}) &= \matr{W}_2 \vec{h} + \vec{b}_2 \\
|
|
&= \matr{W}_2 \phi(\matr{W}_1 \vec{x} + \vec{b}_1) + \vec{b}_2 = \vec{s}
|
|
\end{split}
|
|
\]
|
|
where:
|
|
\begin{itemize}
|
|
\item $\matr{\theta} = (\matr{W}_1 \in \mathbb{R}^{h \times i}, \vec{b}_1 \in \mathbb{R}^{h}, \matr{W}_2 \in \mathbb{R}^{c \times h}, \vec{b}_2 \in \mathbb{R}^{c})$
are the parameters of the linear transformations with an inner representation of size $h$.
|
|
\item $\phi$ is an activation function.
|
|
\item $\vec{h}$ and $\vec{s}$ are activations.
|
|
\end{itemize}
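
A sketch of the forward pass of a shallow network, using the ReLU activation defined below (sizes are illustrative):
\begin{verbatim}
import numpy as np

i, h, c = 784, 128, 10                  # input, hidden and output sizes
W1, b1 = np.random.randn(h, i) * 0.01, np.zeros(h)
W2, b2 = np.random.randn(c, h) * 0.01, np.zeros(c)

x = np.random.rand(i)
hidden = np.maximum(0, W1 @ x + b1)     # phi = ReLU
s = W2 @ hidden + b2                    # logits
\end{verbatim}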
|
|
|
|
\item[Activation function] \marginnote{Activation function}
|
|
Function to introduce non-linearity.
|
|
|
|
\begin{remark}
|
|
Without an activation function, a neural network is equivalent to a plain linear transformation.
|
|
\end{remark}
|
|
|
|
Examples of activation functions are:
|
|
\begin{descriptionlist}
|
|
\item[Sigmoid]
|
|
Defined as:
|
|
\[
|
|
\sigma(a) = \frac{1}{1+\exp(-a)} \hspace{2em}
|
|
\frac{\partial \sigma(a)}{\partial a} = \sigma(a) \big( 1-\sigma(a) \big)
|
|
\]
|
|
It is subject to the vanishing gradient problem.
|
|
|
|
\item[Rectified linear unit (ReLU)]
|
|
Defined as:
|
|
\[
|
|
\texttt{ReLU}(a) = \max\{ 0, a \} \hspace{2em}
|
|
\frac{\partial \texttt{ReLU}(a)}{\partial a} = \begin{cases}
|
|
1 & \text{if } a \geq 0\\
|
|
0 & \text{otherwise}
|
|
\end{cases}
|
|
\]
|
|
It is subject to the dead neuron problem for negative inputs.
|
|
|
|
\item[Leaky ReLU]
|
|
Defined as:
|
|
\[
|
|
\texttt{leaky\_ReLU}(a) = \begin{cases}
|
|
a & \text{if $a \geq 0$} \\
|
|
0.01 a & \text{otherwise}
|
|
\end{cases} \hspace{2em}
|
|
\frac{\partial \texttt{leaky\_ReLU}(a)}{\partial a} = \begin{cases}
|
|
1 & \text{if } a \geq 0 \\
|
|
0.01 & \text{otherwise}
|
|
\end{cases}
|
|
\]
|
|
\end{descriptionlist}
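
The three activation functions can be sketched in NumPy as follows (illustrative):
\begin{verbatim}
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def relu(a):
    return np.maximum(0.0, a)

def leaky_relu(a, slope=0.01):
    return np.where(a >= 0, a, slope * a)
\end{verbatim}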
|
|
|
|
\begin{example}[Linear separability]
|
|
Linear transformations do not change the linear separability of the data points.
|
|
A non-linear function can make linear separation possible.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\begin{subfigure}{0.55\linewidth}
|
|
\centering
|
|
\includegraphics[width=\linewidth]{./img/relu_separability_1.png}
|
|
\end{subfigure}
|
|
|
|
\begin{subfigure}{0.55\linewidth}
|
|
\centering
|
|
\includegraphics[width=\linewidth]{./img/relu_separability_2.png}
|
|
\end{subfigure}
|
|
\end{figure}
|
|
\end{example}
|
|
|
|
\item[Deep neural network] \marginnote{Deep neural network}
|
|
Multiple layers of linear transformations and activation functions:
|
|
\[
|
|
\begin{split}
|
|
f(\vec{x}, \matr{\theta}) &= \matr{W}_L \vec{h}_{L-1} + \vec{b}_L \\
|
|
&= \matr{W}_L \phi_L(\matr{W}_{L-1} \vec{h}_{L-2} + \vec{b}_{L-1}) + \vec{b}_L \\
|
|
&= \matr{W}_L \phi_{L}(\matr{W}_{L-1} \phi_{L-1}(\cdots \phi_{1}(\matr{W}_{1} \vec{x} + \vec{b}_{1}) \cdots) + \vec{b}_{L-1}) + \vec{b}_L = \vec{s} \\
|
|
\end{split}
|
|
\]
|
|
|
|
\begin{description}
|
|
\item[Depth] Number of layers.
|
|
\item[Width] Number of activations at each layer.
|
|
\end{description}
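
A sketch of a deep forward pass as a loop over layers (sizes and initialization are illustrative):
\begin{verbatim}
import numpy as np

def forward(x, weights, biases, phi):
    # weights/biases: per-layer parameters, phi: activation function.
    h = x
    for W, b in zip(weights[:-1], biases[:-1]):
        h = phi(W @ h + b)               # hidden layers with non-linearity
    return weights[-1] @ h + biases[-1]  # the last layer outputs the logits

sizes = [784, 256, 128, 10]              # widths (depth = 3 layers)
weights = [np.random.randn(o, i) * 0.01 for i, o in zip(sizes[:-1], sizes[1:])]
biases = [np.zeros(o) for o in sizes[1:]]
s = forward(np.random.rand(784), weights, biases, phi=lambda a: np.maximum(0, a))
\end{verbatim}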
|
|
\end{description}
|
|
|
|
|
|
|
|
\section{Convolutional neural networks}
|
|
|
|
|
|
\subsection{Image filtering}
|
|
|
|
Consider the case of vertical edge detection.
|
|
Image filtering can be implemented through:
|
|
\begin{descriptionlist}
|
|
\item[Fully-connected layer] \marginnote{Image filtering with fully-connected layers}
|
|
Use an FC layer to transform the image.
|
|
|
|
Given an image of size $H \times W$, the layer requires:
|
|
\begin{itemize}
|
|
\item $(H \cdot W) \cdot (H \cdot (W-1)) \approx H^2W^2$ parameters.
|
|
\item $2 (H \cdot W) \cdot (H \cdot (W-1)) \approx 2H^2W^2$ FLOPs (multiplications and additions).
|
|
\end{itemize}
|
|
|
|
\item[Convolution/Correlation] \marginnote{Image filtering with convolutions}
|
|
Use a convolution (more precisely, a cross-correlation) to transform the image.
|
|
|
|
\begin{remark}
|
|
Convolutions preserve the spatial structure of the image, have shared parameters and extract local features.
|
|
\end{remark}
|
|
|
|
Given an image of size $H \times W$, a convolution requires:
|
|
\begin{itemize}
|
|
\item $2$ parameters.
|
|
\item $3 (H \cdot (W-1)) \approx 3HW$ FLOPs.
|
|
\end{itemize}
|
|
|
|
\begin{description}
|
|
\item[Convolution matrix]
|
|
A convolution can be expressed as a matrix multiplication such that:
|
|
\begin{itemize}
|
|
\item The parameters are shared across rows.
|
|
\item The resulting matrix is sparse.
|
|
\item It adapts to varying input sizes.
|
|
\item It is equivariant to translation (but not w.r.t. rotation and scale).
|
|
\end{itemize}
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.45\linewidth]{./img/convolution_matrix.png}
|
|
\caption{Multiplication matrix of a $1 \times 2$ convolution}
|
|
\end{figure}
|
|
\end{description}
|
|
\end{descriptionlist}
|
|
|
|
|
|
\subsection{Convolutional layer}
|
|
|
|
\begin{description}
|
|
\item[Multi-channel convolution] \marginnote{Multi-channel convolution}
|
|
On inputs with multiple channels (i.e. 3D inputs), a different 2D convolution is applied to each channel and the results are summed.
|
|
|
|
Given a $C_\text{in} \times H_\text{in} \times W_\text{in}$ image $I$, a convolution kernel $K$ will have shape $C_\text{in} \times H_K \times W_K$
|
|
and the output activation at each pixel is computed as:
|
|
\[
|
|
[K * I](j, i) =
|
|
\sum_{n=1}^{C_\text{in}}
|
|
\sum_{m = -\lfloor \frac{H_K}{2} \rfloor}^{\lfloor \frac{H_K}{2} \rfloor}
|
|
\sum_{l = -\lfloor \frac{W_K}{2} \rfloor}^{\lfloor \frac{W_K}{2} \rfloor}
|
|
K_n(m, l) I_n(j-m, i-l) + b
|
|
\]
|
|
where $b$ is a bias term associated with the filter.
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.2\linewidth]{./img/2d_convolution.png}
|
|
\end{figure}
|
|
|
|
\item[2D convolutional layer] \marginnote{2D convolutional layer}
|
|
Given a $C_\text{in} \times H_\text{in} \times W_\text{in}$ image $I$ and a desired number of channels $C_\text{out}$ in the output activation,
|
|
multiple different convolution kernels $K^{(i)}$ are applied and their results are stacked:
|
|
\[
|
|
[K * I]_k(j, i) = \sum_{n=1}^{C_\text{in}} \sum_{m} \sum_{l} K_n^{(k)}(m, l) I_n(j-m, i-l) + b^{(k)} \,\,\text{ for $k=1, \dots, C_\text{out}$}
|
|
\]
|
|
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.65\linewidth]{./img/2d_convolution_multi_out.png}
|
|
\end{figure}
|
|
|
|
\begin{remark}
|
|
Only applying convolutions results in a linear transformation of the input. Therefore, an activation function is applied after convolving.
|
|
\end{remark}
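
A naive sketch of a 2D convolutional layer without padding and with stride $1$ (it mirrors the formula rather than being efficient, and it implements a cross-correlation as discussed above):
\begin{verbatim}
import numpy as np

def conv2d(I, K, b):
    # I: (C_in, H_in, W_in) input.
    # K: (C_out, C_in, H_K, W_K) kernels, b: (C_out,) biases.
    C_out, C_in, H_K, W_K = K.shape
    _, H_in, W_in = I.shape
    H_out, W_out = H_in - H_K + 1, W_in - W_K + 1   # no padding, stride 1
    out = np.zeros((C_out, H_out, W_out))
    for k in range(C_out):
        for j in range(H_out):
            for i in range(W_out):
                patch = I[:, j:j + H_K, i:i + W_K]  # C_in x H_K x W_K window
                out[k, j, i] = np.sum(K[k] * patch) + b[k]
    return out

# Activation applied after convolving (here ReLU).
A = np.maximum(0, conv2d(np.random.rand(3, 8, 8),
                         np.random.randn(16, 3, 3, 3), np.zeros(16)))
\end{verbatim}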
|
|
|
|
\item[Padding]
|
|
\phantom{}
|
|
\begin{description}
|
|
\item[No padding] \marginnote{No padding}
|
|
The convolution is only applied at pixels where the kernel does not overflow the image.
|
|
|
|
Given a $H_\text{in} \times W_\text{in}$ image and a $H_K \times W_K$ kernel,
|
|
the output shape is:
|
|
\[ H_\text{out} = H_\text{in} - H_K + 1 \hspace{2em} W_\text{out} = W_\text{in} - W_K + 1 \]
|
|
|
|
\begin{remark}
|
|
This type of padding is referred to as \texttt{valid}.
|
|
\end{remark}
|
|
|
|
\item[Zero padding] \marginnote{Zero padding}
|
|
Zeros are added around the image.
|
|
|
|
Given a $H_\text{in} \times W_\text{in}$ image and a $H_K \times W_K$ kernel,
|
|
the padding is usually $P=\frac{H_K-1}{2}$ (for odd square kernels) and the output shape is:
|
|
\[ H_\text{out} = H_\text{in} - H_K + 1 + 2P \hspace{2em} W_\text{out} = W_\text{in} - W_K + 1 + 2P \]
|
|
|
|
\begin{remark}
|
|
This type of padding is referred to as \texttt{same}.
|
|
\end{remark}
|
|
\end{description}
|
|
|
|
|
|
\item[Stride] \marginnote{Stride}
|
|
Number of pixels by which the convolution kernel is shifted after each application. This is useful for downsampling the image.
|
|
|
|
Given a $H_\text{in} \times W_\text{in}$ image and a $H_K \times W_K$ kernel,
|
|
the output with stride $S$ and padding $P$ has shape:
|
|
\[
|
|
H_\text{out} = \left\lfloor \frac{H_\text{in} - H_K + 2P}{S} \right\rfloor + 1
\hspace{2em}
W_\text{out} = \left\lfloor \frac{W_\text{in} - W_K + 2P}{S} \right\rfloor + 1
|
|
\]
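
A small helper encoding the output-size formula, one spatial dimension at a time (illustrative):
\begin{verbatim}
def conv_output_size(in_size, kernel_size, padding=0, stride=1):
    # Apply separately to the height and to the width.
    return (in_size - kernel_size + 2 * padding) // stride + 1

print(conv_output_size(32, 3, padding=1, stride=1))  # 32 ("same" padding)
print(conv_output_size(32, 3, padding=1, stride=2))  # 16 (downsampling)
\end{verbatim}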
|
|
|
|
|
|
\item[Receptive field] \marginnote{Receptive field}
|
|
Number of pixels of the input image that affect a hidden unit.
|
|
|
|
Given a $H_K \times W_K$ kernel, without stride, the receptive field of a neuron at the $L$-th layer is:
|
|
\[ r_L = \big( 1 + L \cdot (H_K - 1) \big) \cdot \big( 1 + L \cdot (W_K - 1) \big) \]
|
|
|
|
If each layer has a stride $S_l$, then the receptive field of the $L$-th activation is:
|
|
\[
|
|
r_L = \left( 1 + \sum_{l=1}^{L} \left( (H_K - 1) \prod_{i=1}^{l-1}S_i \right) \right) \cdot
|
|
\left( 1 + \sum_{l=1}^{L} \left( (W_K - 1) \prod_{i=1}^{l-1}S_i \right) \right)
|
|
\]
|
|
|
|
\begin{remark}
|
|
Without stride, the receptive field grows linearly with the number of layers.
|
|
With the same stride ($> 1$) across all the layers, the growth becomes exponential as $\prod_{i=1}^{l-1}S_i = S^{l-1}$.
|
|
\end{remark}
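
The growth of the receptive field can be checked with a short computation (square kernels are assumed; the function returns the side of the square receptive field, whose square is $r_L$):
\begin{verbatim}
def receptive_field_side(num_layers, kernel_size, strides):
    # strides[l] is the stride of the (l+1)-th layer.
    side, jump = 1, 1          # field side and accumulated stride product
    for l in range(num_layers):
        side += (kernel_size - 1) * jump
        jump *= strides[l]
    return side

print(receptive_field_side(3, 3, [1, 1, 1]))  # 7  -> linear growth
print(receptive_field_side(3, 3, [2, 2, 2]))  # 15 -> much faster growth
\end{verbatim}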
|
|
|
|
\item[Computational cost] \marginnote{Computational cost}
|
|
\phantom{}
|
|
\begin{description}
|
|
\item[Parameters]
|
|
Given a $C_\text{in} \times H_\text{in} \times W_\text{in}$ image, a kernel $H_K \times W_K$ and a desired number of output channels $C_\text{out}$,
|
|
the corresponding convolutional layer has the following number of parameters:
|
|
\[ C_\text{out} ( C_\text{in} H_K W_K + 1 ) \]
|
|
|
|
\item[Floating-point operations]
|
|
Given a $C_\text{in} \times H_\text{in} \times W_\text{in}$ input image, a kernel $H_K \times W_K$ and
|
|
the corresponding output image of size $C_\text{out} \times H_\text{out} \times W_\text{out}$,
|
|
the number of FLOPs (multiplications and additions) is:
|
|
\[ 2 (C_\text{out} H_\text{out} W_\text{out}) (C_\text{in} H_K W_K) \]
|
|
|
|
\item[Multiply-accumulate operations]
|
|
A MAC operation implemented in hardware performs a multiplication and an addition in a single clock cycle.
|
|
Therefore, the number of MACs is:
|
|
\[ \cancel{2} (C_\text{out} H_\text{out} W_\text{out}) (C_\text{in} H_K W_K) \]
|
|
\end{description}
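
These counts can be wrapped into small helper functions (illustrative):
\begin{verbatim}
def conv_params(c_in, c_out, h_k, w_k):
    # Weights plus one bias per output channel.
    return c_out * (c_in * h_k * w_k + 1)

def conv_macs(c_in, c_out, h_out, w_out, h_k, w_k):
    # One multiply-accumulate per kernel element and output pixel.
    return (c_out * h_out * w_out) * (c_in * h_k * w_k)

print(conv_params(3, 64, 3, 3))          # 1792
print(conv_macs(3, 64, 224, 224, 3, 3))  # 86704128 (~86.7M MACs)
\end{verbatim}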
|
|
|
|
\item[Other convolutional layers]
|
|
\phantom{}
|
|
\begin{description}
|
|
\item[1D convolutional layer] \marginnote{1D convolutional layer}
|
|
Suitable for time series.
|
|
\[
|
|
[K * S]_k(i) = \sum_{n=1}^{C_\text{in}} \sum_{l} K_n^{(k)}(l) S_n(i-l) + b^{(k)}
|
|
\]
|
|
|
|
\item[3D convolutional layer] \marginnote{3D convolutional layer}
|
|
Suitable for videos.
|
|
\[
|
|
[K * V]_k(h, j, i) = \sum_{n=1}^{C_\text{in}} \sum_{p} \sum_{m} \sum_{l} K_n^{(k)}(p, m, l) V_n(h-p, j-m, i-l) + b^{(k)}
|
|
\]
|
|
\end{description}
|
|
\end{description}
|
|
|
|
|
|
\subsection{Pooling layer}
|
|
\marginnote{Pooling layer}
|
|
|
|
Kernel that aggregates several values through a fixed function into one output.
|
|
Each input channel is processed independently (i.e. $C_\text{in} = C_\text{out}$).
|
|
|
|
\begin{remark}
|
|
Traditionally, pooling layers were used for downsampling. Therefore, the stride is usually $> 1$.
|
|
\end{remark}
|
|
|
|
\begin{description}
|
|
\item[Max pooling] \marginnote{Max pooling}
|
|
Select the maximum within the kernel.
|
|
|
|
\begin{remark}
|
|
Max pooling is invariant to small spatial translations (depending on the receptive field, the translations can even be large w.r.t. the input image).
|
|
\end{remark}
|
|
\end{description}
|
|
|
|
\begin{remark}
|
|
Mean pooling can be implemented as an ordinary convolution with fixed uniform weights.
|
|
\end{remark}
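
A sketch of $2 \times 2$ max pooling with stride $2$ on a single channel, assuming even spatial dimensions:
\begin{verbatim}
import numpy as np

def max_pool_2x2(x):
    # x: (H, W) feature map with even H and W; kernel 2x2, stride 2.
    H, W = x.shape
    return x.reshape(H // 2, 2, W // 2, 2).max(axis=(1, 3))

x = np.arange(16.0).reshape(4, 4)
print(max_pool_2x2(x))  # [[ 5.  7.] [13. 15.]]
\end{verbatim}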
|
|
|
|
|
|
\subsection{Batch normalization layer}
|
|
\marginnote{Batch normalization layer}
|
|
|
|
Normalize the output of a layer during training in such a way that it has zero mean and unit variance.
|
|
|
|
\begin{description}
|
|
\item[Training]
|
|
During training, normalization is done on the current batch.
|
|
Given the $B$ activations of a batch $\{ \vec{a}^{(i)} \in \mathbb{R}^{D} \mid i = 1, \dots, B \}$,
|
|
mean and variance are computed as:
|
|
\[
|
|
\vec{\mu}_j = \frac{1}{B} \sum_{i=1}^{B} \vec{a}_j^{(i)}
|
|
\hspace{1.5em}
|
|
\vec{v}_j = \frac{1}{B} \sum_{i=1}^{B} \left( \vec{a}_j^{(i)} - \vec{\mu}_j \right)^2
|
|
\hspace{2em}
|
|
\text{ for $j = 1, \dots, D$}
|
|
\]
|
|
|
|
Then, the normalized activation is computed as:
|
|
\[ \hat{\vec{a}}_j^{(i)} = \frac{\vec{a}_j^{(i)} - \vec{\mu}_j}{\sqrt{\vec{v}_j + \varepsilon}} \hspace{2em} \text{ for $j = 1, \dots, D$} \]
|
|
where $\varepsilon$ is a small constant.
|
|
|
|
To introduce some flexibility, the final activation $\vec{s}^{(i)}$ is learned as:
|
|
\[ \vec{s}_j^{(i)} = \vec{\gamma}_j \hat{\vec{a}}_j^{(i)} + \vec{\beta}_j \hspace{2em} \text{ for $j = 1, \dots, D$} \]
|
|
where $\vec{\gamma}_j$ and $\vec{\beta}_j$ are parameters.
|
|
|
|
To estimate the mean and variance of the entire dataset (to be used during inference), running averages are also computed.
|
|
At the $t$-th step, the running averages of mean and variance are computed as:
|
|
\[
|
|
\vec{\mu}_j^{(t)} = (1-\beta) \vec{\mu}_j^{(t-1)} + \beta \vec{\mu}_j
|
|
\hspace{1.5em}
|
|
\vec{v}_j^{(t)} = (1-\beta) \vec{v}_j^{(t-1)} + \beta \vec{v}_j
|
|
\hspace{2em} \text{ for $j = 1, \dots, D$}
|
|
\]
|
|
where $\beta$ is the momentum (usually $\beta = 0.1$).
|
|
|
|
\begin{remark}
|
|
All training steps of batch normalization are differentiable and can be integrated into gradient descent.
|
|
If normalization is done outside gradient descent, the optimization process might undo it.
|
|
\end{remark}
|
|
|
|
\begin{remark}
|
|
For convolutional layers, mean and variance are computed over the batch and the spatial dimensions (i.e. pixels in the same output channel are normalized in the same way).
|
|
\end{remark}
|
|
|
|
|
|
\item[Inference]
|
|
During inference, the final running averages of mean $\vec{\mu}$ and variance $\vec{v}$ are used to normalize the activations (i.e. they are considered constants).
|
|
Given the learned parameters $\vec{\gamma}$ and $\vec{\beta}$, an activation is normalized as follows:
|
|
\[
|
|
\begin{split}
|
|
\vec{s}_j^{(i)} &= \vec{\gamma}_j \frac{\vec{a}_j^{(i)} - \vec{\mu}_j}{\sqrt{\vec{v}_j + \varepsilon}} + \vec{\beta}_j \\
|
|
&= \left( \frac{\vec{\gamma}_j}{\sqrt{\vec{v}_j + \varepsilon}} \right) \vec{a}_j^{(i)} +
|
|
\left( \vec{\beta}_j - \frac{\vec{\gamma}_j \vec{\mu}_j}{\sqrt{\vec{v}_j + \varepsilon}} \right)
|
|
\end{split}
|
|
\hspace{2em} \text{ for $j = 1, \dots, D$}
|
|
\]
|
|
|
|
\begin{remark}
|
|
Normalization during inference can be seen as a linear transformation. Therefore, it can be merged with the previous layer.
|
|
\end{remark}
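
A sketch of batch normalization for fully-connected activations, following the formulas above for both training (batch statistics and running averages) and inference (running statistics treated as constants):
\begin{verbatim}
import numpy as np

def batchnorm_train(A, gamma, beta, run_mu, run_var, momentum=0.1, eps=1e-5):
    # A: (B, D) batch of activations; gamma, beta, run_mu, run_var: (D,).
    mu = A.mean(axis=0)
    var = A.var(axis=0)
    A_hat = (A - mu) / np.sqrt(var + eps)
    # Running averages used at inference time.
    run_mu = (1 - momentum) * run_mu + momentum * mu
    run_var = (1 - momentum) * run_var + momentum * var
    return gamma * A_hat + beta, run_mu, run_var

def batchnorm_eval(A, gamma, beta, run_mu, run_var, eps=1e-5):
    return gamma * (A - run_mu) / np.sqrt(run_var + eps) + beta
\end{verbatim}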
|
|
|
|
|
|
\item[Properties]
|
|
The advantages of batch normalization are:
|
|
\begin{itemize}
|
|
\item It allows using a higher learning rate and makes initialization less important.
|
|
\item Training becomes non-deterministic (the activations of a sample depend on the other samples in its batch), which introduces some regularization.
|
|
\item During inference, there is no overhead as it can be merged with the previous layer.
|
|
\end{itemize}
|
|
The disadvantages are:
|
|
\begin{itemize}
|
|
\item It is not clear why it works.
|
|
\item Training and inference work differently.
|
|
\item It does not work well with very small batches, as the estimated statistics become too noisy.
|
|
\end{itemize}
|
|
|
|
\begin{remark}[Internal covariate shift]
|
|
A possible motivation for batch normalization is that each layer of a neural network receives inputs whose distribution depends on the parameters of the previous layers and therefore changes at every training iteration.
Each layer is thus disrupted by the updates of the layers before it. Batch normalization aims to mitigate this by keeping the input distribution of each layer approximately fixed.
|
|
\end{remark}
|
|
\end{description} |