Moved DL in year1

This commit is contained in:
2024-04-14 21:02:30 +02:00
parent 63f2aa68fb
commit c98859ed9e
23 changed files with 1 additions and 1 deletions

View File

@ -0,0 +1 @@
../../ainotes.cls

View File

@ -0,0 +1,14 @@
\documentclass[11pt]{ainotes}
\title{Deep Learning}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
\begin{document}
\makenotesfront
\input{./sections/_expressivity.tex}
\input{./sections/_training.tex}
\input{./sections/_convolutional_nn.tex}
\end{document}

16 binary files not shown (13 of them images, with sizes 32 KiB, 55 KiB, 1.4 MiB, 662 KiB, 24 KiB, 95 KiB, 146 KiB, 395 KiB, 411 KiB, 34 KiB, 502 KiB, 129 KiB, 14 KiB).

View File

@ -0,0 +1,11 @@
{
"name": "Deep Learning",
"year": 1,
"semester": 2,
"pdfs": [
{
"name": null,
"path": "dl.pdf"
}
]
}

View File

@ -0,0 +1,570 @@
\chapter{Convolutional neural networks}
\section{Convolutions}
\begin{description}
\item[Convolutional neuron] \marginnote{Convolutional neuron}
Neuron influenced by only a subset of neurons in the previous layer.
\item[Receptive field] \marginnote{Receptive field}
Region of the input image that influences a neuron.
\item[Convolutional layer] \marginnote{Convolutional layer}
Layer composed of convolutional neurons.
Neurons in the same convolutional layer share the same weights and work as a convolutional filter.
\begin{remark}
The weights of the filters are learned.
\end{remark}
A convolutional layer has the following parameters:
\begin{descriptionlist}
\item[Kernel size] \marginnote{Kernel size}
Dimension (i.e. width and height) of the filter.
\item[Stride] \marginnote{Stride}
Offset between each filter application (i.e. stride $>1$ reduces the size of the output image).
\item[Padding] \marginnote{Padding}
Artificial enlargement of the input image (typically by padding its borders with zeros).
In practice, there are two modes of padding:
\begin{descriptionlist}
\item[Valid] No padding applied.
\item[Same] Apply the minimum padding needed to keep the output size equal to the input size (with stride $1$).
\end{descriptionlist}
\item[Depth] \marginnote{Depth}
Number of different kernels to apply (i.e. it determines the number of channels of the output image).
\end{descriptionlist}
The dimension along each axis of the output image is given by:
\[ \left\lfloor \frac{W + P - K}{S} \right\rfloor + 1 \]
where:
\begin{itemize}
\item $W$ is the size of the image (width or height).
\item $P$ is the total padding (summed over both sides).
\item $K$ is the kernel size.
\item $S$ is the stride.
\end{itemize}
\begin{remark}
Unless otherwise specified, a kernel spans all the channels of the input image (with different weights for each channel) and the per-channel results are summed into a single output channel.
\end{remark}
\end{description}
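As a quick check of the output-size formula above (sizes chosen purely for illustration):
\begin{example}
A $32 \times 32$ image convolved with a $5 \times 5$ kernel, no padding and stride $1$ produces a $28 \times 28$ output, while a $7 \times 7$ image with the same kernel, no padding and stride $2$ produces a $2 \times 2$ output:
\[ \frac{32 + 0 - 5}{1} + 1 = 28 \hspace{3em} \frac{7 + 0 - 5}{2} + 1 = 2 \]
\end{example}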
\subsection{Parameters}
The number of parameters of a convolutional layer is given by:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} \cdot D_\text{out} + D_\text{out} \]
where:
\begin{itemize}
\item $K_\text{w}$ is the width of the kernel.
\item $K_\text{h}$ is the height of the kernel.
\item $D_\text{in}$ is the input depth.
\item $D_\text{out}$ is the output depth.
\end{itemize}
Therefore, the number of FLOPs is of order:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} \cdot D_\text{out} \cdot (O_\text{w} \cdot O_\text{h}) \]
where:
\begin{itemize}
\item $O_\text{w}$ is the width of the output image.
\item $O_\text{h}$ is the height of the output image.
\end{itemize}
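As a worked instance of the two formulas above (numbers chosen arbitrarily):
\begin{example}
A convolutional layer with a $3 \times 3$ kernel, $D_\text{in} = 64$ and $D_\text{out} = 128$ has
\[ (3 \cdot 3) \cdot 64 \cdot 128 + 128 = 73\,856 \]
parameters. If the output image is $56 \times 56$, the number of FLOPs is of order
\[ (3 \cdot 3) \cdot 64 \cdot 128 \cdot (56 \cdot 56) \approx 2.3 \cdot 10^8 \]
\end{example}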
\section{Backpropagation}
A convolution can be expressed as a dense layer by representing it through a sparse matrix.
Therefore, backpropagation can be executed in the standard way,
with the only exception that the entries of the convolution matrix corresponding to
the same cell of the kernel must all receive the same update, obtained by aggregating the gradients of all the corresponding entries (typically their sum).
\begin{example}
Given a $4 \times 4$ image $I$ and a $3 \times 3$ kernel $K$ with stride $1$ and no padding:
\[
I = \begin{pmatrix} i_{0,0} & i_{0,1} & i_{0,2} & i_{0,3} \\ i_{1,0} & i_{1,1} & i_{1,2} & i_{1,3} \\
i_{2,0} & i_{2,1} & i_{2,2} & i_{2,3} \\ i_{3,0} & i_{3,1} & i_{3,2} & i_{3,3}
\end{pmatrix}
\hspace{3em}
K = \begin{pmatrix} w_{0,0} & w_{0,1} & w_{0,2} \\ w_{1,0} & w_{1,1} & w_{1,2} \\ w_{2,0} & w_{2,1} & w_{2,2} \end{pmatrix}
\]
The convolutional layer can be represented through a convolutional matrix and by flattening the image as follows:
\[
\begin{pmatrix}
w_{0,0} & 0 & 0 & 0 \\
w_{0,1} & w_{0,0} & 0 & 0 \\
w_{0,2} & w_{0,1} & 0 & 0 \\
0 & w_{0,2} & 0 & 0 \\
w_{1,0} & 0 & w_{0,0} & 0 \\
w_{1,1} & w_{1,0} & w_{0,1} & w_{0,0} \\
w_{1,2} & w_{1,1} & w_{0,2} & w_{0,1} \\
0 & w_{1,2} & 0 & w_{0,2} \\
w_{2,0} & 0 & w_{1,0} & 0 \\
w_{2,1} & w_{2,0} & w_{1,1} & w_{1,0} \\
w_{2,2} & w_{2,1} & w_{1,2} & w_{1,1} \\
0 & w_{2,2} & 0 & w_{1,2} \\
0 & 0 & w_{2,0} & 0 \\
0 & 0 & w_{2,1} & w_{2,0} \\
0 & 0 & w_{2,2} & w_{2,1} \\
0 & 0 & 0 & w_{2,2} \\
\end{pmatrix}^T
\cdot
\begin{pmatrix} i_{0,0} \\ i_{0,1} \\ i_{0,2} \\ i_{0,3} \\ i_{1,0} \\ i_{1,1} \\ i_{1,2} \\ i_{1,3} \\
i_{2,0} \\ i_{2,1} \\ i_{2,2} \\ i_{2,3} \\ i_{3,0} \\ i_{3,1} \\ i_{3,2} \\ i_{3,3}
\end{pmatrix}
=
\begin{pmatrix} o_{0,0} \\ o_{0,1} \\ o_{1,0} \\ o_{1,1} \end{pmatrix}
\mapsto
\begin{pmatrix} o_{0,0} & o_{0,1} \\ o_{1,0} & o_{1,1} \end{pmatrix}
\]
\end{example}
\section{Pooling layer}
\begin{description}
\item[Pooling] \marginnote{Pooling}
Layer that applies a fixed (non-learned) aggregation function over each kernel window.
\begin{descriptionlist}
\item[Max-pooling] \marginnote{Max-pooling}
Filter that computes the maximum of the pixels within the kernel.
\item[Mean-pooling] \marginnote{Mean-pooling}
Filter that computes the average of the pixels within the kernel.
\end{descriptionlist}
\end{description}
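A small numerical illustration (values chosen arbitrarily):
\begin{example}
A $2 \times 2$ max-pooling with stride $2$ applied to a $4 \times 4$ image:
\[
\begin{pmatrix} 1 & 3 & 2 & 0 \\ 4 & 2 & 1 & 1 \\ 0 & 1 & 5 & 2 \\ 2 & 3 & 0 & 4 \end{pmatrix}
\mapsto
\begin{pmatrix} 4 & 2 \\ 3 & 5 \end{pmatrix}
\]
\end{example}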
\section{Inception hypothesis}
\begin{description}
\item[Depth-wise separable convolution] \marginnote{Depth-wise separable convolution}
Decompose a full 3D kernel into per-channel 2D kernels (depth-wise convolution) followed by a $1 \times 1$ convolution across channels (point-wise convolution).
Given an input image with $C_\text{in}$ channels,
a single pass of a traditional 3D convolution uses a kernel of shape $k \times k \times C_\text{in}$
to obtain an output of $1$ channel.
This is repeated $C_\text{out}$ times (with different kernels) to obtain the desired number of output channels.
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/traditional_convolution.png}
\caption{Example of traditional convolution}
\end{figure}
A single pass of a depth-wise separable convolution first uses $C_\text{in}$ different $k \times k \times 1$ kernels (one per channel) to obtain $C_\text{in}$ images.
Then, a $1 \times 1 \times C_\text{in}$ kernel is used to obtain an output image of $1$ channel.
The point-wise $1 \times 1 \times C_\text{in}$ kernel is repeated $C_\text{out}$ times (with different weights).
\begin{figure}[H]
\centering
\includegraphics[width=0.85\linewidth]{./img/depthwise_separable_convolution.png}
\caption{Example of depth-wise separable convolution}
\end{figure}
\end{description}
\subsection{Parameters}
The number of parameters of a depth-wise separable convolutional layer is given by:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} + (1 \cdot 1 \cdot D_\text{in}) \cdot D_\text{out} \]
where:
\begin{itemize}
\item $K_\text{w}$ is the width of the kernel.
\item $K_\text{h}$ is the height of the kernel.
\item $D_\text{in}$ is the input depth.
\item $D_\text{out}$ is the output depth.
\end{itemize}
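For comparison with a traditional convolution (numbers chosen arbitrarily):
\begin{example}
With a $3 \times 3$ kernel, $D_\text{in} = 64$ and $D_\text{out} = 128$, a depth-wise separable convolution has
\[ (3 \cdot 3) \cdot 64 + (1 \cdot 1 \cdot 64) \cdot 128 = 576 + 8192 = 8768 \]
weights, roughly $8$ times fewer than the $(3 \cdot 3) \cdot 64 \cdot 128 = 73\,728$ weights of the corresponding traditional convolution.
\end{example}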
\section{Residual learning}
\begin{description}
\item[Residual connection] \marginnote{Residual connection}
Sum the input of a layer to its output.
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_residual_connection.pdf}
\caption{Residual connection}
\end{figure}
\begin{remark}
The sum operation can be replaced by a concatenation.
\end{remark}
\begin{remark}
The effectiveness of residual connections has only been shown empirically.
\end{remark}
\begin{remark}
Adding the input without passing it through the activation function
might help to propagate the gradient from higher layers to lower layers
and avoid the risk of vanishing gradients.
Another interpretation is that, by learning the function $F(x) + x$, it is easier for the model to represent, if needed, the identity function, as
the problem reduces to learning $F(x) = 0$.
On the other hand, without a residual connection, learning $F(x) = x$ from scratch might be harder.
\end{remark}
\end{description}
\section{Transfer learning and fine-tuning}
\begin{description}
\item[Transfer learning] \marginnote{Transfer learning}
Reuse an existing model by appending some new layers to it.
Only the new layers are trained.
\item[Fine-tuning] \marginnote{Fine-tuning}
Reuse an existing model by appending some new layers to it.
The existing model (or part of it) is trained alongside the new layers.
\end{description}
\begin{remark}
In computer vision, reusing an existing model makes sense as
the first convolutional layers tend to learn primitive concepts that are independent of the downstream task.
\end{remark}
\section{Other types of convolution}
\begin{description}
\item[Transposed convolution / Deconvolution] \marginnote{Transposed convolution / Deconvolution}
Convolution to upsample the input (i.e. each pixel is upsampled into a $k \times k$ patch).
\begin{remark}
A transposed convolution can be interpreted as a normal convolution with stride $< 1$.
\end{remark}
\item[Dilated convolution] \marginnote{Dilated convolution}
Convolution computed using a kernel whose entries are applied to non-contiguous pixels (i.e. the kernel is spread over a larger area with gaps).
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/dilated_convolution.png}
\caption{Examples of dilated convolutions}
\end{figure}
\begin{remark}
Dilated convolutions allow the enlargement of the receptive field without an excessive number of parameters.
\end{remark}
\begin{remark}
Dilated convolutions are useful in the first layers when processing high-resolution images (e.g. temporal convolutional networks).
\end{remark}
\end{description}
\section{Normalization layer}
A normalization layer has the empirical effects of:
\begin{itemize}
\item Stabilizing and possibly speeding up the training phase.
\item Increasing the independence of each layer (i.e. maintaining outputs of similar magnitude at each layer).
\end{itemize}
\begin{description}
\item[Batch normalization] \marginnote{Batch normalization}
Given an input batch $X$, a batch normalization layer outputs the following:
\[ \gamma \frac{X - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta \]
where:
\begin{itemize}
\item $\gamma$ and $\beta$ are learned parameters.
\item $\varepsilon$ is a small constant.
\item $\mu$ is the mean and $\sigma^2$ is the variance.
Depending on when the layer is applied, these values change:
\begin{descriptionlist}
\item[Training]
$\mu$ and $\sigma^2$ are computed from the input batch $X$.
\item[Inference]
$\mu$ and $\sigma^2$ are estimated from the training data.
Usually, they are obtained as moving averages of the values computed from the batches during training.
\end{descriptionlist}
\end{itemize}
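In practice (the exact constant is an implementation detail and varies across frameworks), the inference statistics are typically maintained during training as exponential moving averages:
\[ \mu_\text{run} \leftarrow \alpha \, \mu_\text{run} + (1 - \alpha) \, \mu_\text{batch} \hspace{3em} \sigma^2_\text{run} \leftarrow \alpha \, \sigma^2_\text{run} + (1 - \alpha) \, \sigma^2_\text{batch} \]
with $\alpha$ close to $1$ (e.g. $0.9$).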
\end{description}
\section{Gradient ascent}
\subsection{Hidden layer visualization}
\marginnote{Hidden layer visualization}
Visualize what type of input features activate a neuron.
\begin{description}
\item[Image ascent approach]
During training, the loss function of a neural network $\mathcal{L}(\vec{x}; \vec{\theta})$ is
parametrized on the weights $\vec{\theta}$ while the input $\vec{x}$ is fixed.
To visualize the patterns that activate a (convolutional) neuron, it is possible to invert the optimization process
by fixing the parameters $\vec{\theta}$ and optimizing an image $\vec{x}$ so that the loss function becomes $\mathcal{L}(\vec{\theta}; \vec{x})$.
The process works as follows:
\begin{enumerate}
\item Start with a random image $\vec{x}$.
\item Do a forward pass with $\vec{x}$ as input and keep track of the activation $a_i(\vec{x})$ of the neuron(s) of interest.
\item Do a backward pass to compute the gradient $\frac{\partial a_i(\vec{x})}{\partial x_{u,v}}$ (i.e. chain rule) for each pixel $(u, v)$ of the image.
\item Update the image as $\vec{x} = \vec{x} + \eta \frac{\partial a_i(\vec{x})}{\partial \vec{x}}$.
\item Repeat until the activation $a_i(\vec{x})$ is high enough.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/cnn_visualization_ascent.png}
\caption{Example of image ascent visualization approach}
\end{figure}
\item[Generative approach]
Starting from an image $\hat{\vec{x}}$ that makes a specific layer $l$ output $\Theta_l(\hat{\vec{x}})$,
generate another image $\vec{x}$ that makes the same layer $l$ output a similar value $\Theta_l(\vec{x}) \approx \Theta_l(\hat{\vec{x}})$
(i.e. the layer cannot distinguish between $\vec{x}$ and $\hat{\vec{x}}$).
For a fixed $\hat{\vec{x}}$, the problem can be solved as an optimization problem:
\[ \arg\min_{\vec{x}} \Big\{ \ell\big( \Theta_l(\vec{x}), \Theta_l(\hat{\vec{x}}) \big) + \lambda \mathcal{R}(\vec{x}) \Big\} \]
where $\ell$ is a loss function to measure the distance between the two representations and
$\mathcal{R}$ is a regularizer.
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/cnn_visualization_generative.png}
\caption{Example of generative visualization approach}
\end{figure}
\end{description}
\subsection{Inceptionism}
\marginnote{Inceptionism}
Employ the same techniques used for hidden layer visualization to create psychedelic and abstract images.
\begin{description}
\item[Deep dream] \marginnote{Deep dream}
Iteratively apply gradient ascent on an image:
\begin{enumerate}
\item Train a neural network for image classification.
\item Repeatedly modify an input image using gradient ascent to improve the activation of a specific neuron.
\end{enumerate}
After enough iterations, the features that the target neuron learned to recognize during training are injected into the input image,
even if that image does not have that specific feature.
\begin{remark}
Strong regularizers are used to prioritize features that statistically resemble real images.
\end{remark}
\item[Content enhancing] \marginnote{Content enhancing}
Same as above, but instead of selecting a neuron, an entire layer is fixed and the input image is injected with whatever that layer detects.
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/inceptionism.png}
\caption{Example of deep dream images}
\end{figure}
\end{description}
\subsection{Style transfer}
\marginnote{Style transfer}
Mimic the style of an image and transfer it to the content of another one.
\begin{description}
\item[Internal representation approach]
Given a convolutional neural network pretrained for classification, the method can be divided into two parts:
\begin{descriptionlist}
\item[Content reconstruction]
Given an image $\hat{\vec{x}}$, consider the output of the $l$-th layer of the network.
Its internal representation of the image has $C^l$ distinct channels (depending on the number of kernels)
each with $M^l = W^l \cdot H^l$ elements (when flattened).
The representation (feature map) of the $l$-th layer can therefore be denoted as $F^l \in \mathbb{R}^{C^l \times M^l}$
and $F^l_{c, k}$ is used to denote the activation of the $c$-th filter applied at position $k$ of the $l$-th layer.
As higher layers of a CNN capture high-level features, one of the higher layers is selected and its feature map is used as the content representation.
Given a content representation $\mathcal{C} = \hat{F}^l$ of $\hat{\vec{x}}$, chosen as the feature map at the $l$-th layer,
it is possible to reconstruct the original image $\hat{\vec{x}}$ starting from a random one $\vec{x}$ by minimizing the loss:
\[ \mathcal{L}_\text{content}(\hat{\vec{x}}, \vec{x}, l) = \sum_{c, i} (F^l_{c, i} - \mathcal{C}_{c, i})^2 \]
where $F^l$ is the feature representation of the random image $\vec{x}$.
\item[Style reconstruction]
Given an image $\hat{\vec{y}}$ and its feature maps $F^l$ for $l \in \{1, \dots, L\}$,
the Gram matrix $G^l \in \mathbb{R}^{C^l \times C^l}$ at each layer $l$, obtained from the dot products between pairs of channels
(i.e. the correlations between features extracted by different kernels):
\[ G^l_{c_1, c_2} = \sum_{k} F^l_{c_1, k} \cdot F^l_{c_2, k} \]
captures the notion of style.
The Gram matrices at each layer are considered as the style representation.
Given the style representation $\mathcal{S}^1, \dots, \mathcal{S}^L$ of $\hat{\vec{y}}$,
it is possible to reconstruct the same style of the original image $\hat{\vec{y}}$ starting from a random image $\vec{y}$ by minimizing the loss:
\[ \mathcal{L}_\text{style}(\hat{\vec{y}}, \vec{y}) = \sum_{l=1}^{L} \gamma_l \left(\sum_{i,j} (G^l_{i, j} - \mathcal{S}^l_{i,j})^2 \right) \]
where $\gamma_l$ is a weight assigned to each layer and $G^l$ is the $l$-th Gram matrix of the random image $\vec{y}$.
\end{descriptionlist}
Put together, given:
\begin{itemize}
\item An image $\hat{\vec{x}}$ from which the content has to be copied.
\item An image $\hat{\vec{y}}$ from which the style has to be copied.
\item The content representation $\mathcal{C}$ of $\hat{\vec{x}}$.
\item The style representation $\mathcal{S}^1, \dots, \mathcal{S}^L$ of $\hat{\vec{y}}$.
\end{itemize}
A new random image $\vec{o}$ is fitted by minimizing the loss:
\[ \mathcal{L}_\text{total} = \alpha \mathcal{L}_\text{content}(\hat{\vec{x}}, \vec{o}, l) + \beta \mathcal{L}_\text{style}(\hat{\vec{y}}, \vec{o}) \]
where $\alpha$ and $\beta$ are hyperparameters.
\begin{figure}[H]
\centering
\includegraphics[width=0.95\linewidth]{./img/style_transfer.png}
\caption{Internal representation style transfer workflow}
\end{figure}
\item[Perceptual loss approach]
A CNN pretrained for classification is used as a loss network to compute perceptual loss functions
to measure the difference in style and content between images.
The representation for style and content is extracted in a similar way as above.
The loss network is then kept fixed and an image transformation network is trained to transform its input $\vec{x}$
into an output image $\vec{y}$ that is consistent (i.e. minimizes the perceptual losses) with a given style image $\vec{y}_s$ and a content image $\vec{y}_c$
(if the goal is to keep the content of the input, then $\vec{y}_c = \vec{x}$).
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/style_transfer_perceptual_loss.png}
\caption{Perceptual loss style transfer workflow}
\end{figure}
\end{description}
\section{Data manifold}
\subsection{Adversarial attacks}
\marginnote{Adversarial attacks}
Hijack a neural network classifier to forcefully predict a given class.
\begin{description}
\item[Gradient ascent approach]
White-box technique that uses gradient ascent to compute an image that the network classifies as the wanted class.
Let:
\begin{itemize}
\item $\vec{x}$ be the input image.
\item $f(\vec{x})$ the probability distribution that the network outputs.
\item $c$ the wanted class.
\item $p$ the wanted probability distribution (i.e. $p_c = 1$ and $p_i = 0$ elsewhere).
\item $\mathcal{L}$ the loss function.
\end{itemize}
By iteratively updating the input image using the gradient of the loss function $\frac{\partial\mathcal{L}(f(\vec{x}), p)}{\partial\vec{x}}$
computed w.r.t. $\vec{x}$,
after enough iterations, the classifier will classify the updated $\vec{x}$ as $c$.
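Explicitly, each iteration performs a descent step on the loss towards the wanted distribution (equivalently, gradient ascent on the predicted probability of class $c$); the step size $\eta$ is a hyperparameter:
\[ \vec{x} \leftarrow \vec{x} - \eta \frac{\partial\mathcal{L}(f(\vec{x}), p)}{\partial\vec{x}} \]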
\begin{remark}
The updates computed from the gradient of the loss function are usually imperceptible.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.25\linewidth]{./img/fooling_nn.png}
\caption{Examples of hijacked classifications}
\end{figure}
\item[Evolutionary approach]
Black-box technique based on an evolutionary approach.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/fooling_evolutionary.png}
\caption{Workflow for evolutionary-based attacks}
\end{figure}
\end{description}
\subsection{Manifold}
\begin{description}
\item[Manifold] \marginnote{Manifold}
Area of the feature space that represents ``natural'' images (i.e. images with a meaning and without artificial noise).
This area is usually organized along a smooth surface that occupies a minimal portion of the entire space of all possible images.
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/manifold.png}
\caption{Example of manifold in two dimensions}
\end{figure}
\end{description}
\begin{remark}
As one cannot know where the classifier draws the boundaries,
a tiny change in the data might cause a misclassification.
Adversarial attacks also exploit this to cause misclassifications.
\end{remark}
\begin{remark}
Inceptionism aims to modify the data while remaining in the manifold.
\end{remark}
\subsection{Autoencoders}
\marginnote{Autoencoder}
Network composed of two components:
\begin{descriptionlist}
\item[Encoder]
Projects the input into an internal representation of lower dimensionality.
\item[Decoder]
Reconstructs the input from its internal representation.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/autoencoder.png}
\caption{Autoencoder structure}
\end{figure}
An autoencoder has the following properties:
\begin{descriptionlist}
\item[Data-specific] It only works well on data with strong correlations (i.e. with regularities in the feature space), similar to the data seen during training.
\item[Lossy] By passing through the internal representation, the reconstruction of the input is nearly always degraded.
\item[Self-supervised] Training happens directly on unlabelled data.
\end{descriptionlist}
Applications of autoencoders are:
\begin{descriptionlist}
\item[Denoising]
Train the autoencoder to reconstruct noiseless data.
Given an image, the input is a noisy version of it, while the output is expected to be similar to the original image.
\item[Anomaly detection]
As autoencoders are data-specific, they will perform poorly on data different from those used for training.
This makes it possible to detect anomalies by comparing the quality of the reconstruction.
If the input is substantially different from the training data (or has been attacked with an artificial manipulation),
the reconstructed output is expected to have poor quality.
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/autoencoder_anomaly.png}
\caption{Example of anomaly detection}
\end{figure}
\end{descriptionlist}

View File

@ -0,0 +1,72 @@
\chapter{Neural networks expressivity}
\section{Perceptron}
Single neuron that defines a binary threshold through a hyperplane:
\[
\begin{cases}
1 & \sum_{i} w_i x_i + b \geq 0 \\
0 & \text{otherwise}
\end{cases}
\]
\begin{description}
\item[Expressivity] \marginnote{Perceptron expressivity}
A perceptron can represent a NAND gate but not a XOR gate.
\begin{center}
\begin{minipage}{.2\textwidth}
\centering
\includegraphics[width=\textwidth]{img/_perceptron_nand.pdf}
\tiny NAND
\end{minipage}
\begin{minipage}{.2\textwidth}
\centering
\includegraphics[width=\textwidth]{img/_xor.pdf}
\tiny XOR
\end{minipage}
\end{center}
\begin{remark}
Even though NAND is functionally complete, a perceptron is, by its strict definition, a single unit and not a composition of them.
\end{remark}
\end{description}
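A concrete choice of weights for the NAND gate above (one of many possible choices):
\begin{example}
With weights $w_1 = w_2 = -2$ and bias $b = 3$, the inputs $(0,0)$, $(0,1)$ and $(1,0)$ give $3, 1, 1 \geq 0$ (output $1$), while $(1,1)$ gives $-4 + 3 = -1 < 0$ (output $0$), i.e. the perceptron computes NAND.
\end{example}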
\section{Multi-layer perceptron}
Composition of perceptrons.
\begin{descriptionlist}
\item[Shallow neural network] \marginnote{Shallow NN}
Neural network with one hidden layer.
\item[Deep neural network] \marginnote{Deep NN}
Neural network with more than one hidden layer.
\end{descriptionlist}
\begin{description}
\item[Expressivity] \marginnote{Multi-layer perceptron expressivity}
Shallow neural networks can approximate any continuous function
\[ f: \mathbb{R} \rightarrow [0, 1] \]
\begin{remark}
Still, deep neural networks can achieve the same expressivity with fewer units.
\end{remark}
\end{description}
\subsection{Parameters}
The number of parameters of a layer is given by:
\[ S_\text{in} \cdot S_\text{out} + S_\text{out} \]
where:
\begin{itemize}
\item $S_\text{in}$ is the dimension of the input of the layer.
\item $S_\text{out}$ is the dimension of the output of the layer.
\end{itemize}
Therefore, the number of FLOPs is of order:
\[ S_\text{in} \cdot S_\text{out} \]
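For instance (sizes chosen arbitrarily):
\begin{example}
A dense layer with input size $S_\text{in} = 784$ and output size $S_\text{out} = 128$ has
\[ 784 \cdot 128 + 128 = 100\,480 \]
parameters and requires on the order of $784 \cdot 128 \approx 10^5$ FLOPs.
\end{example}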

View File

@ -0,0 +1,203 @@
\chapter{Training}
\section{Gradient descent}
\begin{enumerate}
\item
\marginnote{Gradient descent}
Start from a random set of weights $w$.
\item Compute the gradient $\nabla \mathcal{L}$ of the loss function.
\item Make a small step in the direction of $-\nabla \mathcal{L}(w)$.
\item Go back to step 2 until convergence.
\end{enumerate}
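A minimal one-dimensional illustration (function and step size chosen arbitrarily):
\begin{example}
For $\mathcal{L}(w) = w^2$, the gradient is $\nabla \mathcal{L}(w) = 2w$. Starting from $w = 1$ with a step of size $0.1$, one iteration gives
\[ w = 1 - 0.1 \cdot (2 \cdot 1) = 0.8 \]
and repeating the update moves $w$ towards the minimum at $w = 0$.
\end{example}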
\begin{description}
\item[Learning rate] \marginnote{Learning rate}
Size of the step. Usually denoted with $\mu$.
\[ w = w - \mu \nabla \mathcal{L}(w) \]
\item[Optimizer] \marginnote{Optimizer}
Algorithm that tunes the learning rate during training.
\item[Stochastic gradient descent] \marginnote{Stochastic gradient descent}
Use a subset of the training data to compute the gradient.
\begin{description}
\item[Full-batch] Use the entire dataset.
\item[Mini-batch] Use a subset of the training data.
\item[Online] Use a single sample.
\end{description}
\begin{remark}
SGD with mini-batches tends to converge to solutions comparable to those obtained with the full-batch approach.
\end{remark}
\item[Momentum] \marginnote{Momentum}
Correct the update $v_t$ at time $t$ considering the update $v_{t-1}$ of time $t-1$.
\[
\begin{split}
w_{t+1} &= w_t - v_t\\
v_t &= \mu \nabla \mathcal{L}(w_t) + \alpha v_{t-1}
\end{split}
\]
\begin{description}
\item[Nesterov momentum] \marginnote{Nesterov momentum}
Apply the momentum step before computing the gradient (i.e. the gradient is evaluated at the look-ahead point).
\end{description}
\item[Overfitting] \marginnote{Overfitting}
Model too specialized on the training data.
Methods to reduce overfitting are:
\begin{itemize}
\item Increasing the dataset size.
\item Simplifying the model.
\item Early stopping.
\item Regularization.
\item Model averaging.
\item Neurons dropout.
\end{itemize}
\item[Underfitting] \marginnote{Underfitting}
Model too simple and unable to capture features of the training data.
\end{description}
\section{Backpropagation}
\begin{description}
\item[Chain rule] \marginnote{Chain rule}
Refer to \href{\gitSMM{}}{\texttt{SMM for AI (Section 5.1.1)}}.
\item[Backpropagation] \marginnote{Backpropagation}
Algorithm to compute the gradient at each layer of a neural network.
The output of the $i$-th neuron in the layer $l$ of a neural network can be defined as:
\[ a_{l,i} = \sigma_{l,i}( \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i} ) = \sigma_{l,i}(z_{l,i})\]
where:
\begin{itemize}
\item $a_{l,i} \in \mathbb{R}$ is the output of the neuron.
\item $\vec{w}_{l,i} \in \mathbb{R}^{n_{l-1}}$ is the vector of weights.
\item $\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$ is the vector of the outputs of the previous layer.
\item $b_{l,i} \in \mathbb{R}$ is the bias.
\item $\sigma_{l,i}: \mathbb{R} \rightarrow \mathbb{R}$ is the activation function\footnote{Even if it is possible to have a different activation function in each neuron, in practice, each layer has the same activation function.}.
\item $z_{l,i}(\vec{w}_{l,i}, b_{l,i} | \vec{a}_{l-1}) = \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i}$ is the argument of the activation function and is parametrized on $\vec{w}_{l,i}$ and $b_{l,i}$.
\end{itemize}
Hence, the outputs of the $l$-th layer can be defined as:
\[ \vec{a}_l = \sigma_l( \matr{W}_l^T \vec{a}_{l-1} + \vec{b}_l ) = \sigma_l( \vec{z}_l(\matr{W}_l, \vec{b}_l | \vec{a}_{l-1}) ) \]
where:
\begin{itemize}
\item $\sigma_l: \mathbb{R}^{n_l} \rightarrow \mathbb{R}^{n_l}$ is the element-wise activation function.
\item
$\matr{W}_l \in \mathbb{R}^{n_l \times n_{l-1}}$,
$\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$,
$\vec{b}_l \in \mathbb{R}^{n_l}$,
$\vec{a}_l \in \mathbb{R}^{n_l}$.
\end{itemize}
Finally, a neural network with input $\vec{x}$ can be expressed as:
\[
\begin{split}
\vec{a}_0 &= \vec{x} \\
\vec{a}_i &= \sigma_i( \vec{z}_i(\matr{W}_i, \vec{b}_i | \vec{a}_{i-1}) )
\end{split}
\]
Given a neural network with $K$ layers and a loss function $\mathcal{L}$,
we want to compute the derivative of $\mathcal{L}$ w.r.t. the weights of each layer to tune the parameters.
First, we highlight the parameters of each of the functions involved:
\begin{descriptionlist}
\item[Loss] $\mathcal{L}(a_K) = \mathcal{L}(\sigma_K)$ takes as input the output of the network (i.e. the output of the last activation function).
\item[Activation function] $\sigma_i(\vec{z}_i)$ takes as input the value of the neurons at the $i$-th layer.
\item[Neurons] $\vec{z}_i(\matr{W}_i, \vec{b}_i)$ takes as input the weights and biases at the $i$-th layer.
\end{descriptionlist}
Let $\odot$ be the Hadamard product.
By exploiting the chain rule, we can compute the derivatives w.r.t. the weights going backward:
\[
\frac{\partial\mathcal{L}}{\partial\matr{W}_K} =
\frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\matr{W}_K} =
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\sigma_K(\vec{z}_K)} \cdot
\underset{1 \times \mathbb{R}^{n_{K-1}}}{\vec{a}_{K-1}^T}
\in \mathbb{R}^{n_K \times n_{K-1}}
\]
\[
\begin{split}
\frac{\partial\mathcal{L}}{\partial\matr{W}_{K-1}} &=
\frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}}
\frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\matr{W}_{K-1}} \\
&= (
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\sigma_K(\vec{z}_K)}
)^T \cdot
\underset{\mathbb{R}^{n_K} \times \mathbb{R}^{n_{K-1}}}{\matr{W}_K} \odot
\underset{\mathbb{R}^{n_{K-1}} \times 1}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot
\underset{1 \times \mathbb{R}^{n_{K-2}}}{\vec{a}_{K-2}^T}
\in \mathbb{R}^{n_{K-1} \times n_{K-2}}
\end{split}
\]
\[ \vdots \]
In the same way, we can compute the derivatives w.r.t. the biases:
\[
\frac{\partial\mathcal{L}}{\partial\vec{b}_K} =
\frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\vec{b}_K} =
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\sigma_K(\vec{z}_K)} \cdot
1
\in \mathbb{R}^{n_K}
\]
\[
\begin{split}
\frac{\partial\mathcal{L}}{\partial\vec{b}_{K-1}} &=
\frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}}
\frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\vec{b}_{K-1}} \\
&= (
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
\underset{\mathbb{R}^{n_K} \times 1}{\nabla\sigma_K(\vec{z}_K)}
)^T \cdot
\underset{\mathbb{R}^{n_K} \times \mathbb{R}^{n_{K-1}}}{\matr{W}_K} \odot
\underset{\mathbb{R}^{n_{K-1}} \times 1}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot
1
\in \mathbb{R}^{n_{K-1}}
\end{split}
\]
\[ \vdots \]
It can be noticed that many terms are repeated from one layer to another.
By exploiting this, we can store the following intermediate values:
\[
\begin{split}
\delta_K &= \frac{\partial\mathcal{L}}{\partial\vec{z}_K} = \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} =
\nabla\mathcal{L}(\vec{a}_K) \odot \nabla\sigma_K(\vec{z}_K) \\
\delta_l &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l} = \delta_{l+1}^T \cdot \matr{W}_{l+1} \odot \nabla\sigma_l(\vec{z}_l)
\end{split}
\]
and reuse them to compute the derivatives as follows:
\[
\begin{split}
\frac{\partial\mathcal{L}}{\partial\matr{W}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\matr{W}_l} =
\delta_l \cdot \vec{a}_{l-1}^T \\
\frac{\partial\mathcal{L}}{\partial\vec{b}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\vec{b}_l} =
\delta_l \cdot 1
\end{split}
\]
\end{description}
\begin{description}
\item[Vanishing gradient] \marginnote{Vanishing gradient}
As backpropagation consists of a chain of products,
when many of its factors are small (i.e. with magnitude $< 1$), the gradient progressively shrinks while propagating backward,
causing the first layers to learn much slower than the last layers.
\begin{remark}
This is a typical issue of the sigmoid function, whose derivative is at most $0.25$.
ReLU was designed to mitigate this problem.
\end{remark}
\end{description}