Moved DL in year1
1
src/year1/deep-learning/ainotes.cls
Symbolic link
@@ -0,0 +1 @@
../../ainotes.cls
14
src/year1/deep-learning/dl.tex
Normal file
@@ -0,0 +1,14 @@
\documentclass[11pt]{ainotes}

\title{Deep Learning}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}

\begin{document}

\makenotesfront
\input{./sections/_expressivity.tex}
\input{./sections/_training.tex}
\input{./sections/_convolutional_nn.tex}

\end{document}
BIN
src/year1/deep-learning/img/_perceptron_nand.pdf
Normal file
BIN
src/year1/deep-learning/img/_residual_connection.pdf
Normal file
BIN
src/year1/deep-learning/img/_xor.pdf
Normal file
BIN
src/year1/deep-learning/img/autoencoder.png
Normal file
After Width: | Height: | Size: 32 KiB
BIN
src/year1/deep-learning/img/autoencoder_anomaly.png
Normal file
After Width: | Height: | Size: 55 KiB
BIN
src/year1/deep-learning/img/cnn_visualization_ascent.png
Normal file
After Width: | Height: | Size: 1.4 MiB
BIN
src/year1/deep-learning/img/cnn_visualization_generative.png
Normal file
After Width: | Height: | Size: 662 KiB
BIN
src/year1/deep-learning/img/depthwise_separable_convolution.png
Normal file
After Width: | Height: | Size: 24 KiB
BIN
src/year1/deep-learning/img/dilated_convolution.png
Normal file
After Width: | Height: | Size: 95 KiB
BIN
src/year1/deep-learning/img/fooling_evolutionary.png
Normal file
After Width: | Height: | Size: 146 KiB
BIN
src/year1/deep-learning/img/fooling_nn.png
Normal file
After Width: | Height: | Size: 395 KiB
BIN
src/year1/deep-learning/img/inceptionism.png
Normal file
After Width: | Height: | Size: 411 KiB
BIN
src/year1/deep-learning/img/manifold.png
Normal file
After Width: | Height: | Size: 34 KiB
BIN
src/year1/deep-learning/img/style_transfer.png
Normal file
After Width: | Height: | Size: 502 KiB
BIN
src/year1/deep-learning/img/style_transfer_perceptual_loss.png
Normal file
After Width: | Height: | Size: 129 KiB
BIN
src/year1/deep-learning/img/traditional_convolution.png
Normal file
After Width: | Height: | Size: 14 KiB
11
src/year1/deep-learning/metadata.json
Normal file
@@ -0,0 +1,11 @@
{
    "name": "Deep Learning",
    "year": 1,
    "semester": 2,
    "pdfs": [
        {
            "name": null,
            "path": "dl.pdf"
        }
    ]
}
570
src/year1/deep-learning/sections/_convolutional_nn.tex
Normal file
@@ -0,0 +1,570 @@
\chapter{Convolutional neural networks}



\section{Convolutions}

\begin{description}
    \item[Convolutional neuron] \marginnote{Convolutional neuron}
        Neuron influenced by only a subset of the neurons in the previous layer.

    \item[Receptive field] \marginnote{Receptive field}
        Region of the input image that influences a neuron.

    \item[Convolutional layer] \marginnote{Convolutional layer}
        Layer composed of convolutional neurons.
        Neurons in the same convolutional layer share the same weights and work as a convolutional filter.

        \begin{remark}
            The weights of the filters are learned.
        \end{remark}

        A convolutional layer has the following parameters:
        \begin{descriptionlist}
            \item[Kernel size] \marginnote{Kernel size}
                Dimension (i.e. width and height) of the filter.

            \item[Stride] \marginnote{Stride}
                Offset between consecutive filter applications (e.g. a stride $>1$ reduces the size of the output image).

            \item[Padding] \marginnote{Padding}
                Artificial enlargement of the image.

                In practice, there are two modes of padding:
                \begin{descriptionlist}
                    \item[Valid] No padding applied.
                    \item[Same] Apply the padding needed so that, with stride $1$, the output has the same size as the input.
                \end{descriptionlist}

            \item[Depth] \marginnote{Depth}
                Number of different kernels to apply (i.e. it determines the number of channels of the output image).
        \end{descriptionlist}

        The dimension along each axis of the output image is given by:
        \[ \frac{W + P - K}{S} + 1 \]
        where:
        \begin{itemize}
            \item $W$ is the size of the image (width or height).
            \item $P$ is the total padding along that axis (summed over both sides).
            \item $K$ is the kernel size.
            \item $S$ is the stride.
        \end{itemize}

        \begin{remark}
            Unless otherwise specified, a kernel spans all the channels of the input image (with different weights for each channel).
        \end{remark}
\end{description}
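
\begin{example}
    For instance, a $32 \times 32$ image convolved with a $3 \times 3$ kernel, stride $1$ and no padding produces an output of size
    \[ \frac{32 + 0 - 3}{1} + 1 = 30 \]
    along each axis. With \emph{same} padding ($P = 2$), the output size stays $32$.
\end{example}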


\subsection{Parameters}

The number of parameters of a convolutional layer is given by:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} \cdot D_\text{out} + D_\text{out} \]
where:
\begin{itemize}
    \item $K_\text{w}$ is the width of the kernel.
    \item $K_\text{h}$ is the height of the kernel.
    \item $D_\text{in}$ is the input depth.
    \item $D_\text{out}$ is the output depth.
\end{itemize}

Therefore, the number of FLOPs is of order:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} \cdot D_\text{out} \cdot (O_\text{w} \cdot O_\text{h}) \]
where:
\begin{itemize}
    \item $O_\text{w}$ is the width of the output image.
    \item $O_\text{h}$ is the height of the output image.
\end{itemize}
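
\begin{example}
    For instance, a $3 \times 3$ convolution from $D_\text{in} = 64$ to $D_\text{out} = 128$ channels has
    \[ (3 \cdot 3) \cdot 64 \cdot 128 + 128 = 73\,856 \]
    parameters and, on a $32 \times 32$ output, requires on the order of
    \[ (3 \cdot 3) \cdot 64 \cdot 128 \cdot (32 \cdot 32) \approx 75.5 \cdot 10^6 \]
    multiply-accumulate operations.
\end{example}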



\section{Backpropagation}

A convolution can be expressed as a dense layer by representing it through a sparse matrix.

Therefore, backpropagation can be executed in the standard way,
with the only exception that the entries of the convolution matrix corresponding to
the same cell of the kernel must remain tied: they receive the same update, obtained by accumulating the corresponding gradients (i.e. their sum).

\begin{example}
    Given a $4 \times 4$ image $I$ and a $3 \times 3$ kernel $K$ with stride $1$ and no padding:
    \[
        I = \begin{pmatrix} i_{0,0} & i_{0,1} & i_{0,2} & i_{0,3} \\ i_{1,0} & i_{1,1} & i_{1,2} & i_{1,3} \\
                            i_{2,0} & i_{2,1} & i_{2,2} & i_{2,3} \\ i_{3,0} & i_{3,1} & i_{3,2} & i_{3,3}
            \end{pmatrix}
        \hspace{3em}
        K = \begin{pmatrix} w_{0,0} & w_{0,1} & w_{0,2} \\ w_{1,0} & w_{1,1} & w_{1,2} \\ w_{2,0} & w_{2,1} & w_{2,2} \end{pmatrix}
    \]
    The convolutional layer can be represented through a convolution matrix and by flattening the image as follows:
    \[
        \begin{pmatrix}
            w_{0,0} & 0       & 0       & 0       \\
            w_{0,1} & w_{0,0} & 0       & 0       \\
            w_{0,2} & w_{0,1} & 0       & 0       \\
            0       & w_{0,2} & 0       & 0       \\
            w_{1,0} & 0       & w_{0,0} & 0       \\
            w_{1,1} & w_{1,0} & w_{0,1} & w_{0,0} \\
            w_{1,2} & w_{1,1} & w_{0,2} & w_{0,1} \\
            0       & w_{1,2} & 0       & w_{0,2} \\
            w_{2,0} & 0       & w_{1,0} & 0       \\
            w_{2,1} & w_{2,0} & w_{1,1} & w_{1,0} \\
            w_{2,2} & w_{2,1} & w_{1,2} & w_{1,1} \\
            0       & w_{2,2} & 0       & w_{1,2} \\
            0       & 0       & w_{2,0} & 0       \\
            0       & 0       & w_{2,1} & w_{2,0} \\
            0       & 0       & w_{2,2} & w_{2,1} \\
            0       & 0       & 0       & w_{2,2} \\
        \end{pmatrix}^T
        \cdot
        \begin{pmatrix} i_{0,0} \\ i_{0,1} \\ i_{0,2} \\ i_{0,3} \\ i_{1,0} \\ i_{1,1} \\ i_{1,2} \\ i_{1,3} \\
                        i_{2,0} \\ i_{2,1} \\ i_{2,2} \\ i_{2,3} \\ i_{3,0} \\ i_{3,1} \\ i_{3,2} \\ i_{3,3}
        \end{pmatrix}
        =
        \begin{pmatrix} o_{0,0} \\ o_{0,1} \\ o_{1,0} \\ o_{1,1} \end{pmatrix}
        \mapsto
        \begin{pmatrix} o_{0,0} & o_{0,1} \\ o_{1,0} & o_{1,1} \end{pmatrix}
    \]
\end{example}
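
A minimal NumPy sketch of the example above (NumPy is an assumption; it is not used elsewhere in these notes) that builds the convolution matrix and checks it against the direct sliding-window computation:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
I = rng.standard_normal((4, 4))   # input image
K = rng.standard_normal((3, 3))   # kernel

# Convolution matrix: 4 outputs (2x2), 16 inputs (4x4 flattened)
M = np.zeros((4, 16))
for oi in range(2):
    for oj in range(2):
        for ki in range(3):
            for kj in range(3):
                M[oi * 2 + oj, (oi + ki) * 4 + (oj + kj)] = K[ki, kj]

# Direct convolution (stride 1, no padding)
direct = np.array([[(I[i:i + 3, j:j + 3] * K).sum() for j in range(2)]
                   for i in range(2)])

assert np.allclose(M @ I.flatten(), direct.flatten())
\end{verbatim}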



\section{Pooling layer}

\begin{description}
    \item[Pooling]
        Layer that applies a fixed (non-learned) aggregation function over each kernel window.

        \begin{descriptionlist}
            \item[Max-pooling] \marginnote{Max-pooling}
                Filter that computes the maximum of the pixels within the kernel.

            \item[Mean-pooling] \marginnote{Mean-pooling}
                Filter that computes the average of the pixels within the kernel.
        \end{descriptionlist}
\end{description}
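
\begin{example}
    For instance, max-pooling with a $2 \times 2$ kernel and stride $2$:
    \[
        \begin{pmatrix} 1 & 3 & 2 & 0 \\ 5 & 4 & 1 & 1 \\ 0 & 2 & 6 & 3 \\ 1 & 1 & 2 & 4 \end{pmatrix}
        \mapsto
        \begin{pmatrix} 5 & 2 \\ 2 & 6 \end{pmatrix}
    \]
\end{example}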



\section{Inception hypothesis}

\begin{description}
    \item[Depth-wise separable convolution] \marginnote{Depth-wise separable convolution}
        Decompose a 3D convolution into per-channel 2D convolutions (depth-wise) followed by a $1 \times 1$ convolution (point-wise).

        Given an input image with $C_\text{in}$ channels,
        a single pass of a traditional 3D convolution uses a kernel of shape $k \times k \times C_\text{in}$
        to obtain an output of $1$ channel.
        This is repeated $C_\text{out}$ times (with a different kernel each time).
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.65\linewidth]{./img/traditional_convolution.png}
            \caption{Example of traditional convolution}
        \end{figure}

        A single pass of a depth-wise separable convolution first uses $C_\text{in}$ different $k \times k \times 1$ kernels to obtain $C_\text{in}$ images.
        Then, a $1 \times 1 \times C_\text{in}$ kernel is used to obtain an output image of $1$ channel.
        The point-wise convolution is repeated $C_\text{out}$ times (with a different kernel each time).
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.85\linewidth]{./img/depthwise_separable_convolution.png}
            \caption{Example of depth-wise separable convolution}
        \end{figure}
\end{description}


\subsection{Parameters}

The number of parameters of a depth-wise separable convolutional layer is given by:
\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} + (1 \cdot 1 \cdot D_\text{in}) \cdot D_\text{out} \]
where:
\begin{itemize}
    \item $K_\text{w}$ is the width of the kernel.
    \item $K_\text{h}$ is the height of the kernel.
    \item $D_\text{in}$ is the input depth.
    \item $D_\text{out}$ is the output depth.
\end{itemize}
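
\begin{example}
    With a $3 \times 3$ kernel, $D_\text{in} = 64$ and $D_\text{out} = 128$, a depth-wise separable convolution uses
    \[ (3 \cdot 3) \cdot 64 + (1 \cdot 1 \cdot 64) \cdot 128 = 8\,768 \]
    parameters, against the $(3 \cdot 3) \cdot 64 \cdot 128 = 73\,728$ weights of the corresponding traditional convolution.
\end{example}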



\section{Residual learning}

\begin{description}
    \item[Residual connection] \marginnote{Residual connection}
        Sum the input of a layer (or block of layers) to its output.
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\linewidth]{./img/_residual_connection.pdf}
            \caption{Residual connection}
        \end{figure}

        \begin{remark}
            The sum operation can be substituted with a concatenation.
        \end{remark}

        \begin{remark}
            The effectiveness of residual connections has only been shown empirically.
        \end{remark}

        \begin{remark}
            Adding the input without passing it through the activation function
            might help to propagate the gradient from higher layers to lower layers
            and avoid the risk of vanishing gradients.

            Another interpretation is that, by learning the function $F(x) + x$, it is easier for the model to represent, if it needs to, the identity function, as
            the problem is reduced to learning $F(x) = 0$.
            On the other hand, without a residual connection, learning $F(x) = x$ from scratch might be harder.
        \end{remark}
\end{description}
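
A minimal sketch of a residual block (assuming PyTorch, which is not otherwise used in these notes; the layer sizes are arbitrary):
\begin{verbatim}
import torch.nn as nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = self.conv2(out)
        return F.relu(out + x)  # residual connection: sum input and output
\end{verbatim}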



\section{Transfer learning and fine-tuning}

\begin{description}
    \item[Transfer learning] \marginnote{Transfer learning}
        Reuse an existing model by appending some new layers to it.
        Only the new layers are trained.

    \item[Fine-tuning] \marginnote{Fine-tuning}
        Reuse an existing model by appending some new layers to it.
        The existing model (or part of it) is trained alongside the new layers.
\end{description}

\begin{remark}
    In computer vision, reusing an existing model makes sense as
    the first convolutional layers tend to learn primitive concepts that are independent of the downstream task.
\end{remark}




\section{Other types of convolution}

\begin{description}
    \item[Transposed convolution / Deconvolution] \marginnote{Transposed convolution / Deconvolution}
        Convolution to upsample the input (i.e. each pixel is upsampled into a $k \times k$ patch).

        \begin{remark}
            A transposed convolution can be interpreted as a normal convolution with stride $< 1$.
        \end{remark}


    \item[Dilated convolution] \marginnote{Dilated convolution}
        Convolution computed using a kernel applied to non-contiguous pixels (i.e. the kernel entries are spaced by a dilation factor).

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\linewidth]{./img/dilated_convolution.png}
            \caption{Examples of dilated convolutions}
        \end{figure}

        \begin{remark}
            Dilated convolutions allow the enlargement of the receptive field without an excessive number of parameters.
        \end{remark}

        \begin{remark}
            Dilated convolutions are useful in the first layers when processing high-resolution inputs (e.g. in temporal convolutional networks).
        \end{remark}
\end{description}



\section{Normalization layer}

A normalization layer has the empirical effects of:
\begin{itemize}
    \item Stabilizing and possibly speeding up the training phase.
    \item Increasing the independence of each layer (i.e. maintaining a similar magnitude of the weights at each layer).
\end{itemize}

\begin{description}
    \item[Batch normalization] \marginnote{Batch normalization}
        Given an input batch $X$, a batch normalization layer outputs the following:
        \[ \gamma \frac{X - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta \]
        where:
        \begin{itemize}
            \item $\gamma$ and $\beta$ are learned parameters.
            \item $\varepsilon$ is a small constant.
            \item $\mu$ is the mean and $\sigma^2$ is the variance.
                Depending on when the layer is applied, these values change:
                \begin{descriptionlist}
                    \item[Training]
                        $\mu$ and $\sigma^2$ are computed from the input batch $X$.

                    \item[Inference]
                        $\mu$ and $\sigma^2$ are computed from the training data.
                        Usually, they are obtained as the moving averages of the values computed from the batches during training.
                \end{descriptionlist}
        \end{itemize}
\end{description}
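
A minimal NumPy sketch of a batch normalization forward pass at training time (NumPy and the per-feature layout are assumptions; $X$ has shape $\text{batch} \times \text{features}$):
\begin{verbatim}
import numpy as np

def batch_norm_train(X, gamma, beta, running_mean, running_var,
                     momentum=0.1, eps=1e-5):
    mu = X.mean(axis=0)               # batch mean (per feature)
    var = X.var(axis=0)               # batch variance (per feature)
    out = gamma * (X - mu) / np.sqrt(var + eps) + beta
    # Moving averages, reused in place of mu/var at inference time
    running_mean = (1 - momentum) * running_mean + momentum * mu
    running_var = (1 - momentum) * running_var + momentum * var
    return out, running_mean, running_var
\end{verbatim}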



\section{Gradient ascent}


\subsection{Hidden layer visualization}
\marginnote{Hidden layer visualization}

Visualize what type of input features activate a neuron.

\begin{description}
    \item[Image ascent approach]
        During training, the loss function of a neural network $\mathcal{L}(\vec{x}; \vec{\theta})$ is
        parametrized on the weights $\vec{\theta}$ while the input $\vec{x}$ is fixed.

        To visualize the patterns that activate a (convolutional) neuron, it is possible to invert the optimization process
        by fixing the parameters $\vec{\theta}$ and optimizing an image $\vec{x}$ so that the loss function becomes $\mathcal{L}(\vec{\theta}; \vec{x})$.
        The process works as follows (a sketch is shown after this list):
        \begin{enumerate}
            \item Start with a random image $\vec{x}$.
            \item Do a forward pass with $\vec{x}$ as input and keep track of the activation $a_i(\vec{x})$ of the neuron(s) of interest.
            \item Do a backward pass to compute the gradient $\frac{\partial a_i(\vec{x})}{\partial x_{h,w}}$ (i.e. chain rule) for each pixel $(h, w)$ of the image.
            \item Update the image as $\vec{x} = \vec{x} + \eta \frac{\partial a_i(\vec{x})}{\partial \vec{x}}$.
            \item Repeat until the activation $a_i(\vec{x})$ is high enough.
        \end{enumerate}
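
        A minimal sketch of the loop above (assuming PyTorch; \verb|activation_of_interest| is a hypothetical helper returning the scalar activation $a_i(\vec{x})$ of the chosen neuron):
\begin{verbatim}
import torch

def visualize_neuron(model, activation_of_interest, steps=200, lr=0.1):
    x = torch.randn(1, 3, 224, 224, requires_grad=True)  # random image
    for _ in range(steps):
        a = activation_of_interest(model, x)  # forward pass
        model.zero_grad()
        a.backward()                          # gradient of a_i w.r.t. x
        with torch.no_grad():
            x += lr * x.grad                  # gradient ascent step
            x.grad.zero_()
    return x.detach()
\end{verbatim}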

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\linewidth]{./img/cnn_visualization_ascent.png}
            \caption{Example of the image ascent visualization approach}
        \end{figure}

    \item[Generative approach]
        Starting from an image $\hat{\vec{x}}$ that makes a specific layer $l$ output $\Theta_l(\hat{\vec{x}})$,
        generate another image $\vec{x}$ that makes the same layer $l$ output a similar value $\Theta_l(\vec{x}) \approx \Theta_l(\hat{\vec{x}})$
        (i.e. the layer cannot distinguish between $\vec{x}$ and $\hat{\vec{x}}$).

        For a fixed $\hat{\vec{x}}$, this can be solved as an optimization problem:
        \[ \arg\min_{\vec{x}} \Big\{ d\big( \Theta_l(\vec{x}), \Theta_l(\hat{\vec{x}}) \big) + \lambda \mathcal{R}(\vec{x}) \Big\} \]
        where $d$ is a loss function that measures the distance between the two representations and
        $\mathcal{R}$ is a regularizer.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.6\linewidth]{./img/cnn_visualization_generative.png}
            \caption{Example of generative visualization approach}
        \end{figure}
\end{description}



\subsection{Inceptionism}
\marginnote{Inceptionism}

Employ the same techniques used for hidden layer visualization to create psychedelic and abstract images.

\begin{description}
    \item[Deep dream] \marginnote{Deep dream}
        Iteratively apply gradient ascent on an image:
        \begin{enumerate}
            \item Train a neural network for image classification.
            \item Repeatedly modify an input image using gradient ascent to improve the activation of a specific neuron.
        \end{enumerate}

        After enough iterations, the features that the target neuron learned to recognize during training are injected into the input image,
        even if that image does not contain that specific feature.

        \begin{remark}
            Strong regularizers are used to prioritize features that statistically resemble real images.
        \end{remark}

    \item[Content enhancing] \marginnote{Content enhancing}
        Same as above, but instead of selecting a single neuron, an entire layer is selected and the input image is injected with whatever that layer detects.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.55\linewidth]{./img/inceptionism.png}
            \caption{Example of deep dream images}
        \end{figure}
\end{description}


\subsection{Style transfer}
\marginnote{Style transfer}

Mimic the style of an image and transfer it to the content of another one.

\begin{description}
    \item[Internal representation approach]
        Given a convolutional neural network pretrained for classification, the method can be divided into two parts:
        \begin{descriptionlist}
            \item[Content reconstruction]
                Given an image $\hat{\vec{x}}$, consider the output of the $l$-th layer of the network.
                Its internal representation of the image has $C^l$ distinct channels (depending on the number of kernels),
                each with $M^l = W^l \cdot H^l$ elements (when flattened).

                The representation (feature map) of the $l$-th layer can therefore be denoted as $F^l \in \mathbb{R}^{C^l \times M^l}$
                and $F^l_{c, k}$ is used to denote the activation of the $c$-th filter applied at position $k$ of the $l$-th layer.

                As higher layers of a CNN capture high-level features, one of the higher layers is selected and its feature map is used as the content representation.

                Given a content representation $\mathcal{C} = \hat{F}^l$ of $\hat{\vec{x}}$, chosen as the feature map at the $l$-th layer,
                it is possible to reconstruct the original image $\hat{\vec{x}}$ starting from a random one $\vec{x}$ by minimizing the loss:
                \[ \mathcal{L}_\text{content}(\hat{\vec{x}}, \vec{x}, l) = \sum_{c, i} (F^l_{c, i} - \mathcal{C}_{c, i})^2 \]
                where $F^l$ is the feature representation of the random image $\vec{x}$.

            \item[Style reconstruction]
                Given an image $\hat{\vec{y}}$ and its feature maps $F^l$ for $l \in \{1, \dots, L\}$,
                the style is captured, at each layer $l$, by the Gram matrix $G^l \in \mathbb{R}^{C^l \times C^l}$ obtained as the dot product between pairs of channels
                (i.e. the correlation between features extracted by different kernels):
                \[ G^l_{c_1, c_2} = F^l_{c_1} \odot F^l_{c_2} = \sum_{k} (F^l_{c_1, k} \cdot F^l_{c_2, k}) \]

                The Gram matrices of all the layers are taken as the style representation.

                Given the style representation $\mathcal{S}^1, \dots, \mathcal{S}^L$ of $\hat{\vec{y}}$,
                it is possible to reconstruct the style of the original image $\hat{\vec{y}}$ starting from a random image $\vec{y}$ by minimizing the loss:
                \[ \mathcal{L}_\text{style}(\hat{\vec{y}}, \vec{y}) = \sum_{l=1}^{L} \gamma_l \left(\sum_{i,j} (G^l_{i, j} - \mathcal{S}^l_{i,j})^2 \right) \]
                where $\gamma_l$ is a weight assigned to each layer and $G^l$ is the $l$-th Gram matrix of the random image $\vec{y}$.
        \end{descriptionlist}

        Put together, given:
        \begin{itemize}
            \item An image $\hat{\vec{x}}$ from which the content has to be copied.
            \item An image $\hat{\vec{y}}$ from which the style has to be copied.
            \item The content representation $\mathcal{C}$ of $\hat{\vec{x}}$.
            \item The style representation $\mathcal{S}^1, \dots, \mathcal{S}^L$ of $\hat{\vec{y}}$.
        \end{itemize}
        a new random image $\vec{o}$ is optimized by minimizing the loss:
        \[ \mathcal{L}_\text{total} = \alpha \mathcal{L}_\text{content}(\hat{\vec{x}}, \vec{o}, l) + \beta \mathcal{L}_\text{style}(\hat{\vec{y}}, \vec{o}) \]
        where $\alpha$ and $\beta$ are hyperparameters.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.95\linewidth]{./img/style_transfer.png}
            \caption{Internal representation style transfer workflow}
        \end{figure}

    \item[Perceptual loss approach]
        A CNN pretrained for classification is used as a loss network to compute perceptual loss functions
        that measure the difference in style and content between images.
        The representations of style and content are extracted in a similar way as above.

        The loss network is then kept fixed and an image transformation network is trained to transform its input $\vec{x}$
        into an image $\vec{y}$ that is compliant with (i.e. minimizes the perceptual losses w.r.t.) a given style image $\vec{y}_s$ and a content image $\vec{y}_c$
        (if the goal is to keep the content of the input, then $\vec{y}_c = \vec{x}$).

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.55\linewidth]{./img/style_transfer_perceptual_loss.png}
            \caption{Perceptual loss style transfer workflow}
        \end{figure}
\end{description}
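
A minimal sketch of the internal representation approach (assuming PyTorch; \verb|extract_content| and \verb|extract_grams| are hypothetical helpers returning the feature map $F^l$ and the Gram matrices $G^1, \dots, G^L$ of an image):
\begin{verbatim}
import torch

def style_transfer(content_repr, style_reprs, extract_content, extract_grams,
                   alpha=1.0, beta=1e3, steps=500, lr=0.01):
    o = torch.rand(1, 3, 256, 256, requires_grad=True)  # random image to fit
    opt = torch.optim.Adam([o], lr=lr)
    for _ in range(steps):
        loss_content = ((extract_content(o) - content_repr) ** 2).sum()
        loss_style = sum(((G - S) ** 2).sum()
                         for G, S in zip(extract_grams(o), style_reprs))
        loss = alpha * loss_content + beta * loss_style
        opt.zero_grad()
        loss.backward()
        opt.step()
    return o.detach()
\end{verbatim}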



\section{Data manifold}


\subsection{Adversarial attacks}
\marginnote{Adversarial attacks}

Hijack a neural network classifier to forcefully predict a given class.

\begin{description}
    \item[Gradient ascent approach]
        White-box technique that uses gradient ascent to compute an image that the network classifies as the wanted class.

        Let:
        \begin{itemize}
            \item $\vec{x}$ be the input image.
            \item $f(\vec{x})$ the probability distribution that the network outputs.
            \item $c$ the wanted class.
            \item $p$ the wanted probability distribution (i.e. $p_c = 1$ and $p_i = 0$ elsewhere).
            \item $\mathcal{L}$ the loss function.
        \end{itemize}

        By iteratively updating the input image with the gradient of the loss function $\frac{\partial\mathcal{L}(f(\vec{x}), p)}{\partial\vec{x}}$
        computed w.r.t. $\vec{x}$,
        after enough iterations, the classifier will classify the updated $\vec{x}$ as $c$.

        \begin{remark}
            The updates computed from the gradient of the loss function are usually imperceptible.
        \end{remark}

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.25\linewidth]{./img/fooling_nn.png}
            \caption{Examples of hijacked classifications}
        \end{figure}

    \item[Evolutionary approach]
        Black-box technique that uses an evolutionary algorithm to generate images that the network classifies as the wanted class.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.8\linewidth]{./img/fooling_evolutionary.png}
            \caption{Workflow for evolutionary-based attacks}
        \end{figure}
\end{description}
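
A minimal sketch of the gradient-based attack (assuming PyTorch; the loss is the cross-entropy between $f(\vec{x})$ and the wanted distribution $p$, so the image is moved in the direction that decreases it):
\begin{verbatim}
import torch
import torch.nn.functional as F

def targeted_attack(model, x, target_class, steps=50, lr=1e-2):
    x_adv = x.clone().requires_grad_(True)
    target = torch.tensor([target_class])
    for _ in range(steps):
        loss = F.cross_entropy(model(x_adv), target)  # distance from p
        model.zero_grad()
        loss.backward()
        with torch.no_grad():
            x_adv -= lr * x_adv.grad  # step towards the wanted class c
            x_adv.grad.zero_()
    return x_adv.detach()
\end{verbatim}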


\subsection{Manifold}

\begin{description}
    \item[Manifold] \marginnote{Manifold}
        Area of the feature space that represents ``natural'' images (i.e. images with a meaning and without artificial noise).

        This area is usually organized along a smooth surface which occupies a minimal portion of the entire space of all the possible images.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.35\linewidth]{./img/manifold.png}
            \caption{Example of manifold in two dimensions}
        \end{figure}
\end{description}

\begin{remark}
    As one cannot know where the classifier draws the boundaries,
    a tiny change in the data might cause a misclassification.

    Adversarial attacks also exploit this to cause misclassifications.
\end{remark}

\begin{remark}
    Inceptionism aims to modify the data while remaining in the manifold.
\end{remark}


\subsection{Autoencoders}
\marginnote{Autoencoder}

Network composed of two components:
\begin{descriptionlist}
    \item[Encoder]
        Projects the input into an internal representation of lower dimensionality.

    \item[Decoder]
        Reconstructs the input from its internal representation.
\end{descriptionlist}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.5\linewidth]{./img/autoencoder.png}
    \caption{Autoencoder structure}
\end{figure}

An autoencoder has the following properties:
\begin{descriptionlist}
    \item[Data-specific] It only works well on data with strong correlations (i.e. with regularities in the feature space) similar to those seen during training.
    \item[Lossy] By passing through the internal representation, the reconstruction of the input is nearly always degraded.
    \item[Self-supervised] Training happens directly on unlabelled data.
\end{descriptionlist}

Applications of autoencoders are:
\begin{descriptionlist}
    \item[Denoising]
        Train the autoencoder to reconstruct noiseless data.
        Given an image, the input is a noisy version of it, while the output is expected to be similar to the original image.

    \item[Anomaly detection]
        As autoencoders are data-specific, they will perform poorly on data different from those used for training.

        This makes it possible to detect anomalies by comparing the quality of the reconstruction.
        If the input is substantially different from the training data (or has been attacked with an artificial manipulation),
        the reconstructed output is expected to have poor quality.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.5\linewidth]{./img/autoencoder_anomaly.png}
            \caption{Example of anomaly detection}
        \end{figure}
\end{descriptionlist}
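
A minimal sketch of anomaly detection via reconstruction error (assuming NumPy arrays or similar; \verb|autoencoder| is assumed to map an input to its reconstruction):
\begin{verbatim}
def anomaly_score(autoencoder, x):
    x_rec = autoencoder(x)              # encode and decode the input
    return ((x - x_rec) ** 2).mean()    # high error suggests an anomaly
\end{verbatim}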

72
src/year1/deep-learning/sections/_expressivity.tex
Normal file
@@ -0,0 +1,72 @@
\chapter{Neural networks expressivity}




\section{Perceptron}

Single neuron that defines a binary threshold through a hyperplane:
\[
    f(\vec{x}) =
    \begin{cases}
        1 & \sum_{i} w_i x_i + b \geq 0 \\
        0 & \text{otherwise}
    \end{cases}
\]

\begin{description}
    \item[Expressivity] \marginnote{Perceptron expressivity}
        A perceptron can represent a NAND gate but not an XOR gate.
        \begin{center}
            \begin{minipage}{.2\textwidth}
                \centering
                \includegraphics[width=\textwidth]{img/_perceptron_nand.pdf}
                \tiny NAND
            \end{minipage}
            \begin{minipage}{.2\textwidth}
                \centering
                \includegraphics[width=\textwidth]{img/_xor.pdf}
                \tiny XOR
            \end{minipage}
        \end{center}

        \begin{remark}
            Even though NAND is functionally complete, a single perceptron is, by definition, not a composition of perceptrons, so it cannot represent XOR.
        \end{remark}
\end{description}
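
\begin{example}
    A perceptron with weights $w_1 = w_2 = -2$ and bias $b = 3$ implements a NAND gate:
    \[
        \begin{array}{cc|c|c}
            x_1 & x_2 & \sum_i w_i x_i + b & \text{output} \\
            \hline
            0 & 0 & 3  & 1 \\
            0 & 1 & 1  & 1 \\
            1 & 0 & 1  & 1 \\
            1 & 1 & -1 & 0 \\
        \end{array}
    \]
\end{example}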



\section{Multi-layer perceptron}

Composition of perceptrons.

\begin{descriptionlist}
    \item[Shallow neural network] \marginnote{Shallow NN}
        Neural network with one hidden layer.

    \item[Deep neural network] \marginnote{Deep NN}
        Neural network with more than one hidden layer.
\end{descriptionlist}

\begin{description}
    \item[Expressivity] \marginnote{Multi-layer perceptron expressivity}
        Shallow neural networks can approximate arbitrarily well any continuous function
        \[ f: \mathbb{R} \rightarrow [0, 1] \]

        \begin{remark}
            Still, deep neural networks allow the use of fewer neural units.
        \end{remark}
\end{description}


\subsection{Parameters}

The number of parameters of a dense layer is given by:
\[ S_\text{in} \cdot S_\text{out} + S_\text{out} \]
where:
\begin{itemize}
    \item $S_\text{in}$ is the dimension of the input of the layer.
    \item $S_\text{out}$ is the dimension of the output of the layer.
\end{itemize}

Therefore, the number of FLOPs is of order:
\[ S_\text{in} \cdot S_\text{out} \]
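
\begin{example}
    For instance, a dense layer with $S_\text{in} = 784$ inputs and $S_\text{out} = 128$ outputs has
    \[ 784 \cdot 128 + 128 = 100\,480 \]
    parameters and requires on the order of $784 \cdot 128 \approx 10^5$ multiply-accumulate operations.
\end{example}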

203
src/year1/deep-learning/sections/_training.tex
Normal file
@@ -0,0 +1,203 @@
\chapter{Training}


\section{Gradient descent}

\begin{enumerate}
    \item
        \marginnote{Gradient descent}
        Start from a random set of weights $w$.
    \item Compute the gradient $\nabla \mathcal{L}$ of the loss function.
    \item Make a small step in the direction $-\nabla \mathcal{L}(w)$.
    \item Go to 2., until convergence.
\end{enumerate}

\begin{description}
    \item[Learning rate] \marginnote{Learning rate}
        Size of the step. Usually denoted with $\mu$.
        \[ w = w - \mu \nabla \mathcal{L}(w) \]

    \item[Optimizer] \marginnote{Optimizer}
        Algorithm that tunes the learning rate (and more generally the update rule) during training.

    \item[Stochastic gradient descent] \marginnote{Stochastic gradient descent}
        Use a subset of the training data to compute the gradient.
        \begin{description}
            \item[Full-batch] Use the entire dataset.
            \item[Mini-batch] Use a subset of the training data.
            \item[Online] Use a single sample.
        \end{description}

        \begin{remark}
            SGD with mini-batches tends to converge to results comparable to those obtained with a full-batch approach.
        \end{remark}

    \item[Momentum] \marginnote{Momentum}
        Correct the update $v_t$ at time $t$ considering the update $v_{t-1}$ of time $t-1$.
        \[
            \begin{split}
                w_{t+1} &= w_t + v_t\\
                v_t &= \alpha v_{t-1} - \mu \nabla \mathcal{L}(w_t)
            \end{split}
        \]

        \begin{description}
            \item[Nesterov momentum] \marginnote{Nesterov momentum}
                Apply the momentum before computing the gradient (i.e. the gradient is evaluated at $w_t + \alpha v_{t-1}$).
        \end{description}

    \item[Overfitting] \marginnote{Overfitting}
        Model too specialized on the training data.

        Methods to reduce overfitting are:
        \begin{itemize}
            \item Increasing the dataset size.
            \item Simplifying the model.
            \item Early stopping.
            \item Regularization.
            \item Model averaging.
            \item Neuron dropout.
        \end{itemize}

    \item[Underfitting] \marginnote{Underfitting}
        Model too simple and unable to capture the features of the training data.
\end{description}
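
A minimal sketch of mini-batch SGD with momentum (assuming NumPy; \verb|grad_fn(w, batch)| is a hypothetical function returning the gradient of the loss on a batch):
\begin{verbatim}
import numpy as np

def sgd_momentum(w, grad_fn, data, lr=0.01, alpha=0.9,
                 epochs=10, batch_size=32):
    v = np.zeros_like(w)
    for _ in range(epochs):
        np.random.shuffle(data)
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            v = alpha * v - lr * grad_fn(w, batch)  # momentum update
            w = w + v                               # descent step
    return w
\end{verbatim}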



\section{Backpropagation}

\begin{description}
    \item[Chain rule] \marginnote{Chain rule}
        Refer to \href{\gitSMM{}}{\texttt{SMM for AI (Section 5.1.1)}}.

    \item[Backpropagation] \marginnote{Backpropagation}
        Algorithm to compute the gradient at each layer of a neural network.

        The output of the $i$-th neuron in the layer $l$ of a neural network can be defined as:
        \[ a_{l,i} = \sigma_{l,i}( \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i} ) = \sigma_{l,i}(z_{l,i}) \]
        where:
        \begin{itemize}
            \item $a_{l,i} \in \mathbb{R}$ is the output of the neuron.
            \item $\vec{w}_{l,i} \in \mathbb{R}^{n_{l-1}}$ is the vector of weights.
            \item $\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$ is the vector of the outputs of the previous layer.
            \item $b_{l,i} \in \mathbb{R}$ is the bias.
            \item $\sigma_{l,i}: \mathbb{R} \rightarrow \mathbb{R}$ is the activation function\footnote{Even if it is possible to have a different activation function in each neuron, in practice, each layer has the same activation function.}.
            \item $z_{l,i}(\vec{w}_{l,i}, b_{l,i} | \vec{a}_{l-1}) = \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i}$ is the argument of the activation function and is parametrized on $\vec{w}_{l,i}$ and $b_{l,i}$.
        \end{itemize}

        Hence, the outputs of the $l$-th layer can be defined as:
        \[ \vec{a}_l = \sigma_l( \matr{W}_l \vec{a}_{l-1} + \vec{b}_l ) = \sigma_l( \vec{z}_l(\matr{W}_l, \vec{b}_l | \vec{a}_{l-1}) ) \]
        where:
        \begin{itemize}
            \item $\sigma_l: \mathbb{R}^{n_l} \rightarrow \mathbb{R}^{n_l}$ is the element-wise activation function.
            \item
                $\matr{W}_l \in \mathbb{R}^{n_l \times n_{l-1}}$ (with the weight vectors $\vec{w}_{l,i}^T$ as rows),
                $\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$,
                $\vec{b}_l \in \mathbb{R}^{n_l}$,
                $\vec{a}_l \in \mathbb{R}^{n_l}$.
        \end{itemize}

        Finally, a neural network with input $\vec{x}$ can be expressed as:
        \[
            \begin{split}
                \vec{a}_0 &= \vec{x} \\
                \vec{a}_i &= \sigma_i( \vec{z}_i(\matr{W}_i, \vec{b}_i | \vec{a}_{i-1}) )
            \end{split}
        \]

        Given a neural network with $K$ layers and a loss function $\mathcal{L}$,
        we want to compute the derivative of $\mathcal{L}$ w.r.t. the weights of each layer to tune the parameters.

        First, we highlight the parameters of each of the functions involved:
        \begin{descriptionlist}
            \item[Loss] $\mathcal{L}(a_K) = \mathcal{L}(\sigma_K)$ takes as input the output of the network (i.e. the output of the last activation function).
            \item[Activation function] $\sigma_i(\vec{z}_i)$ takes as input the value of the neurons at the $i$-th layer.
            \item[Neurons] $\vec{z}_i(\matr{W}_i, \vec{b}_i)$ takes as input the weights and biases at the $i$-th layer.
        \end{descriptionlist}

        Let $\odot$ be the Hadamard product.
        By exploiting the chain rule, we can compute the derivatives w.r.t. the weights going backward:
        \[
            \frac{\partial\mathcal{L}}{\partial\matr{W}_K} =
            \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\matr{W}_K} =
            \underset{n_K \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
            \underset{n_K \times 1}{\nabla\sigma_K(\vec{z}_K)} \cdot
            \underset{1 \times n_{K-1}}{\vec{a}_{K-1}^T}
            \in \mathbb{R}^{n_K \times n_{K-1}}
        \]
        \[
            \begin{split}
                \frac{\partial\mathcal{L}}{\partial\matr{W}_{K-1}} &=
                \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}}
                \frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\matr{W}_{K-1}} \\
                &= (
                \underset{n_K \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
                \underset{n_K \times 1}{\nabla\sigma_K(\vec{z}_K)}
                )^T \cdot
                \underset{n_K \times n_{K-1}}{\matr{W}_K} \odot
                \underset{n_{K-1} \times 1}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot
                \underset{1 \times n_{K-2}}{\vec{a}_{K-2}^T}
                \in \mathbb{R}^{n_{K-1} \times n_{K-2}}
            \end{split}
        \]
        \[ \vdots \]
        In the same way, we can compute the derivatives w.r.t. the biases:
        \[
            \frac{\partial\mathcal{L}}{\partial\vec{b}_K} =
            \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\vec{b}_K} =
            \underset{n_K \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
            \underset{n_K \times 1}{\nabla\sigma_K(\vec{z}_K)} \cdot
            1
            \in \mathbb{R}^{n_K}
        \]
        \[
            \begin{split}
                \frac{\partial\mathcal{L}}{\partial\vec{b}_{K-1}} &=
                \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}}
                \frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\vec{b}_{K-1}} \\
                &= (
                \underset{n_K \times 1}{\nabla\mathcal{L}(\vec{a}_K)} \odot
                \underset{n_K \times 1}{\nabla\sigma_K(\vec{z}_K)}
                )^T \cdot
                \underset{n_K \times n_{K-1}}{\matr{W}_K} \odot
                \underset{n_{K-1} \times 1}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot
                1
                \in \mathbb{R}^{n_{K-1}}
            \end{split}
        \]
        \[ \vdots \]

        It can be noticed that many terms are repeated from one layer to another.
        By exploiting this, we can store the following intermediate values:
        \[
            \begin{split}
                \delta_K &= \frac{\partial\mathcal{L}}{\partial\vec{z}_K} = \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} =
                \nabla\mathcal{L}(\vec{a}_K) \odot \nabla\sigma_K(\vec{z}_K) \\
                \delta_l &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l} = \delta_{l+1}^T \cdot \matr{W}_{l+1} \odot \nabla\sigma_l(\vec{z}_l)
            \end{split}
        \]
        and reuse them to compute the derivatives as follows:
        \[
            \begin{split}
                \frac{\partial\mathcal{L}}{\partial\matr{W}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\matr{W}_l} =
                \delta_l \cdot \vec{a}_{l-1}^T \\
                \frac{\partial\mathcal{L}}{\partial\vec{b}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\vec{b}_l} =
                \delta_l \cdot 1
            \end{split}
        \]
\end{description}
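
A minimal NumPy sketch of the recursion above (NumPy is an assumption; \verb|sigmas|/\verb|dsigmas| are the activation functions and their derivatives, \verb|grad_loss| returns $\nabla\mathcal{L}(\vec{a}_K)$; all vectors are column vectors):
\begin{verbatim}
import numpy as np

def backprop(Ws, bs, sigmas, dsigmas, grad_loss, x):
    # Forward pass: store pre-activations z_l and activations a_l
    a, zs, activations = x, [], [x]
    for W, b, sigma in zip(Ws, bs, sigmas):
        z = W @ a + b
        zs.append(z)
        a = sigma(z)
        activations.append(a)

    # Backward pass: delta recursion
    grads_W, grads_b = [None] * len(Ws), [None] * len(Ws)
    delta = grad_loss(activations[-1]) * dsigmas[-1](zs[-1])   # delta_K
    grads_W[-1] = delta @ activations[-2].T
    grads_b[-1] = delta
    for l in range(len(Ws) - 2, -1, -1):
        delta = (Ws[l + 1].T @ delta) * dsigmas[l](zs[l])      # delta_l
        grads_W[l] = delta @ activations[l].T
        grads_b[l] = delta
    return grads_W, grads_b
\end{verbatim}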


\begin{description}
    \item[Vanishing gradient] \marginnote{Vanishing gradient}
        As backpropagation consists of a chain of products,
        when many factors are small (i.e. $< 1$), they gradually shrink the gradient as it is propagated backward,
        causing the first layers to learn much slower than the last layers.

        \begin{remark}
            This is an issue of saturating activations such as the sigmoid function.
            ReLU was designed to mitigate this problem.
        \end{remark}
\end{description}