diff --git a/src/deep-learning/img/_residual_connection.pdf b/src/deep-learning/img/_residual_connection.pdf
new file mode 100644
index 0000000..d088384
Binary files /dev/null and b/src/deep-learning/img/_residual_connection.pdf differ
diff --git a/src/deep-learning/img/depthwise_separable_convolution.png b/src/deep-learning/img/depthwise_separable_convolution.png
new file mode 100644
index 0000000..8dc273e
Binary files /dev/null and b/src/deep-learning/img/depthwise_separable_convolution.png differ
diff --git a/src/deep-learning/img/dilated_convolution.png b/src/deep-learning/img/dilated_convolution.png
new file mode 100644
index 0000000..0fd3677
Binary files /dev/null and b/src/deep-learning/img/dilated_convolution.png differ
diff --git a/src/deep-learning/img/traditional_convolution.png b/src/deep-learning/img/traditional_convolution.png
new file mode 100644
index 0000000..9a0be28
Binary files /dev/null and b/src/deep-learning/img/traditional_convolution.png differ
diff --git a/src/deep-learning/sections/_convolutional_nn.tex b/src/deep-learning/sections/_convolutional_nn.tex
index a06bb1d..3dacfe2 100644
--- a/src/deep-learning/sections/_convolutional_nn.tex
+++ b/src/deep-learning/sections/_convolutional_nn.tex
@@ -1,6 +1,8 @@
 \chapter{Convolutional neural networks}
 
+\section{Convolutions}
+
 \begin{description}
     \item[Convolution neuron] \marginnote{Convolution neuron}
     Neuron influenced by only a subset of neurons in the previous layer.
 
@@ -53,23 +55,9 @@
 \end{description}
 
-\begin{description}
-    \item[Pooling]
-    Layer that applies a function as a filter.
+\subsection{Parameters}
 
-    \begin{descriptionlist}
-        \item[Max-pooling] \marginnote{Max-pooling}
-        Filter that computes the maximum of the pixels within the kernel.
-
-        \item[Mean-pooling] \marginnote{Mean-pooling}
-        Filter that computes the average of the pixels within the kernel.
-    \end{descriptionlist}
-\end{description}
-
-
-\section{Parameters}
-
-The number of parameters of a layer is given by:
+The number of parameters of a convolutional layer is given by:
 \[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} \cdot D_\text{out} + D_\text{out} \]
 where:
 \begin{itemize}
     \item $K_\text{w}$ is the width of the kernel.
@@ -85,4 +73,227 @@ where:
 \begin{itemize}
     \item $O_\text{w}$ is the width of the output image.
     \item $O_\text{h}$ is the height of the output image.
-\end{itemize}
\ No newline at end of file
+\end{itemize}
+
+
+
+\section{Backpropagation}
+
+A convolution can be expressed as a dense layer by representing it through a sparse matrix.
+
+Therefore, backpropagation can be executed in the standard way,
+with the only exception that the entries of the convolution matrix corresponding to
+the same cell of the kernel must be updated with the same value (e.g. the mean of all the corresponding updates).
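+
+As a sketch (with notation introduced here only for illustration): denote the loss by $L$, the convolution matrix by $M$,
+and by $S_{a,b}$ the set of positions of $M$ that share the kernel weight $w_{a,b}$.
+The tied update of $w_{a,b}$ then uses a single aggregated gradient, e.g. the mean
+\[ \frac{1}{\vert S_{a,b} \vert} \sum_{(i,j) \in S_{a,b}} \frac{\partial L}{\partial M_{i,j}} \]
+(summing the contributions is an equally common convention).
+The example below shows the structure of such a convolution matrix.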
+
+\begin{example}
+    Given a $4 \times 4$ image $I$ and a $3 \times 3$ kernel $K$ with stride $1$ and no padding:
+    \[
+        I = \begin{pmatrix} i_{0,0} & i_{0,1} & i_{0,2} & i_{0,3} \\ i_{1,0} & i_{1,1} & i_{1,2} & i_{1,3} \\
+                            i_{2,0} & i_{2,1} & i_{2,2} & i_{2,3} \\ i_{3,0} & i_{3,1} & i_{3,2} & i_{3,3}
+            \end{pmatrix}
+        \hspace{3em}
+        K = \begin{pmatrix} w_{0,0} & w_{0,1} & w_{0,2} \\ w_{1,0} & w_{1,1} & w_{1,2} \\ w_{2,0} & w_{2,1} & w_{2,2} \end{pmatrix}
+    \]
+    The convolutional layer can be represented through a convolution matrix applied to the flattened image as follows:
+    \[
+        \begin{pmatrix}
+            w_{0,0} & 0       & 0       & 0       \\
+            w_{0,1} & w_{0,0} & 0       & 0       \\
+            w_{0,2} & w_{0,1} & 0       & 0       \\
+            0       & w_{0,2} & 0       & 0       \\
+            w_{1,0} & 0       & w_{0,0} & 0       \\
+            w_{1,1} & w_{1,0} & w_{0,1} & w_{0,0} \\
+            w_{1,2} & w_{1,1} & w_{0,2} & w_{0,1} \\
+            0       & w_{1,2} & 0       & w_{0,2} \\
+            w_{2,0} & 0       & w_{1,0} & 0       \\
+            w_{2,1} & w_{2,0} & w_{1,1} & w_{1,0} \\
+            w_{2,2} & w_{2,1} & w_{1,2} & w_{1,1} \\
+            0       & w_{2,2} & 0       & w_{1,2} \\
+            0       & 0       & w_{2,0} & 0       \\
+            0       & 0       & w_{2,1} & w_{2,0} \\
+            0       & 0       & w_{2,2} & w_{2,1} \\
+            0       & 0       & 0       & w_{2,2} \\
+        \end{pmatrix}^T
+        \cdot
+        \begin{pmatrix} i_{0,0} \\ i_{0,1} \\ i_{0,2} \\ i_{0,3} \\ i_{1,0} \\ i_{1,1} \\ i_{1,2} \\ i_{1,3} \\
+                        i_{2,0} \\ i_{2,1} \\ i_{2,2} \\ i_{2,3} \\ i_{3,0} \\ i_{3,1} \\ i_{3,2} \\ i_{3,3}
+        \end{pmatrix}
+        =
+        \begin{pmatrix} o_{0,0} \\ o_{0,1} \\ o_{1,0} \\ o_{1,1} \end{pmatrix}
+        \mapsto
+        \begin{pmatrix} o_{0,0} & o_{0,1} \\ o_{1,0} & o_{1,1} \end{pmatrix}
+    \]
+\end{example}
+
+
+
+\section{Pooling layer}
+
+\begin{description}
+    \item[Pooling]
+    Layer that applies a function as a filter.
+
+    \begin{descriptionlist}
+        \item[Max-pooling] \marginnote{Max-pooling}
+        Filter that computes the maximum of the pixels within the kernel.
+
+        \item[Mean-pooling] \marginnote{Mean-pooling}
+        Filter that computes the average of the pixels within the kernel.
+    \end{descriptionlist}
+\end{description}
+
+
+
+\section{Inception hypothesis}
+
+\begin{description}
+    \item[Depth-wise separable convolution] \marginnote{Depth-wise separable convolution}
+    Decompose a 3D kernel into a 2D (depth-wise) kernel followed by a 1D (point-wise) kernel.
+
+    Given an input image with $C_\text{in}$ channels,
+    a single pass of a traditional 3D convolution uses a kernel of shape $k \times k \times C_\text{in}$
+    to obtain an output with $1$ channel.
+    This is repeated $C_\text{out}$ times (with different kernels) to obtain the desired number of output channels.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.65\linewidth]{./img/traditional_convolution.png}
+        \caption{Example of traditional convolution}
+    \end{figure}
+
+    A single pass of a depth-wise separable convolution first uses $C_\text{in}$ different $k \times k \times 1$ kernels to obtain $C_\text{in}$ images.
+    Then, a $1 \times 1 \times C_\text{in}$ kernel is used to obtain an output image with $1$ channel.
+    This last point-wise kernel is repeated $C_\text{out}$ times (with different kernels).
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/depthwise_separable_convolution.png}
+        \caption{Example of depth-wise separable convolution}
+    \end{figure}
+\end{description}
+
+
+\subsection{Parameters}
+
+The number of parameters of a depth-wise separable convolutional layer is given by:
+\[ (K_\text{w} \cdot K_\text{h}) \cdot D_\text{in} + (1 \cdot 1 \cdot D_\text{in}) \cdot D_\text{out} \]
+where:
+\begin{itemize}
+    \item $K_\text{w}$ is the width of the kernel.
+    \item $K_\text{h}$ is the height of the kernel.
+    \item $D_\text{in}$ is the input depth.
+    \item $D_\text{out}$ is the output depth.
+\end{itemize}
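+
+\begin{example}
+    As an illustration (the values below are chosen arbitrarily), consider a $3 \times 3$ kernel with $D_\text{in} = 64$ and $D_\text{out} = 128$.
+    Ignoring biases, a traditional convolutional layer requires:
+    \[ (3 \cdot 3) \cdot 64 \cdot 128 = 73728 \text{ parameters} \]
+    while a depth-wise separable convolutional layer only requires:
+    \[ (3 \cdot 3) \cdot 64 + (1 \cdot 1 \cdot 64) \cdot 128 = 576 + 8192 = 8768 \text{ parameters} \]
+    which is roughly $8.4$ times fewer.
+\end{example}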
+
+
+
+\section{Residual learning}
+
+\begin{description}
+    \item[Residual connection] \marginnote{Residual connection}
+    Add the input of a layer to its output.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.5\linewidth]{./img/_residual_connection.pdf}
+        \caption{Residual connection}
+    \end{figure}
+
+    \begin{remark}
+        The sum operation can be substituted with a concatenation.
+    \end{remark}
+
+    \begin{remark}
+        The effectiveness of residual connections has only been shown empirically.
+    \end{remark}
+
+    \begin{remark}
+        Adding the input without passing it through the activation function
+        might help to propagate the gradient from higher layers to lower layers
+        and avoid the risk of vanishing gradients (a sketch of this argument is given below).
+
+        Another interpretation is that, by learning the function $F(x) + x$, it is easier for the model to represent, if it needs to, the identity function, as
+        the problem is reduced to learning $F(x) = 0$.
+        On the other hand, without a residual connection, learning $F(x) = x$ from scratch might be harder.
+    \end{remark}
+\end{description}
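+
+\begin{remark}
+    A sketch of the gradient argument (denoting the loss by $L$ and the identity matrix by $I$):
+    if a block computes $y = F(x) + x$, then, by the chain rule,
+    \[ \frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} \left( \frac{\partial F(x)}{\partial x} + I \right) \]
+    so, even when $\frac{\partial F(x)}{\partial x}$ is very small, the identity term still lets the gradient of the higher layers reach the lower ones unchanged.
+\end{remark}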
+
+
+
+\section{Transfer learning and fine-tuning}
+
+\begin{description}
+    \item[Transfer learning] \marginnote{Transfer learning}
+    Reuse an existing model by appending some new layers to it.
+    Only the new layers are trained.
+
+    \item[Fine-tuning] \marginnote{Fine-tuning}
+    Reuse an existing model by appending some new layers to it.
+    The existing model (or part of it) is trained alongside the new layers.
+\end{description}
+
+\begin{remark}
+    In computer vision, reusing an existing model makes sense, as
+    the first convolutional layers tend to learn primitive concepts that are independent of the downstream task.
+\end{remark}
+
+
+
+\section{Other convolution types}
+
+\begin{description}
+    \item[Transposed convolution / Deconvolution] \marginnote{Transposed convolution / Deconvolution}
+    Convolution used to upsample the input (i.e. each pixel is upsampled into a $k \times k$ patch).
+
+    \begin{remark}
+        A transposed convolution can be interpreted as a normal convolution with stride $< 1$.
+    \end{remark}
+
+
+    \item[Dilated convolution] \marginnote{Dilated convolution}
+    Convolution computed using a kernel that does not consider contiguous pixels (i.e. the sampled pixels are spaced out by a dilation factor).
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.5\linewidth]{./img/dilated_convolution.png}
+        \caption{Examples of dilated convolutions}
+    \end{figure}
+
+    \begin{remark}
+        Dilated convolutions allow the enlargement of the receptive field without an excessive number of parameters.
+    \end{remark}
+
+    \begin{remark}
+        Dilated convolutions are useful in the first layers when processing high-resolution inputs (e.g. temporal convolutional networks).
+    \end{remark}
+\end{description}
+
+
+
+
+\section{Normalization layer}
+
+A normalization layer has the empirical effects of:
+\begin{itemize}
+    \item Stabilizing and possibly speeding up the training phase.
+    \item Increasing the independence of each layer (i.e. maintaining a similar magnitude of the weights at each layer).
+\end{itemize}
+
+\begin{description}
+    \item[Batch normalization] \marginnote{Batch normalization}
+    Given an input batch $X$, a batch normalization layer outputs the following:
+    \[ \gamma \frac{X - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta \]
+    where:
+    \begin{itemize}
+        \item $\gamma$ and $\beta$ are learned parameters.
+        \item $\varepsilon$ is a small constant.
+        \item $\mu$ is the mean and $\sigma^2$ is the variance.
+        Depending on the phase in which the layer is used, these values are computed differently:
+        \begin{descriptionlist}
+            \item[Training]
+            $\mu$ and $\sigma^2$ are computed from the input batch $X$.
+
+            \item[Inference]
+            $\mu$ and $\sigma^2$ are estimated from the training data.
+            Usually, they are obtained as a moving average of the values computed from the batches during training.
+        \end{descriptionlist}
+    \end{itemize}
+
+    A small numerical example is given below.
+\end{description}
\ No newline at end of file
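+
+\begin{example}
+    A small numerical sketch (the values are chosen arbitrarily, considering a single feature over a batch of four samples and neglecting $\varepsilon$):
+    given the batch $X = (3, 3, 7, 7)$, the statistics are $\mu = 5$ and $\sigma^2 = 4$, hence:
+    \[ \frac{X - \mu}{\sqrt{\sigma^2 + \varepsilon}} \approx (-1, -1, 1, 1) \]
+    Assuming, for instance, that the learned parameters are $\gamma = 2$ and $\beta = 1$, the layer outputs $(-1, -1, 3, 3)$.
+\end{example}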