diff --git a/src/year1/image-processing-and-computer-vision/module2/img/bovw.png b/src/year1/image-processing-and-computer-vision/module2/img/bovw.png
new file mode 100644
index 0000000..b193eb1
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/bovw.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/img/convolution_matrix.png b/src/year1/image-processing-and-computer-vision/module2/img/convolution_matrix.png
new file mode 100644
index 0000000..9c37eb8
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/convolution_matrix.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/img/data_representation_linear.png b/src/year1/image-processing-and-computer-vision/module2/img/data_representation_linear.png
new file mode 100644
index 0000000..c064aa9
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/data_representation_linear.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_1.png b/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_1.png
new file mode 100644
index 0000000..3085486
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_1.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_2.png b/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_2.png
new file mode 100644
index 0000000..43814fa
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/relu_separability_2.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
index 370915d..084a0d5 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
@@ -247,6 +247,14 @@
         \item Larger batches provide a smoother estimation of the gradient and allow to better exploit parallel hardware (below a certain limit, there is no gain in time).
         \item Smaller batches require more iterations to train but might have a regularization effect for better generalization.
     \end{itemize}
+
+    \item[Gradient computation] \marginnote{Gradient computation}
+    Gradients can be computed:
+    \begin{descriptionlist}
+        \item[Numerically] Slow and approximate but easy to implement.
+        \item[Analytically] Using the chain rule.
+        \item[Automatically] Using automatic differentiation (e.g. backpropagation).
+    \end{descriptionlist}
 \end{description}


@@ -272,6 +280,185 @@
 The prediction is obtained as the index of the maximum score.
 Each row of $\matr{W} \in \mathbb{R}^{c \times i}$ is a class template that is cross-correlated with the image to obtain a score.
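+For instance, with made-up illustrative values for a flattened $2 \times 2$ image ($i = 4$) and $c = 3$ classes:
+\[
+    \vec{x} = \begin{pmatrix} 1 \\ 0 \\ 2 \\ 1 \end{pmatrix} \hspace{2em}
+    \matr{W} = \begin{pmatrix} 1 & 0 & 0 & 1 \\ 0 & 1 & 1 & 1 \\ -1 & 0 & 1 & 0 \end{pmatrix} \hspace{2em}
+    \matr{W}\vec{x} = \begin{pmatrix} 2 \\ 3 \\ 1 \end{pmatrix}
+\]
+The second score is the largest, so the prediction is the second class.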
 \end{remark}
-\marginnote{Affine classifier}
-In practice, a linear classifier is actually an affine classifier parametrized on $\theta = (\matr{W} \in \mathbb{R}^{c \times i}, \vec{b} \in \mathbb{R}^{c})$:
-\[ f(\vec{x}; \theta) = \matr{W}\vec{x} + \vec{b} = \texttt{logits} \]
\ No newline at end of file
+\begin{remark}
+    \marginnote{Affine classifier}
+    In practice, a linear classifier is actually an affine classifier parametrized on $\theta = (\matr{W} \in \mathbb{R}^{c \times i}, \vec{b} \in \mathbb{R}^{c})$:
+    \[ f(\vec{x}; \theta) = \matr{W}\vec{x} + \vec{b} = \texttt{logits} \]
+\end{remark}
+
+\begin{remark}
+    Linear classifiers are limited by the expressiveness of the input representation, as raw pixels alone do not provide discriminative features.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.45\linewidth]{./img/data_representation_linear.png}
+        \caption{
+            \parbox[t]{0.6\linewidth}{
+                Example of non-linearly separable data points that become linearly separable in polar coordinates
+            }
+        }
+    \end{figure}
+\end{remark}
+
+
+
+\section{Bag of visual words}
+
+\begin{description}
+    \item[Codeword] \marginnote{Codeword}
+    Visual feature (e.g. an edge with a particular direction) that appears in an image.
+
+    \item[Bag of visual words (BOVW)] \marginnote{Bag of visual words (BOVW)}
+    Encoding of an image into a histogram of codeword frequencies.
+\end{description}
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.6\linewidth]{./img/bovw.png}
+\end{figure}
+
+
+
+\section{Neural networks}
+
+\begin{description}
+    \item[Shallow neural network] \marginnote{Shallow neural network}
+    Two affine transformations with an activation function in between:
+    \[
+        \begin{split}
+            f(\vec{x}; \matr{\theta}) &= \matr{W}_2 \vec{h} + \vec{b}_2 \\
+            &= \matr{W}_2 \phi(\matr{W}_1 \vec{x} + \vec{b}_1) + \vec{b}_2 = \vec{s}
+        \end{split}
+    \]
+    where:
+    \begin{itemize}
+        \item $\matr{\theta} = (\matr{W}_1 \in \mathbb{R}^{h \times i}, \vec{b}_1 \in \mathbb{R}^{h}, \matr{W}_2 \in \mathbb{R}^{c \times h}, \vec{b}_2 \in \mathbb{R}^{c})$
+        are the parameters of the two affine transformations, with an inner representation of size $h$.
+        \item $\phi$ is an activation function.
+        \item $\vec{h}$ and $\vec{s}$ are the activations (hidden representation and scores, respectively).
+    \end{itemize}
+
+    \item[Activation function] \marginnote{Activation function}
+    Function to introduce non-linearity.
+
+    \begin{remark}
+        Without an activation function, a neural network is equivalent to a plain linear transformation.
+    \end{remark}
+
+    Examples of activation functions are:
+    \begin{descriptionlist}
+        \item[Sigmoid]
+        Defined as:
+        \[
+            \sigma(a) = \frac{1}{1+\exp(-a)} \hspace{2em}
+            \frac{\partial \sigma(a)}{\partial a} = \sigma(a) \big( 1-\sigma(a) \big)
+        \]
+        It is subject to the vanishing gradient problem.
+
+        \item[Rectified linear unit (ReLU)]
+        Defined as:
+        \[
+            \texttt{ReLU}(a) = \max\{ 0, a \} \hspace{2em}
+            \frac{\partial \texttt{ReLU}(a)}{\partial a} = \begin{cases}
+                1 & \text{if } a \geq 0 \\
+                0 & \text{otherwise}
+            \end{cases}
+        \]
+        It is subject to the dead neuron problem for negative inputs.
+
+        \item[Leaky ReLU]
+        Defined as:
+        \[
+            \texttt{leaky\_ReLU}(a) = \begin{cases}
+                a & \text{if } a \geq 0 \\
+                0.01a & \text{otherwise}
+            \end{cases} \hspace{2em}
+            \frac{\partial \texttt{leaky\_ReLU}(a)}{\partial a} = \begin{cases}
+                1 & \text{if } a \geq 0 \\
+                0.01 & \text{otherwise}
+            \end{cases}
+        \]
+    \end{descriptionlist}
+
+    \begin{example}[Linear separability]
+        Linear transformations do not change the linear separability of the data points.
+        A non-linear function can make linear separation possible.
+
+        \begin{figure}[H]
+            \centering
+            \begin{subfigure}{0.55\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{./img/relu_separability_1.png}
+            \end{subfigure}
+
+            \begin{subfigure}{0.55\linewidth}
+                \centering
+                \includegraphics[width=\linewidth]{./img/relu_separability_2.png}
+            \end{subfigure}
+        \end{figure}
+    \end{example}
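+
+    \begin{example}[XOR with a hidden ReLU layer]
+        As a concrete sketch with hand-picked weights, consider the XOR problem, which is not linearly separable in the input space.
+        A shallow network with two hidden ReLU units and
+        \[
+            \matr{W}_1 = \begin{pmatrix} 1 & 1 \\ 1 & 1 \end{pmatrix} \hspace{2em}
+            \vec{b}_1 = \begin{pmatrix} 0 \\ -1 \end{pmatrix} \hspace{2em}
+            \matr{W}_2 = \begin{pmatrix} 1 & -2 \end{pmatrix} \hspace{2em}
+            \vec{b}_2 = 0
+        \]
+        maps the inputs to $\vec{h} = \phi(\matr{W}_1 \vec{x} + \vec{b}_1)$ with $\phi = \texttt{ReLU}$:
+        \[
+            (0,0) \mapsto (0,0) \hspace{2em}
+            (0,1) \mapsto (1,0) \hspace{2em}
+            (1,0) \mapsto (1,0) \hspace{2em}
+            (1,1) \mapsto (2,1)
+        \]
+        In the hidden space, the classes become linearly separable: the score $s = \matr{W}_2 \vec{h} + \vec{b}_2 = h_1 - 2 h_2$ is $1$ for $(0,1)$ and $(1,0)$, and $0$ for $(0,0)$ and $(1,1)$.
+    \end{example}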
+
+    \item[Deep neural network] \marginnote{Deep neural network}
+    Multiple layers of linear transformations and activation functions:
+    \[
+        \begin{split}
+            f(\vec{x}; \matr{\theta}) &= \matr{W}_L \vec{h}_{L-1} + \vec{b}_L \\
+            &= \matr{W}_L \phi_{L-1}(\matr{W}_{L-1} \vec{h}_{L-2} + \vec{b}_{L-1}) + \vec{b}_L \\
+            &= \matr{W}_L \phi_{L-1}(\matr{W}_{L-1} \phi_{L-2}(\cdots \phi_{1}(\matr{W}_{1} \vec{x} + \vec{b}_{1}) \cdots) + \vec{b}_{L-1}) + \vec{b}_L = \vec{s}
+        \end{split}
+    \]
+
+    \begin{description}
+        \item[Depth] Number of layers.
+        \item[Width] Number of activations at each layer.
+    \end{description}
+\end{description}



+\section{Convolutional neural networks}
+
+
+\subsection{Image filtering}
+
+Consider the case of vertical edge detection with a $1 \times 2$ difference filter, which produces an output of size $H \times (W-1)$.
+Image filtering can be implemented through:
+\begin{descriptionlist}
+    \item[Fully-connected layer] \marginnote{Image filtering with fully-connected layers}
+    Use an FC layer to transform the image.
+
+    Given an image of size $H \times W$, the layer requires:
+    \begin{itemize}
+        \item $(H \cdot W) \cdot (H \cdot (W-1)) \approx H^2W^2$ parameters (one weight per input pixel for each of the $H \cdot (W-1)$ output pixels).
+        \item $2 (H \cdot W) \cdot (H \cdot (W-1)) \approx 2H^2W^2$ FLOPs (multiplications and additions).
+    \end{itemize}

+    \item[Convolution/Correlation] \marginnote{Image filtering with convolutions}
+    Use a convolution (more precisely, a cross-correlation) to transform the image.
+
+    \begin{remark}
+        Convolutions preserve the spatial structure of the image, have shared parameters, and extract local features.
+    \end{remark}
+
+    Given an image of size $H \times W$, a convolution requires:
+    \begin{itemize}
+        \item $2$ parameters.
+        \item $3 (H \cdot (W-1)) \approx 3HW$ FLOPs ($2$ multiplications and $1$ addition per output pixel).
+    \end{itemize}
+
+    \begin{description}
+        \item[Convolution matrix]
+        A convolution can be expressed as a matrix multiplication such that:
+        \begin{itemize}
+            \item The parameters are shared across rows.
+            \item The resulting matrix is sparse.
+            \item It adapts to varying input sizes.
+            \item It is equivariant to translation (but not w.r.t. rotation and scale).
+        \end{itemize}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.5\linewidth]{./img/convolution_matrix.png}
+        \end{figure}
+    \end{description}
+\end{descriptionlist}
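+
+\begin{example}[Convolution as matrix multiplication]
+    As a small illustration of the convolution matrix above, consider a generic $1 \times 2$ filter with weights $w_1, w_2$ applied to a single image row $\vec{x} \in \mathbb{R}^{4}$.
+    Each output pixel is $y_j = w_1 x_j + w_2 x_{j+1}$, which can be written as:
+    \[
+        \vec{y} = \begin{pmatrix}
+            w_1 & w_2 & 0 & 0 \\
+            0 & w_1 & w_2 & 0 \\
+            0 & 0 & w_1 & w_2
+        \end{pmatrix} \vec{x}
+    \]
+    The matrix is sparse, every row reuses the same two parameters, and shifting the input by one pixel shifts the output by one pixel (translation equivariance).
+\end{example}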