diff --git a/src/year1/image-processing-and-computer-vision/module2/img/01_loss_spam.png b/src/year1/image-processing-and-computer-vision/module2/img/01_loss_spam.png
new file mode 100644
index 0000000..a49404d
Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/01_loss_spam.png differ
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
index 3c00b35..370915d 100644
--- a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
+++ b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex
@@ -103,3 +103,175 @@
 \end{remark}
 
 
+
+\section{Learning}
+
+\begin{description}
+    \item[Learning problem] \marginnote{Learning problem}
+        Find the best model $h^*$ from the hypothesis space $\mathbb{H}$ that minimizes a loss function $\mathcal{L}$:
+        \[ h^* = \arg\min_{h \in \mathbb{H}} \mathcal{L}(h, \matr{D}^\text{train}) \]
+
+        In machine learning, models are usually parametrized. The problem then becomes to find the best set of parameters $\matr{\theta}^*$ from the parameter space $\Theta$:
+        \[ \matr{\theta}^* = \arg\min_{\matr{\theta} \in \Theta} \mathcal{L}(\matr{\theta}, \matr{D}^\text{train}) \]
+\end{description}
+
+
+\subsection{Loss function}
+
+\begin{description}
+    \item[Loss function] \marginnote{Loss function}
+        Easy-to-optimize function that acts as a proxy to measure the goodness of a model.
+
+        The loss computed on a dataset is usually obtained as the average of the losses of the single samples (with $N = \vert \matr{D}^\text{train} \vert$):
+        \[ \mathcal{L}(\matr{\theta}, \matr{D}^\text{train}) = \frac{1}{N} \sum_{i=1}^{N} \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) \]
+
+
+    \item[0-1 loss] \marginnote{0-1 loss}
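+        Loss that counts the misclassifications. Given the class $\hat{y}^{(i)}$ predicted for a sample, it contributes $1$ if it is misclassified and $0$ otherwise:
+        \[ \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) = \begin{cases} 0 & \text{if } \hat{y}^{(i)} = y^{(i)} \\ 1 & \text{otherwise} \end{cases} \]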
+
+        This loss is not ideal as it is insensitive to small (or even large) changes in the parameters.
+        Moreover, it does not tell in which direction the parameters should be modified to reduce the loss.
+
+        \begin{remark}
+            This loss can be minimized using a combinatorial optimization approach but it does not scale well with large datasets.
+        \end{remark}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.3\linewidth]{./img/01_loss_spam.png}
+            \caption{\parbox[t]{0.7\linewidth}{
+                    Example of linear classifier for spam detection. 
+                    Small changes on the boundary line do not change the 0-1 loss. 
+                    The loss itself does not tell which is the best direction to move the line.
+            }}
+        \end{figure}
+
+
+    \item[Root mean square error] \marginnote{Root mean square error}
+        Loss computed as the direct comparison between the prediction and target label:
+        \[ \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) = \Vert f(\vec{x}^{(i)}; \matr{\theta}) - y^{(i)} \Vert_2 \]
+        Note that $y^{(i)}$ might be encoded (e.g. one-hot).
+
+
+    \item[Cross-entropy loss] \marginnote{Cross-entropy loss}
+        Transform the logits of a model into a probability distribution and estimate the parameters through MLE.
+
+        \begin{descriptionlist}
+            \item[Softmax] \marginnote{Softmax}
+                Function that converts its input into a probability distribution.
+                Given the logits $\vec{s} \in \mathbb{R}^{c}$, the score $\vec{s}_j$ of class $j$ is converted into a probability as follows:
+                \[ 
+                    \mathcal{P}_\text{model}(Y = j | X = \vec{x}^{(i)}; \matr{\theta}) = 
+                        \texttt{softmax}_j(\vec{s}) = 
+                        \frac{\exp(\vec{s}_j)}{\sum_{k=1}^{c} \exp(\vec{s}_k)} 
+                \]
+
+                For numerical stability, \texttt{softmax} is usually computed as:
+                \[
+                    \begin{split}
+                        \texttt{softmax}_j(\vec{s} - \max\{ \vec{s} \}) &= \frac{\exp(\vec{s}_j - \max\{ \vec{s} \})}{\sum_{k=1}^{c} \exp(\vec{s}_k - \max\{ \vec{s} \})} \\
+                        &= \frac{\cancel{\exp(- \max\{ \vec{s} \})}\exp(\vec{s}_j)}{\cancel{\exp(- \max\{ \vec{s} \})}\sum_{k=1}^{c} \exp(\vec{s}_k)} = \texttt{softmax}_j(\vec{s})
+                    \end{split}    
+                \]
+            
+            \item[Maximum likelihood estimation] \marginnote{Maximum likelihood estimation}
+                Use MLE to estimate the parameters on the probability distribution outputted by the \texttt{softmax} function:
+                \[
+                    \begin{split}
+                        \matr{\theta}^* &= \arg\max_\matr{\theta} \mathcal{P}_\text{model}(y^{(1)}, \dots, y^{(N)} | \vec{x}^{(1)}, \dots, \vec{x}^{(N)}; \matr{\theta}) \\
+                            &= \arg\max_\matr{\theta} \prod_{i=1}^{N} \mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
+                            &= \arg\max_\matr{\theta} \sum_{i=1}^{N} \log\mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
+                            &= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\log\mathcal{P}_\text{model}(Y = y^{(i)} | X=\vec{x}^{(i)}; \matr{\theta}) \\
+                            &= \arg\min_\matr{\theta} \sum_{i=1}^{N} -\log\left( \frac{\exp(\vec{s}_{y^{(i)}})}{\sum_{k=1}^{c} \exp(\vec{s}_k)} \right) \\
+                            &= \arg\min_\matr{\theta} \sum_{i=1}^{N} \left[ -\log\left( \exp(\vec{s}_{y^{(i)}}) \right) + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right) \right] \\
+                            &= \arg\min_\matr{\theta} \sum_{i=1}^{N} \left[ -\vec{s}_{y^{(i)}} + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right) \right]
+                    \end{split}    
+                \]
+
+                The second term ($\log\left( \sum_{k=1}^{c} \exp(\vec{s}_k)\right)$) is called \texttt{logsumexp} and approximates the max function.
+                Therefore, the loss can be seen as:
+                \[ 
+                    \mathcal{L}\big( \matr{\theta}, (\vec{x}^{(i)}, y^{(i)}) \big) 
+                    = -\vec{s}_{y^{(i)}} + \log\left( \sum_{k=1}^{c} \exp(\vec{s}_k) \right) 
+                    \approx -\vec{s}_{y^{(i)}} + \max\{ \vec{s} \}
+                \]
+        \end{descriptionlist}
+        
+\end{description}
+
+
+\subsection{Gradient descent}
+
+\begin{description}
+    \item[Gradient descent] \marginnote{Gradient descent}
+        An epoch $e$ of gradient descent does the following:
+        \begin{enumerate}
+            \item Classify all training data to obtain the predictions $\hat{y}^{(i)} = f(\vec{x}^{(i)}; \matr{\theta}^{(e-1)})$
+                and the loss $\mathcal{L}(\matr{\theta}^{(e-1)}, \matr{D}^\text{train})$.
+            \item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}} (\matr{\theta}^{(e-1)}, \matr{D}^\text{train})$.
+            \item Update the parameters $\matr{\theta}^{(e)} = \matr{\theta}^{(e-1)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
+        \end{enumerate}
+
+    \item[Stochastic gradient descent] \marginnote{Stochastic gradient descent}
+        Reduce the computational cost of gradient descent by computing the gradient on a single sample at a time.
+        An epoch $e$ of SGD does the following:
+        \begin{enumerate}
+            \item Shuffle the training data $\matr{D}^\text{train}$.
+            \item For $i = 0, \dots, N-1$:
+            \begin{enumerate}
+                \item Classify $\vec{x}^{(i)}$ to obtain the prediction $\hat{y}^{(i)} = f(\vec{x}^{(i)}; \matr{\theta}^{(e*N+i)})$
+                and the loss $\mathcal{L}\big( \matr{\theta}^{(e*N+i)}, (\vec{x}^{(i)}, y^{(i)}) \big)$.
+                \item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}}\big( \matr{\theta}^{(e*N+i)}, (\vec{x}^{(i)}, y^{(i)}) \big)$.
+                \item Update the parameters $\matr{\theta}^{(e*N+i+1)} = \matr{\theta}^{(e*N+i)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
+            \end{enumerate}
+        \end{enumerate}
+
+    \item[SGD with mini-batches] \marginnote{SGD with mini-batches}
+        Increase the update accuracy of SGD by using a mini-batch.
+        An epoch $e$ of SGD with mini-batches of size $B$ does the following:
+        \begin{enumerate}
+            \item Shuffle the training data $\matr{D}^\text{train}$.
+            \item For $u = 0, \dots, U-1$, with $U = \lceil \frac{N}{B} \rceil$:
+            \begin{enumerate}
+                \item Classify the examples $\matr{X}^{(u)} = \{ \vec{x}^{(Bu)}, \dots, \vec{x}^{(B(u+1)-1)} \}$
+                    to obtain the predictions $\hat{Y}^{(u)} = f(\matr{X}^{(u)}; \matr{\theta}^{(e*U+u)})$
+                    and the loss $\mathcal{L}\big( \matr{\theta}^{(e*U+u)}, (\matr{X}^{(u)}, Y^{(u)}) \big)$ computed against the target labels $Y^{(u)} = \{ y^{(Bu)}, \dots, y^{(B(u+1)-1)} \}$.
+                \item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}}\big( \matr{\theta}^{(e*U+u)}, (\matr{X}^{(u)}, Y^{(u)}) \big)$.
+                \item Update the parameters $\matr{\theta}^{(e*U+u+1)} = \matr{\theta}^{(e*U+u)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
+            \end{enumerate}
+        \end{enumerate}
+
+        The following properties generally hold:
+        \begin{itemize}
+            \item Larger batches provide a smoother estimation of the gradient and allow parallel hardware to be better exploited (below a certain limit, there is no gain in time).
+            \item Smaller batches require more iterations to train but might have a regularization effect for better generalization.
+        \end{itemize}
+\end{description}
+
+
+
+\section{Linear classifier}
+\marginnote{Linear classifier}
+
+Determine the class by computing a linear combination of the input.
+
+Given $c$ classes and a flattened image $\vec{x} \in \mathbb{R}^{i}$, a linear classifier $f$ parametrized on $\matr{W} \in \mathbb{R}^{c \times i}$ is defined as:
+\[ f(\vec{x}; \matr{W}) = \matr{W}\vec{x} = \texttt{logits} \]
+where the $\texttt{logits} \in \mathbb{R}^{c}$ vector contains a score for each class.
+
+The prediction is obtained as the index of the maximum score.
+
+\begin{remark}
+    Directly predicting the integer-encoded classes is not ideal as it would impose a (probably) nonexistent semantic ordering
+    (e.g. if $2$ encodes bird and $3$ encodes cat, $2.5$ should not mean half bird and half cat).
+\end{remark}
+
+\begin{remark}
+    Linear classifiers can be seen as a template-matching method.
+    Each row of $\matr{W} \in \mathbb{R}^{c \times i}$ is a class template that is cross-correlated with the image to obtain a score.
+\end{remark}
+
+\marginnote{Affine classifier}
+In practice, a linear classifier is actually an affine classifier parametrized on $\matr{\theta} = (\matr{W} \in \mathbb{R}^{c \times i}, \vec{b} \in \mathbb{R}^{c})$:
+\[ f(\vec{x}; \matr{\theta}) = \matr{W}\vec{x} + \vec{b} = \texttt{logits} \]
\ No newline at end of file