diff --git a/src/machine-learning-and-data-mining/img/ensemble_error.png b/src/machine-learning-and-data-mining/img/ensemble_error.png
new file mode 100644
index 0000000..a93393f
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/ensemble_error.png differ
diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex
index d9d0867..3593190 100644
--- a/src/machine-learning-and-data-mining/main.tex
+++ b/src/machine-learning-and-data-mining/main.tex
@@ -30,5 +30,6 @@
     \input{sections/_crisp.tex}
     \input{sections/_machine_learning.tex}
     \input{sections/_classification.tex}
+    \input{sections/_regression.tex}
 
 \end{document}
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_classification.tex b/src/machine-learning-and-data-mining/sections/_classification.tex
index 36eb7ee..01c8909 100644
--- a/src/machine-learning-and-data-mining/sections/_classification.tex
+++ b/src/machine-learning-and-data-mining/sections/_classification.tex
@@ -794,3 +794,73 @@ Inputs are fed to the network and backpropagation is used to update the weights.
     to predict a new observation, the $k$ most similar entries in the training set are selected and
     the class of the new data is determined as the most frequent class among the $k$ entries.
 \end{description}
+
+
+
+\section{Binary to multi-class classification}
+
+\begin{description}
+    \item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
+    Train a classifier for each possible pair of classes (resulting in $\frac{C(C-1)}{2}$ classifiers for $C$ classes).
+    The class assigned to a new observation is determined through a majority vote among the classifiers.
+
+    \item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
+    Train $C$ classifiers, each specialized in recognizing one specific class as positive and all the others as negative.
+    A new observation is assigned the class whose classifier outputs the highest confidence score.
+\end{description}
+
+
+
+\section{Ensemble methods}
+\marginnote{Ensemble methods}
+Train a set of base classifiers and make predictions by majority vote.
+If the base classifiers have the same error rate $\varepsilon < 0.5$ and make independent errors,
+the overall error of the ensemble model is lower than $\varepsilon$ (it can be derived from a binomial distribution).
+
+\begin{figure}[h]
+    \centering
+    \includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
+    \caption{Relationship between the error of base classifiers and ensemble models}
+\end{figure}
+
+Different strategies can be used to train an ensemble classifier:
+\begin{descriptionlist}
+    \item[Dataset manipulation] Resample the dataset differently for each base classifier:
+    \begin{description}
+        \item[Bagging]
+        Each base classifier is trained on a sample drawn with replacement (with uniform probability) from the training set.
+        \item[Boosting]
+        Iteratively change the distribution of the training data,
+        prioritizing the examples that are difficult to classify.
+        \begin{description}
+            \item[Adaboost] \marginnote{Adaboost}
+            Iteratively train base classifiers on a dataset where the samples
+            misclassified at the previous iteration are given a higher weight.
+        \end{description}
+    \end{description}
+
+    \item[Feature manipulation]
+    Train each base classifier using only a subset of the features.
+
+    \item[Class labels manipulation]
+    Train each base classifier to distinguish between partitions of the class labels.
+    For instance, the class labels can be partitioned into two groups $A_1$ and $A_2$, and
+    the base classifier is trained to assign one of the two groups as label.
+    During inference, when a group is predicted, all the labels within that group receive a vote.
+\end{descriptionlist}
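+
+Concretely, under the assumptions above the ensemble errs only when more than half of the $K$ base classifiers err (take $K$ odd to avoid ties), so its error is a binomial tail probability:
+\[ \varepsilon_\text{ens} = \sum_{k = \lfloor K/2 \rfloor + 1}^{K} \binom{K}{k} \varepsilon^k (1-\varepsilon)^{K-k} \]
+A minimal numerical check in Python (the values of $\varepsilon$ and $K$ are arbitrary choices for illustration):
+\begin{verbatim}
+from math import comb
+
+def ensemble_error(eps: float, n_classifiers: int) -> float:
+    """Probability that a majority vote is wrong, assuming the base
+    classifiers err independently, each with probability eps."""
+    k_min = n_classifiers // 2 + 1  # smallest number of wrong votes that wins the vote
+    return sum(
+        comb(n_classifiers, k) * eps**k * (1 - eps) ** (n_classifiers - k)
+        for k in range(k_min, n_classifiers + 1)
+    )
+
+print(ensemble_error(0.25, 21))  # ~0.006: much lower than the base error 0.25
+print(ensemble_error(0.55, 21))  # with eps > 0.5 the ensemble is worse than a single classifier
+\end{verbatim}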
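+
+The dataset-manipulation strategies are available off the shelf in common libraries.
+A minimal sketch, assuming scikit-learn is available (the synthetic dataset and the hyperparameters are arbitrary choices for illustration; both ensembles use decision trees as base classifiers by default):
+\begin{verbatim}
+from sklearn.datasets import make_classification
+from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+# Synthetic binary classification problem.
+X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+# Baseline: a single decision tree.
+tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
+
+# Bagging: each base tree is trained on a bootstrap sample of the training set.
+bagging = BaggingClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
+
+# Boosting (AdaBoost): misclassified samples receive a higher weight at each round.
+boosting = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
+
+for name, model in (("tree", tree), ("bagging", bagging), ("adaboost", boosting)):
+    print(name, model.score(X_test, y_test))  # accuracy on the held-out test set
+\end{verbatim}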
+
+
+\subsection{Random forests}
+\marginnote{Random forests}
+
+An ensemble of decision trees, each trained on a different random sample of the training set and on a different random subset of the features.
+A prediction is made by aggregating the outputs of the trees (majority vote for classification, average for regression).
+
+\begin{description}
+    \item[Bias] \marginnote{Bias}
+    Error due to the simplifying assumptions of a model: a high bias means the model is too simple to capture the target function (underfitting).
+    \item[Variance] \marginnote{Variance}
+    Amount by which the learned function changes when the model is trained on different training data (i.e. how much the model overfits).
+\end{description}
+
+Random forests aim to reduce the high variance of decision trees.
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_regression.tex b/src/machine-learning-and-data-mining/sections/_regression.tex
new file mode 100644
index 0000000..1b65d57
--- /dev/null
+++ b/src/machine-learning-and-data-mining/sections/_regression.tex
@@ -0,0 +1,56 @@
+\chapter{Regression}
+
+\begin{description}
+    \item[Linear regression] \marginnote{Linear regression}
+    Given:
+    \begin{itemize}
+        \item A dataset $\matr{X}$ of $N$ rows and $D$ features.
+        \item A response vector $\vec{y}$ of $N$ continuous values.
+    \end{itemize}
+    We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
+    \[ \vec{y} \approx \matr{X}\vec{w}^T \]
+
+    \item[Mean squared error] \marginnote{Mean squared error}
+    To find the parameters of linear regression,
+    we minimize the mean squared error as loss function:
+    \[
+        \mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w}^T - \vec{y} \Vert^2
+    \]
+    Its gradient is:
+    \[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w}^T - \vec{y}) \]
+    Setting it to zero, we obtain the normal equations:
+    \[ \matr{X}^T\matr{X}\vec{w}^T = \matr{X}^T\vec{y} \]
+    If $\matr{X}^T\matr{X}$ is invertible, the system has the analytical solution $\vec{w}^T = (\matr{X}^T\matr{X})^{-1}\matr{X}^T\vec{y}$,
+    but computing the inverse is expensive and can be numerically unstable.
+    Numerical methods are therefore often more suited.
+
+    Note that:
+    \begin{itemize}
+        \item MSE is influenced by the magnitude (scale) of the data.
+        \item It measures the fitness of a model in absolute terms.
+        \item It is suited to compare different models on the same data.
+    \end{itemize}
+
+    \item[Coefficient of determination] \marginnote{Coefficient of determination}
+    Given:
+    \begin{itemize}
+        \item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
+        \item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{x}_i\vec{w}^T)^2$, where $\vec{x}_i$ is the $i$-th row of $\matr{X}$.
+        \item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
+    \end{itemize}
+    The coefficient of determination is given by:
+    \[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]
+
+    Intuitively, $\text{R}^2$ compares the model with the baseline that always predicts the mean $y_\text{avg}$ (a horizontal line).
+    When $\text{R}^2 = 1$, the model has a perfect fit.
+    When $\text{R}^2 < 0$, the model fits the data worse than the horizontal line.
+
+    Note that:
+    \begin{itemize}
+        \item $\text{R}^2$ is a standardized (scale-independent) index.
+        \item $\text{R}^2$ tells how much of the variation in the target is explained by the predictor variables.
+        \item $\text{R}^2$ is not suited for non-linear models.
+    \end{itemize}
+
+    \item[Polynomial regression] \marginnote{Polynomial regression}
+    Fit a polynomial instead of a hyperplane.
+    This reduces to linear regression by augmenting the features with their powers (e.g. $x, x^2, \dots, x^p$) and fitting a linear model on the transformed features.
+\end{description}
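+
+As a minimal worked sketch of the above, assuming NumPy (the synthetic data and the true weights below are arbitrary choices for illustration), the parameters can be obtained by solving the normal equations and evaluated with MSE and $\text{R}^2$:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Synthetic data: N observations, D features, known weights plus noise.
+N, D = 200, 3
+X = rng.normal(size=(N, D))
+w_true = np.array([1.5, -2.0, 0.5])
+y = X @ w_true + rng.normal(scale=0.1, size=N)
+
+# Solve the normal equations X^T X w = X^T y.
+# (np.linalg.lstsq is a numerically more stable alternative.)
+w = np.linalg.solve(X.T @ X, X.T @ y)
+
+y_pred = X @ w
+mse = np.mean((y_pred - y) ** 2)  # the loss above uses the sum; it differs only by a factor 1/N
+
+# Coefficient of determination: R^2 = 1 - SS_res / SS_tot.
+ss_res = np.sum((y - y_pred) ** 2)
+ss_tot = np.sum((y - y.mean()) ** 2)
+r2 = 1 - ss_res / ss_tot
+
+print(w)        # close to w_true
+print(mse, r2)  # small MSE, R^2 close to 1
+\end{verbatim}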
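+
+Similarly, a minimal sketch of polynomial regression as linear regression on expanded features (again assuming NumPy; the degree and the generating polynomial are arbitrary):
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+
+# One-dimensional data generated from a cubic polynomial plus noise.
+x = rng.uniform(-3, 3, size=100)
+y = 0.5 * x**3 - x + rng.normal(scale=0.5, size=100)
+
+# Design matrix with columns 1, x, x^2, x^3:
+# polynomial regression is linear regression on these transformed features.
+degree = 3
+X_poly = np.vander(x, N=degree + 1, increasing=True)
+
+w = np.linalg.solve(X_poly.T @ X_poly, X_poly.T @ y)
+print(w)  # approximately [0, -1, 0, 0.5]
+\end{verbatim}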