diff --git a/src/machine-learning-and-data-mining/img/ensemble_error.png b/src/machine-learning-and-data-mining/img/ensemble_error.png
new file mode 100644
index 0000000..a93393f
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/ensemble_error.png differ
diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex
index d9d0867..3593190 100644
--- a/src/machine-learning-and-data-mining/main.tex
+++ b/src/machine-learning-and-data-mining/main.tex
@@ -30,5 +30,6 @@
     \input{sections/_crisp.tex}
     \input{sections/_machine_learning.tex}
     \input{sections/_classification.tex}
+    \input{sections/_regression.tex}
 
 \end{document}
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_classification.tex b/src/machine-learning-and-data-mining/sections/_classification.tex
index 36eb7ee..01c8909 100644
--- a/src/machine-learning-and-data-mining/sections/_classification.tex
+++ b/src/machine-learning-and-data-mining/sections/_classification.tex
@@ -794,3 +794,73 @@ Inputs are fed to the network and backpropagation is used to update the weights.
     to predict a new observation, the $k$ most similar entries in the training set are selected and
     the class of the new data is determined as the most frequent class among the $k$ entries.
 \end{description}
+
+
+
+\section{Binary to multi-class classification}
+
+\begin{description}
+    \item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
+    Train a classifier for each possible pair of classes (resulting in $\frac{C(C-1)}{2}$ classifiers for $C$ classes).
+    The class assigned to a new observation is determined through a majority vote among the classifiers.
+
+    \item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
+    Train $C$ classifiers, each specialized in recognizing one specific class as positive and all the others as negative.
+    A new observation is assigned the class whose classifier outputs the highest confidence score.
+\end{description}
+
+
+
+\section{Ensemble methods}
+\marginnote{Ensemble methods}
+Train a set of base classifiers and make predictions by majority vote.
+If the base classifiers have the same error rate $\varepsilon < 0.5$ and make independent errors,
+the overall error of the ensemble model is lower than $\varepsilon$ (it can be derived from a binomial distribution).
+
+\begin{figure}[h]
+    \centering
+    \includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
+    \caption{Relationship between the error of base classifiers and ensemble models}
+\end{figure}
+
+Different strategies can be used to train an ensemble classifier:
+\begin{descriptionlist}
+    \item[Dataset manipulation] Resample the dataset differently for each base classifier:
+    \begin{description}
+        \item[Bagging]
+        Each base classifier is trained on a sample drawn with replacement (with uniform probability) from the training set.
+        \item[Boosting]
+        Iteratively change the distribution of the training data,
+        prioritizing the examples that are difficult to classify.
+        \begin{description}
+            \item[Adaboost] \marginnote{Adaboost}
+            Iteratively train base classifiers on a dataset where the samples
+            misclassified at the previous iteration are given a higher weight.
+        \end{description}
+    \end{description}
+
+    \item[Feature manipulation]
+    Train each base classifier using only a subset of the features.
+
+    \item[Class labels manipulation]
+    Train each base classifier to distinguish between partitions of the class labels.
+    For instance, the class labels can be partitioned into two groups $A_1$ and $A_2$, and
+    the base classifier is trained to assign one of the two groups as label.
+    During inference, when a group is predicted, all the labels within that group receive a vote.
+\end{descriptionlist}
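+
+Concretely, under the assumptions above the ensemble errs only when more than half of the $K$ base classifiers err (take $K$ odd to avoid ties), so its error is a binomial tail probability:
+\[ \varepsilon_\text{ens} = \sum_{k = \lfloor K/2 \rfloor + 1}^{K} \binom{K}{k} \varepsilon^k (1-\varepsilon)^{K-k} \]
+A minimal numerical check in Python (the values of $\varepsilon$ and $K$ are arbitrary choices for illustration):
+\begin{verbatim}
+from math import comb
+
+def ensemble_error(eps: float, n_classifiers: int) -> float:
+    """Probability that a majority vote is wrong, assuming the base
+    classifiers err independently, each with probability eps."""
+    k_min = n_classifiers // 2 + 1  # smallest number of wrong votes that wins the vote
+    return sum(
+        comb(n_classifiers, k) * eps**k * (1 - eps) ** (n_classifiers - k)
+        for k in range(k_min, n_classifiers + 1)
+    )
+
+print(ensemble_error(0.25, 21))  # ~0.006: much lower than the base error 0.25
+print(ensemble_error(0.55, 21))  # with eps > 0.5 the ensemble is worse than a single classifier
+\end{verbatim}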
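+
+The dataset-manipulation strategies are available off the shelf in common libraries.
+A minimal sketch, assuming scikit-learn is available (the synthetic dataset and the hyperparameters are arbitrary choices for illustration; both ensembles use decision trees as base classifiers by default):
+\begin{verbatim}
+from sklearn.datasets import make_classification
+from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+# Synthetic binary classification problem.
+X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+# Baseline: a single decision tree.
+tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
+
+# Bagging: each base tree is trained on a bootstrap sample of the training set.
+bagging = BaggingClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
+
+# Boosting (AdaBoost): misclassified samples receive a higher weight at each round.
+boosting = AdaBoostClassifier(n_estimators=50, random_state=0).fit(X_train, y_train)
+
+for name, model in (("tree", tree), ("bagging", bagging), ("adaboost", boosting)):
+    print(name, model.score(X_test, y_test))  # accuracy on the held-out test set
+\end{verbatim}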
+
+
+\subsection{Random forests}
+\marginnote{Random forests}
+
+An ensemble of decision trees, each trained on a different random sample of the training set and on a different random subset of the features.
+A prediction is made by aggregating the outputs of the trees (majority vote for classification, average for regression).
+
+\begin{description}
+    \item[Bias] \marginnote{Bias}
+    Error due to the simplifying assumptions of a model: a high bias means the model is too simple to capture the target function (underfitting).
+    \item[Variance] \marginnote{Variance}
+    Amount by which the learned function changes when the model is trained on different training data (i.e. how much the model overfits).
+\end{description}
+
+Random forests aim to reduce the high variance of decision trees.
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_regression.tex b/src/machine-learning-and-data-mining/sections/_regression.tex
new file mode 100644
index 0000000..1b65d57
--- /dev/null
+++ b/src/machine-learning-and-data-mining/sections/_regression.tex
@@ -0,0 +1,56 @@
+\chapter{Regression}
+
+\begin{description}
+    \item[Linear regression] \marginnote{Linear regression}
+    Given:
+    \begin{itemize}
+        \item A dataset $\matr{X}$ of $N$ rows and $D$ features.
+        \item A response vector $\vec{y}$ of $N$ continuous values.
+    \end{itemize}
+    We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
+    \[ \vec{y} \approx \matr{X}\vec{w}^T \]
+
+    \item[Mean squared error] \marginnote{Mean squared error}
+    To find the parameters of linear regression,
+    we minimize the mean squared error as loss function:
+    \[
+        \mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w}^T - \vec{y} \Vert^2
+    \]
+    Its gradient is:
+    \[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w}^T - \vec{y}) \]
+    Setting it to zero, we obtain the normal equations:
+    \[ \matr{X}^T\matr{X}\vec{w}^T = \matr{X}^T\vec{y} \]
+    If $\matr{X}^T\matr{X}$ is invertible, the system has the analytical solution $\vec{w}^T = (\matr{X}^T\matr{X})^{-1}\matr{X}^T\vec{y}$,
+    but computing the inverse is expensive and can be numerically unstable.
+    Numerical methods are therefore often more suited.
+
+    Note that:
+    \begin{itemize}
+        \item MSE is influenced by the magnitude (scale) of the data.
+        \item It measures the fitness of a model in absolute terms.
+        \item It is suited to compare different models on the same data.
+    \end{itemize}
+
+    \item[Coefficient of determination] \marginnote{Coefficient of determination}
+    Given:
+    \begin{itemize}
+        \item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
+        \item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{x}_i\vec{w}^T)^2$, where $\vec{x}_i$ is the $i$-th row of $\matr{X}$.
+        \item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
+    \end{itemize}
+    The coefficient of determination is given by:
+    \[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]
+
+    Intuitively, $\text{R}^2$ compares the model with the baseline that always predicts the mean $y_\text{avg}$ (a horizontal line).
+    When $\text{R}^2 = 1$, the model has a perfect fit.
+    When $\text{R}^2 < 0$, the model fits the data worse than the horizontal line.
+
+    Note that:
+    \begin{itemize}
+        \item $\text{R}^2$ is a standardized (scale-independent) index.
+        \item $\text{R}^2$ tells how much of the variation in the target is explained by the predictor variables.
+        \item $\text{R}^2$ is not suited for non-linear models.
+    \end{itemize}
+
+    \item[Polynomial regression] \marginnote{Polynomial regression}
+    Fit a polynomial instead of a hyperplane.
+    This reduces to linear regression by augmenting the features with their powers (e.g. $x, x^2, \dots, x^p$) and fitting a linear model on the transformed features.
+\end{description}
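+
+As a minimal worked sketch of the above, assuming NumPy (the synthetic data and the true weights below are arbitrary choices for illustration), the parameters can be obtained by solving the normal equations and evaluated with MSE and $\text{R}^2$:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Synthetic data: N observations, D features, known weights plus noise.
+N, D = 200, 3
+X = rng.normal(size=(N, D))
+w_true = np.array([1.5, -2.0, 0.5])
+y = X @ w_true + rng.normal(scale=0.1, size=N)
+
+# Solve the normal equations X^T X w = X^T y.
+# (np.linalg.lstsq is a numerically more stable alternative.)
+w = np.linalg.solve(X.T @ X, X.T @ y)
+
+y_pred = X @ w
+mse = np.mean((y_pred - y) ** 2)  # the loss above uses the sum; it differs only by a factor 1/N
+
+# Coefficient of determination: R^2 = 1 - SS_res / SS_tot.
+ss_res = np.sum((y - y_pred) ** 2)
+ss_tot = np.sum((y - y.mean()) ** 2)
+r2 = 1 - ss_res / ss_tot
+
+print(w)        # close to w_true
+print(mse, r2)  # small MSE, R^2 close to 1
+\end{verbatim}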
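+
+Similarly, a minimal sketch of polynomial regression as linear regression on expanded features (again assuming NumPy; the degree and the generating polynomial are arbitrary):
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(1)
+
+# One-dimensional data generated from a cubic polynomial plus noise.
+x = rng.uniform(-3, 3, size=100)
+y = 0.5 * x**3 - x + rng.normal(scale=0.5, size=100)
+
+# Design matrix with columns 1, x, x^2, x^3:
+# polynomial regression is linear regression on these transformed features.
+degree = 3
+X_poly = np.vander(x, N=degree + 1, increasing=True)
+
+w = np.linalg.solve(X_poly.T @ X_poly, X_poly.T @ y)
+print(w)  # approximately [0, -1, 0, 0.5]
+\end{verbatim}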