mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Add ML/DM ensemble models and regression
BIN  src/machine-learning-and-data-mining/img/ensemble_error.png (new binary file, 37 KiB, not shown)
@@ -30,5 +30,6 @@
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}

\end{document}
@@ -794,3 +794,73 @@ Inputs are fed to the network and backpropagation is used to update the weights.
to predict a new observation, the $k$ most similar entries in the training set are selected
and the class of the new data is determined as the most frequent class among the $k$ entries.
\end{description}


\section{Binary to multi-class classification}

\begin{description}
    \item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
        Train a classifier for each possible pair of classes (resulting in $\frac{C \cdot (C-1)}{2}$ classifiers).
        The class assigned to a new observation is determined through a majority vote among the pairwise classifiers.

    \item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
        Train $C$ classifiers, each specialized to classify a specific class as positive and all the others as negative.
        The class assigned to a new observation is the one whose classifier outputs the highest confidence score
        (a sketch of both strategies follows this list).
\end{description}
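
As an illustration, assuming scikit-learn is available, both strategies can be applied on top of any binary classifier:
\begin{verbatim}
# Sketch: OVO and OVR on a toy 4-class dataset (assumes scikit-learn).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

X, y = make_classification(n_samples=200, n_features=10, n_informative=5,
                           n_classes=4, random_state=0)

# OVO: trains C*(C-1)/2 = 6 pairwise classifiers, predicts by majority vote.
ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

# OVR: trains C = 4 classifiers, predicts the class with the highest score.
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

print(ovo.predict(X[:5]), ovr.predict(X[:5]))
\end{verbatim}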


\section{Ensemble methods}
\marginnote{Ensemble methods}

Train a set of base classifiers and make predictions by majority vote.
If all the base classifiers have the same error rate, make independent errors, and that error rate is below $0.5$,
the overall error of the ensemble is lower than that of a single base classifier (a result derived from the binomial distribution).
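For instance, with an odd number $k$ of base classifiers that err independently with the same rate $\varepsilon$,
the majority vote is wrong exactly when more than half of them are wrong, so:
\[
    e_\text{ensemble} = \sum_{i = \lceil k/2 \rceil}^{k} \binom{k}{i} \varepsilon^i (1 - \varepsilon)^{k-i}
\]
which is smaller than $\varepsilon$ when $\varepsilon < 0.5$ and larger when $\varepsilon > 0.5$.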

\begin{figure}[h]
    \centering
    \includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
    \caption{Relationship between the error of base classifiers and ensemble models}
\end{figure}

Different strategies to train an ensemble classifier can be used:
\begin{descriptionlist}
    \item[Dataset manipulation] Resample the dataset for each base classifier:
        \begin{description}
            \item[Bagging]
                Sample the training set with replacement using a uniform distribution
                (a sketch follows this list).

            \item[Boosting]
                Iteratively change the distribution of the training data,
                prioritizing the examples that are difficult to classify.
                \begin{description}
                    \item[Adaboost] \marginnote{Adaboost}
                        Iteratively train base classifiers on a dataset where the samples
                        misclassified at the previous iteration have a higher weight.
                \end{description}
        \end{description}

    \item[Feature manipulation]
        Train each base classifier using only a subset of the features.

    \item[Class labels manipulation]
        Train each base classifier to classify a partition of the class labels.
        For instance, the class labels can be partitioned into two groups $A_1$ and $A_2$, and
        the base classifier is trained to assign one of the two groups as label.
        During inference, when a group is predicted, all the labels within that group receive a vote.
\end{descriptionlist}
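
A minimal sketch of bagging with a majority vote, assuming scikit-learn decision trees as base classifiers
(the resampling and voting are written out explicitly for clarity):
\begin{verbatim}
# Sketch: bagging = bootstrap resampling + majority vote (assumes scikit-learn).
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, n_estimators=25, seed=0):
    rng = np.random.default_rng(seed)
    models = []
    for _ in range(n_estimators):
        # Sample N indices with replacement, uniformly.
        idx = rng.integers(0, len(X), size=len(X))
        models.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return models

def bagging_predict(models, X):
    # Each base classifier votes; the most frequent class wins.
    # Assumes class labels are non-negative integers.
    votes = np.array([m.predict(X) for m in models])
    return np.array([np.bincount(col).argmax() for col in votes.T])
\end{verbatim}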


\subsection{Random forests}
\marginnote{Random forests}

Ensemble of decision trees, each trained on a different random sample of the training set and a different subset of the features.
A prediction is made by averaging the outputs of the trees.

\begin{description}
    \item[Bias] \marginnote{Bias}
        Simplicity of the target function assumed by the model.

    \item[Variance] \marginnote{Variance}
        Amount by which the learned function changes when using different training data (i.e. how much the model overfits).
\end{description}

Random forests aim to reduce the high variance of decision trees.
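
A minimal usage sketch, assuming scikit-learn's \texttt{RandomForestClassifier}
(\texttt{n\_estimators} sets the number of trees, \texttt{max\_features} the size of the random feature subset considered at each split):
\begin{verbatim}
# Sketch: random forest = bagged decision trees + random feature subsets.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

forest = RandomForestClassifier(
    n_estimators=100,     # number of decision trees
    max_features="sqrt",  # random subset of features considered at each split
    bootstrap=True,       # each tree is trained on a bootstrap sample
    random_state=0,
).fit(X, y)

print(forest.predict(X[:5]))
\end{verbatim}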
@@ -0,0 +1,56 @@
\chapter{Regression}

\begin{description}
    \item[Linear regression] \marginnote{Linear regression}
        Given:
        \begin{itemize}
            \item A dataset $\matr{X}$ of $N$ rows and $D$ features.
            \item A response vector $\vec{y}$ of $N$ continuous values.
        \end{itemize}
        We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
        \[ \vec{y} \approx \matr{X}\vec{w}^T \]

    \item[Mean squared error] \marginnote{Mean squared error}
        To find the parameters for linear regression,
        we minimize as loss function the mean squared error:
        \[
            \mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w}^T - \vec{y} \Vert^2
        \]
        (the constant factor $\frac{1}{N}$ is omitted as it does not change the minimizer).
        Its gradient is:
        \[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w}^T - \vec{y}) \]
        Setting it to zero, we obtain the normal equations:
        \[ \matr{X}^T\matr{X}\vec{w}^T = \matr{X}^T\vec{y} \]
        If $\matr{X}^T\matr{X}$ is invertible, this can be solved analytically as
        $\vec{w}^T = (\matr{X}^T\matr{X})^{-1}\matr{X}^T\vec{y}$,
        but this could lead to overfitting.
        Numerical methods are therefore better suited
        (a worked sketch of fitting and evaluation follows this list).

        Note that:
        \begin{itemize}
            \item MSE is influenced by the magnitude of the data.
            \item It measures the fitness of a model in absolute terms.
            \item It is suited to compare different models on the same data.
        \end{itemize}

    \item[Coefficient of determination] \marginnote{Coefficient of determination}
        Given:
        \begin{itemize}
            \item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
            \item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{w}^T\vec{x}_i)^2$.
            \item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
        \end{itemize}
        The coefficient of determination is given by:
        \[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]

        Intuitively, $\text{R}^2$ compares the model with a horizontal straight line ($y_\text{avg}$).
        When $\text{R}^2 = 1$, the model has a perfect fit.
        $\text{R}^2$ can never exceed $1$; when it is negative, the model fits worse than the horizontal line.

        Note that:
        \begin{itemize}
            \item $\text{R}^2$ is a standardized index.
            \item $\text{R}^2$ tells how well the predictor variables explain the variation in the target.
            \item $\text{R}^2$ is not suited for non-linear models.
        \end{itemize}

    \item[Polynomial regression] \marginnote{Polynomial regression}
        Fit a polynomial of the features instead of a hyperplane
        (e.g. by augmenting $\matr{X}$ with polynomial terms and then applying linear regression).
\end{description}
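
A worked sketch of the above, assuming NumPy: the parameters are found with a least-squares solver,
then MSE and $\text{R}^2$ are computed on the training data (the data below is synthetic and purely illustrative).
\begin{verbatim}
# Sketch: linear regression via least squares, with MSE and R^2 (assumes NumPy).
import numpy as np

rng = np.random.default_rng(0)
N, D = 100, 3
X = rng.normal(size=(N, D))
w_true = np.array([2.0, -1.0, 0.5])
y = X @ w_true + rng.normal(scale=0.1, size=N)   # synthetic targets

# Numerically solve the least-squares problem X^T X w = X^T y.
w, *_ = np.linalg.lstsq(X, y, rcond=None)

y_pred = X @ w
mse = np.mean((y_pred - y) ** 2)            # absolute, scale-dependent
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot                    # standardized, at most 1

print(w, mse, r2)
\end{verbatim}
For polynomial regression, the same procedure applies after augmenting $\matr{X}$ with polynomial terms (e.g. squared features).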