mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Add ML/DM ensemble models and regression
BIN  src/machine-learning-and-data-mining/img/ensemble_error.png (new binary file, 37 KiB, not shown)
@@ -30,5 +30,6 @@
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}

\end{document}
@@ -794,3 +794,73 @@ Inputs are fed to the network and backpropagation is used to update the weights.
to predict a new observation, the $k$ most similar entries in the training set are selected
and the class of the new data is determined as the most frequent class among the $k$ entries.
\end{description}


\section{Binary to multi-class classification}

\begin{description}
    \item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
        Train a classifier for each possible pair of classes (resulting in $\frac{C \cdot (C-1)}{2}$ classifiers).
        The class assigned to a new observation is determined through a majority vote among the pairwise classifiers.

    \item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
        Train $C$ classifiers, each specialized to classify a specific class as positive and all the others as negative.
        The class assigned to a new observation is the one whose classifier outputs the highest confidence score
        (a sketch of both strategies follows this list).
\end{description}
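
As an illustration, assuming scikit-learn is available, both strategies can be applied on top of any binary classifier:
\begin{verbatim}
# Sketch: OVO and OVR on a toy 4-class dataset (assumes scikit-learn).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

X, y = make_classification(n_samples=200, n_features=10, n_informative=5,
                           n_classes=4, random_state=0)

# OVO: trains C*(C-1)/2 = 6 pairwise classifiers, predicts by majority vote.
ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

# OVR: trains C = 4 classifiers, predicts the class with the highest score.
ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000)).fit(X, y)

print(ovo.predict(X[:5]), ovr.predict(X[:5]))
\end{verbatim}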


\section{Ensemble methods}
\marginnote{Ensemble methods}

Train a set of base classifiers and make predictions by majority vote.
If all the base classifiers have the same error rate, make independent errors, and that error rate is below $0.5$,
the overall error of the ensemble is lower than that of a single base classifier (a result derived from the binomial distribution).
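For instance, with an odd number $k$ of base classifiers that err independently with the same rate $\varepsilon$,
the majority vote is wrong exactly when more than half of them are wrong, so:
\[
    e_\text{ensemble} = \sum_{i = \lceil k/2 \rceil}^{k} \binom{k}{i} \varepsilon^i (1 - \varepsilon)^{k-i}
\]
which is smaller than $\varepsilon$ when $\varepsilon < 0.5$ and larger when $\varepsilon > 0.5$.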

\begin{figure}[h]
    \centering
    \includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
    \caption{Relationship between the error of base classifiers and ensemble models}
\end{figure}

Different strategies to train an ensemble classifier can be used:
\begin{descriptionlist}
    \item[Dataset manipulation] Resample the dataset for each base classifier:
        \begin{description}
            \item[Bagging]
                Sample the training set with replacement using a uniform distribution
                (a sketch follows this list).

            \item[Boosting]
                Iteratively change the distribution of the training data,
                prioritizing the examples that are difficult to classify.
                \begin{description}
                    \item[Adaboost] \marginnote{Adaboost}
                        Iteratively train base classifiers on a dataset where the samples
                        misclassified at the previous iteration have a higher weight.
                \end{description}
        \end{description}

    \item[Feature manipulation]
        Train each base classifier using only a subset of the features.

    \item[Class labels manipulation]
        Train each base classifier to classify a partition of the class labels.
        For instance, the class labels can be partitioned into two groups $A_1$ and $A_2$, and
        the base classifier is trained to assign one of the two groups as label.
        During inference, when a group is predicted, all the labels within that group receive a vote.
\end{descriptionlist}
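
A minimal sketch of bagging with a majority vote, assuming scikit-learn decision trees as base classifiers
(the resampling and voting are written out explicitly for clarity):
\begin{verbatim}
# Sketch: bagging = bootstrap resampling + majority vote (assumes scikit-learn).
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, n_estimators=25, seed=0):
    rng = np.random.default_rng(seed)
    models = []
    for _ in range(n_estimators):
        # Sample N indices with replacement, uniformly.
        idx = rng.integers(0, len(X), size=len(X))
        models.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return models

def bagging_predict(models, X):
    # Each base classifier votes; the most frequent class wins.
    # Assumes class labels are non-negative integers.
    votes = np.array([m.predict(X) for m in models])
    return np.array([np.bincount(col).argmax() for col in votes.T])
\end{verbatim}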


\subsection{Random forests}
\marginnote{Random forests}

Ensemble of decision trees, each trained on a different random sample of the training set and a different subset of the features.
A prediction is made by averaging the outputs of the trees.

\begin{description}
    \item[Bias] \marginnote{Bias}
        Simplicity of the target function assumed by the model.

    \item[Variance] \marginnote{Variance}
        Amount by which the learned function changes when using different training data (i.e. how much the model overfits).
\end{description}

Random forests aim to reduce the high variance of decision trees.
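
A minimal usage sketch, assuming scikit-learn's \texttt{RandomForestClassifier}
(\texttt{n\_estimators} sets the number of trees, \texttt{max\_features} the size of the random feature subset considered at each split):
\begin{verbatim}
# Sketch: random forest = bagged decision trees + random feature subsets.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=20, random_state=0)

forest = RandomForestClassifier(
    n_estimators=100,     # number of decision trees
    max_features="sqrt",  # random subset of features considered at each split
    bootstrap=True,       # each tree is trained on a bootstrap sample
    random_state=0,
).fit(X, y)

print(forest.predict(X[:5]))
\end{verbatim}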
@@ -0,0 +1,56 @@
\chapter{Regression}

\begin{description}
    \item[Linear regression] \marginnote{Linear regression}
        Given:
        \begin{itemize}
            \item A dataset $\matr{X}$ of $N$ rows and $D$ features.
            \item A response vector $\vec{y}$ of $N$ continuous values.
        \end{itemize}
        We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
        \[ \vec{y} \approx \matr{X}\vec{w}^T \]

    \item[Mean squared error] \marginnote{Mean squared error}
        To find the parameters for linear regression,
        we minimize as loss function the mean squared error:
        \[
            \mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w}^T - \vec{y} \Vert^2
        \]
        (the constant factor $\frac{1}{N}$ is omitted as it does not change the minimizer).
        Its gradient is:
        \[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w}^T - \vec{y}) \]
        Setting it to zero, we obtain the normal equations:
        \[ \matr{X}^T\matr{X}\vec{w}^T = \matr{X}^T\vec{y} \]
        If $\matr{X}^T\matr{X}$ is invertible, this can be solved analytically as
        $\vec{w}^T = (\matr{X}^T\matr{X})^{-1}\matr{X}^T\vec{y}$,
        but this could lead to overfitting.
        Numerical methods are therefore better suited
        (a worked sketch of fitting and evaluation follows this list).

        Note that:
        \begin{itemize}
            \item MSE is influenced by the magnitude of the data.
            \item It measures the fitness of a model in absolute terms.
            \item It is suited to compare different models on the same data.
        \end{itemize}

    \item[Coefficient of determination] \marginnote{Coefficient of determination}
        Given:
        \begin{itemize}
            \item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
            \item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{w}^T\vec{x}_i)^2$.
            \item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
        \end{itemize}
        The coefficient of determination is given by:
        \[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]

        Intuitively, $\text{R}^2$ compares the model with a horizontal straight line ($y_\text{avg}$).
        When $\text{R}^2 = 1$, the model has a perfect fit.
        $\text{R}^2$ can never exceed $1$; when it is negative, the model fits worse than the horizontal line.

        Note that:
        \begin{itemize}
            \item $\text{R}^2$ is a standardized index.
            \item $\text{R}^2$ tells how well the predictor variables explain the variation in the target.
            \item $\text{R}^2$ is not suited for non-linear models.
        \end{itemize}

    \item[Polynomial regression] \marginnote{Polynomial regression}
        Fit a polynomial of the features instead of a hyperplane
        (e.g. by augmenting $\matr{X}$ with polynomial terms and then applying linear regression).
\end{description}
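
A worked sketch of the above, assuming NumPy: the parameters are found with a least-squares solver,
then MSE and $\text{R}^2$ are computed on the training data (the data below is synthetic and purely illustrative).
\begin{verbatim}
# Sketch: linear regression via least squares, with MSE and R^2 (assumes NumPy).
import numpy as np

rng = np.random.default_rng(0)
N, D = 100, 3
X = rng.normal(size=(N, D))
w_true = np.array([2.0, -1.0, 0.5])
y = X @ w_true + rng.normal(scale=0.1, size=N)   # synthetic targets

# Numerically solve the least-squares problem X^T X w = X^T y.
w, *_ = np.linalg.lstsq(X, y, rcond=None)

y_pred = X @ w
mse = np.mean((y_pred - y) ** 2)            # absolute, scale-dependent
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot                    # standardized, at most 1

print(w, mse, r2)
\end{verbatim}
For polynomial regression, the same procedure applies after augmenting $\matr{X}$ with polynomial terms (e.g. squared features).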