Mirror of https://github.com/NotXia/unibo-ai-notes.git
Synced 2025-12-15 02:52:22 +01:00

Commit: Add ML/DM data preprocessing

New file (binary): src/machine-learning-and-data-mining/img/mahalanobis.png (35 KiB)
@@ -29,7 +29,9 @@
\input{sections/_data_lake.tex}
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_data_prepro.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}
\input{sections/_clustering.tex}

\end{document}
@@ -269,14 +269,6 @@ a macro (unweighted) average or a class-weighted average.
When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = 1$),
when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$) and
when $\kappa = 0$, there is random agreement.

\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
    Assign a cost to the errors. This can be done by:
    \begin{itemize}
        \item Altering the proportions of the dataset by duplicating samples to reduce its misclassification.
        \item Weighting the classes (possible in some algorithms).
    \end{itemize}
\end{description}

@@ -317,6 +309,35 @@
\end{description}


\subsection{Data imbalance}
A classifier may not perform well when predicting a minority class of the training data.
Possible solutions are:
\begin{descriptionlist}
    \item[Undersampling] \marginnote{Undersampling}
        Randomly reduce the number of examples of the majority classes.

    \item[Oversampling] \marginnote{Oversampling}
        Increase the number of examples of the minority classes.

        \begin{description}
            \item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
                \begin{enumerate}
                    \item Randomly select an example $x$ belonging to the minority class.
                    \item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
                    \item Synthesize a new example by selecting a random point of the feature space between $x$ and $z_i$ (see the sketch after this list).
                \end{enumerate}
        \end{description}

    \item[Cost sensitive learning] \marginnote{Cost sensitive learning}
        Assign a cost to the errors, with higher costs assigned to the minority classes.
        This can be done by:
        \begin{itemize}
            \item Altering the class proportions of the dataset by duplicating samples of the classes whose misclassification is more costly.
            \item Weighting the classes (possible in some algorithms).
        \end{itemize}
\end{descriptionlist}
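
A minimal sketch of the SMOTE interpolation step, assuming numeric features and using \texttt{numpy} (function and variable names are illustrative, not taken from a specific library):
\begin{verbatim}
import numpy as np

def smote_sample(X_minority, k=5, rng=np.random.default_rng(0)):
    # 1. randomly select a minority example x
    x = X_minority[rng.integers(len(X_minority))]
    # 2. pick a random neighbor z among the k nearest neighbors of x
    dists = np.linalg.norm(X_minority - x, axis=1)
    z = X_minority[rng.choice(np.argsort(dists)[1:k + 1])]
    # 3. interpolate a new synthetic example between x and z
    alpha = rng.random()
    return x + alpha * (z - x)

X_min = np.array([[1.0, 2.0], [1.5, 1.8], [2.0, 2.2], [1.2, 2.1]])
synthetic = smote_sample(X_min, k=3)
\end{verbatim}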


\section{Decision trees}
src/machine-learning-and-data-mining/sections/_clustering.tex (new file, 137 lines)
@@ -0,0 +1,137 @@
\chapter{Clustering}


\section{Similarity and dissimilarity}

\begin{description}
    \item[Similarity] \marginnote{Similarity}
        Measures how alike two objects are.
        It is often defined in the range $[0, 1]$.

    \item[Dissimilarity] \marginnote{Dissimilarity}
        Measures how much two objects differ.
        A value of 0 indicates no difference, while the upper bound varies.
\end{description}

\begin{table}[ht]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}{c | c | c}
        \textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
        \hline
        Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
        \hline
        Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
        \hline
        Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
    \end{tabular}
    \caption{Similarity and dissimilarity by attribute type}
\end{table}

\begin{description}
    \item[Similarity properties] \phantom{}
        \begin{enumerate}
            \item $\texttt{sim}(p, q) = 1$ iff $p = q$.
            \item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
        \end{enumerate}
\end{description}


\subsection{Distance}

Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are (a numerical sketch follows the list):
\begin{descriptionlist}
    \item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
        \[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
        where $r$ is a parameter.

        Common values for $r$ are:
        \begin{descriptionlist}
            \item[$r = 1$]
                Corresponds to the $L_1$ norm.
                It is useful for discriminating between zero and near-zero distances, as
                an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
            \item[$r = 2$]
                Corresponds to the Euclidean distance or $L_2$ norm.
            \item[$r = \infty$]
                Corresponds to the $L_\infty$ norm.
                It considers only the dimension with the maximum difference.
        \end{descriptionlist}

    \item[Mahalanobis distance] \marginnote{Mahalanobis distance}
        \[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
        where $\matr{\Sigma}$ is the covariance matrix of the dataset.
        The Mahalanobis distance between $p$ and $q$ increases when the segment connecting them
        points towards a direction of greater variation of the data.

        \begin{figure}[h]
            \centering
            \includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
            \caption{The Mahalanobis distance between $A$ and $B$ is greater than the one between $A$ and $C$, while the Euclidean distances are the same.}
        \end{figure}
\end{descriptionlist}
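
As a rough numerical sketch of the metrics above (using \texttt{numpy}; the data points and dataset are made up for illustration):
\begin{verbatim}
import numpy as np

p = np.array([1.0, 2.0, 3.0])
q = np.array([2.0, 0.0, 3.5])

def minkowski(p, q, r):
    return np.sum(np.abs(p - q) ** r) ** (1.0 / r)

l1   = minkowski(p, q, 1)        # L1 (Manhattan) distance
l2   = minkowski(p, q, 2)        # L2 (Euclidean) distance
linf = np.max(np.abs(p - q))     # L_infinity (limit for r -> infinity)

# Mahalanobis distance with respect to a toy dataset X (rows = examples)
X = np.array([[1.0, 2.0, 3.0], [2.0, 1.0, 4.0], [3.0, 3.0, 2.0],
              [0.0, 1.0, 1.0], [2.0, 4.0, 5.0]])
Sigma_inv = np.linalg.inv(np.cov(X, rowvar=False))
diff = p - q
mahalanobis = np.sqrt(diff @ Sigma_inv @ diff)
\end{verbatim}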

\subsubsection{Distance properties}
\begin{descriptionlist}
    \item[Positive definiteness]
        $\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
    \item[Symmetry]
        $\texttt{dist}(p, q) = \texttt{dist}(q, p)$
    \item[Triangle inequality]
        $\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
\end{descriptionlist}



\subsection{Vector similarity}

\begin{description}
    \item[Binary vectors]
        Given two examples $p$ and $q$ with binary features, we can compute the following values:
        \[
            \begin{split}
                M_{00} &= \text{ number of features that are 0 for both $p$ and $q$} \\
                M_{01} &= \text{ number of features that are 0 for $p$ and 1 for $q$} \\
                M_{10} &= \text{ number of features that are 1 for $p$ and 0 for $q$} \\
                M_{11} &= \text{ number of features that are 1 for both $p$ and $q$}
            \end{split}
        \]
        Possible similarity measures are (see the sketch after this list):
        \begin{descriptionlist}
            \item[Simple matching coefficient] \marginnote{Simple matching coefficient}
                $\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
            \item[Jaccard coefficient] \marginnote{Jaccard coefficient}
                $\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
        \end{descriptionlist}

    \item[Cosine similarity] \marginnote{Cosine similarity}
        Cosine of the angle between two vectors:
        \[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]

    \item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
        Variation of the Jaccard coefficient for continuous values:
        \[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
\end{description}
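
A small illustrative computation of these measures with \texttt{numpy} (the vectors are made up):
\begin{verbatim}
import numpy as np

# Binary vectors
p = np.array([1, 0, 0, 1, 1, 0])
q = np.array([1, 1, 0, 0, 1, 0])
m00 = np.sum((p == 0) & (q == 0))
m01 = np.sum((p == 0) & (q == 1))
m10 = np.sum((p == 1) & (q == 0))
m11 = np.sum((p == 1) & (q == 1))
smc = (m00 + m11) / (m00 + m01 + m10 + m11)  # simple matching coefficient
jc  = m11 / (m01 + m10 + m11)                # Jaccard coefficient

# Continuous vectors
a = np.array([3.0, 2.0, 0.0, 5.0])
b = np.array([1.0, 0.0, 0.0, 2.0])
cosine   = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
tanimoto = a @ b / (a @ a + b @ b - a @ b)   # extended Jaccard
\end{verbatim}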


\subsection{Correlation}

\begin{description}
    \item[Pearson's correlation] \marginnote{Pearson's correlation}
        Measure of the linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
        To compute Pearson's correlation, the values of $e_1$ and $e_2$ are first standardized and then arranged into the vectors $\vec{e}_1$ and $\vec{e}_2$.
        The correlation is then computed as the dot product between $\vec{e}_1$ and $\vec{e}_2$ (see the sketch after this list):
        \[ \texttt{corr}(e_1, e_2) = \langle \vec{e}_1, \vec{e}_2 \rangle \]

        Pearson's correlation has the following properties:
        \begin{itemize}
            \item If the variables are independent, then the correlation is 0 (but not vice versa).
            \item If the correlation is 0, then there is no linear relationship between the variables.
            \item A value of $+1$ implies a perfect positive linear relationship, while $-1$ implies a perfect negative linear relationship.
        \end{itemize}

    \item[Symmetric uncertainty]
        Measure of correlation for nominal attributes:
        \[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
        where $H$ is the entropy.
\end{description}
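
A sketch of both measures with \texttt{numpy} (the data is illustrative and the \texttt{entropy} helper is hypothetical, not from a library):
\begin{verbatim}
import numpy as np

# Pearson's correlation: standardize, then average the products
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])
xs = (x - x.mean()) / x.std()
ys = (y - y.mean()) / y.std()
pearson = (xs @ ys) / len(x)      # same value as np.corrcoef(x, y)[0, 1]

# Symmetric uncertainty for nominal attributes
def entropy(values):
    _, counts = np.unique(values, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

e1 = ["a", "a", "b", "b", "c"]
e2 = ["x", "x", "y", "y", "y"]
h1, h2 = entropy(e1), entropy(e2)
h12 = entropy([u + "|" + v for u, v in zip(e1, e2)])  # joint entropy
sym_unc = 2 * (h1 + h2 - h12) / (h1 + h2)
\end{verbatim}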

src/machine-learning-and-data-mining/sections/_data_prepro.tex (new file, 163 lines)
@@ -0,0 +1,163 @@
\chapter{Data preprocessing}


\section{Aggregation}
\marginnote{Aggregation}

Combining multiple attributes into a single one.
It is useful for:
\begin{descriptionlist}
    \item[Data reduction]
        Reduce the number of attributes.

    \item[Change of scale]
        View the data at a more general level of detail (e.g. from cities and regions to countries).

    \item[Data stability]
        Aggregated data tend to have less variability.
\end{descriptionlist}



\section{Sampling}
\marginnote{Sampling}
Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
Obviously, a sample has to be representative.

Possible sampling techniques are (a sketch follows the list):
\begin{descriptionlist}
    \item[Simple random] \marginnote{Simple random}
        Extraction of a single element following a given probability distribution.

    \item[With replacement] \marginnote{With replacement}
        Multiple extractions with repetitions following a given probability distribution
        (i.e. multiple simple random extractions).

        If the population is small, the sample may under-represent the actual population.

    \item[Without replacement] \marginnote{Without replacement}
        Multiple extractions without repetitions following a given probability distribution.

    \item[Stratified] \marginnote{Stratified}
        Split the data into partitions and sample from each partition.
        Useful when the partitions are homogeneous.
\end{descriptionlist}
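
A rough sketch of these schemes with \texttt{numpy} (toy data and illustrative names):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
data = np.arange(100)                       # toy population
labels = np.repeat(["a", "b"], [80, 20])    # toy partitions (strata)

simple = data[rng.integers(len(data))]                    # simple random
with_repl = rng.choice(data, size=10, replace=True)       # with replacement
without_repl = rng.choice(data, size=10, replace=False)   # without replacement

# Stratified: sample the same fraction from each partition
frac = 0.1
stratified = np.concatenate([
    rng.choice(data[labels == c],
               size=int(frac * np.sum(labels == c)),
               replace=False)
    for c in np.unique(labels)
])
\end{verbatim}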

\begin{description}
    \item[Sample size]
        The sample size represents a trade-off between data reduction and precision.
        In a labeled dataset, it is important to consider the probability of sampling data of all the possible classes.
\end{description}




\section{Dimensionality reduction}

\begin{description}
    \item[Curse of dimensionality] \marginnote{Curse of dimensionality}
        Data with a high number of dimensions result in a sparse feature space
        where distance metrics are ineffective.

    \item[Dimensionality reduction] \marginnote{Dimensionality reduction}
        Useful to:
        \begin{itemize}
            \item Avoid the curse of dimensionality.
            \item Reduce noise.
            \item Reduce the time and space complexity of mining and learning algorithms.
            \item Visualize multi-dimensional data.
        \end{itemize}
\end{description}


\subsection{Principal component analysis} \marginnote{PCA}
Projection of the data into a lower-dimensional space that maximizes the variance of the data.
It can be proven that this problem can be solved by finding the eigenvectors of the covariance matrix of the data.
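
A minimal sketch of this idea with \texttt{numpy} (random toy data; in practice a library routine such as \texttt{sklearn.decomposition.PCA} would typically be used):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))            # toy dataset: 100 examples, 5 features
X_centered = X - X.mean(axis=0)

# Eigendecomposition of the covariance matrix
cov = np.cov(X_centered, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)   # eigh since cov is symmetric

# Keep the k eigenvectors with the largest eigenvalues and project
k = 2
components = eigvecs[:, np.argsort(eigvals)[::-1][:k]]
X_reduced = X_centered @ components      # shape: (100, k)
\end{verbatim}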


\subsection{Feature subset selection} \marginnote{Feature subset selection}
Local technique to reduce dimensionality by:
\begin{itemize}
    \item Removing redundant attributes.
    \item Removing irrelevant attributes.
\end{itemize}

This can be achieved by:
\begin{descriptionlist}
    \item[Brute force]
        Try all the possible subsets of features.

    \item[Embedded approach]
        Feature selection is naturally done by the learning algorithm (e.g. decision trees).

    \item[Filter approach]
        Features are filtered using domain-specific knowledge.

    \item[Wrapper approach]
        A mining algorithm is used to select the best features.
\end{descriptionlist}




\section{Feature creation}
\marginnote{Feature creation}
Useful to help a learning algorithm capture data characteristics.
Possible approaches are:
\begin{descriptionlist}
    \item[Feature extraction]
        Features extracted from the existing ones (e.g. from a picture of a face, the eye distance can be a new feature).

    \item[Mapping]
        Projecting the data into a new feature space.

    \item[New features]
        Add new, possibly redundant, features.
\end{descriptionlist}



\section{Data type conversion}

\subsection{One-hot encoding} \marginnote{One-hot encoding}
A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
$n$ new binary features $H_{e_1}, \dots, H_{e_n}$, each corresponding to a value of $E$.
For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the rest are \texttt{false}.

\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
A feature whose values have an ordering can be converted into a sequence of consecutive integers
(e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).

\subsection{Discretization} \marginnote{Discretization}
Convert a continuous feature into a discrete one (a sketch of these conversions follows the list).
\begin{description}
    \item[Binarization] \marginnote{Binarization}
        Given a continuous feature and a threshold,
        it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.

    \item[Thresholding] \marginnote{Thresholding}
        Same as binarization but using multiple thresholds.

    \item[K-bins] \marginnote{K-bins}
        A continuous feature is discretized using $k$ bins, each representing an integer from $0$ to $k-1$.
\end{description}
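
A small sketch of these conversions with \texttt{numpy} (toy values; libraries such as scikit-learn provide \texttt{OneHotEncoder}, \texttt{OrdinalEncoder} and \texttt{KBinsDiscretizer} for the same purpose):
\begin{verbatim}
import numpy as np

# One-hot encoding: one binary column per unique value
colors = np.array(["red", "green", "blue", "green"])
values = np.unique(colors)                        # ['blue', 'green', 'red']
one_hot = (colors[:, None] == values[None, :]).astype(int)

# Ordinal encoding: map ordered values to consecutive integers
order = {"good": 1, "neutral": 0, "bad": -1}
ratings = ["good", "bad", "neutral"]
ordinal = np.array([order[r] for r in ratings])

# Discretization: binarization and k-bins (equal-width)
x = np.array([0.10, 0.40, 0.35, 0.80, 0.95])
binary = (x > 0.5).astype(int)                    # single threshold
k = 3
edges = np.linspace(x.min(), x.max(), k + 1)[1:-1]
bins = np.digitize(x, edges)                      # integers in 0 .. k-1
\end{verbatim}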



\section{Attribute transformation}
Useful for normalizing features that have different scales or contain outliers.
A sketch of the main transformations follows the list.

\begin{description}
    \item[Mapping] \marginnote{Mapping}
        Map the domain of a feature into a new set of values (i.e. apply a function).

    \item[Standardization] \marginnote{Standardization}
        Transform a feature with a Gaussian distribution into one with a standard normal distribution (zero mean and unit variance):
        \[ x' = \frac{x - \mu}{\sigma} \]

    \item[Rescaling] \marginnote{Rescaling}
        Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).

    \item[Affine transformation] \marginnote{Affine transformation}
        Apply a linear transformation to a feature before rescaling it.
        This method is more robust to outliers.

    \item[Normalization] \marginnote{Normalization}
        Normalize each data row to unit norm.
\end{description}
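
A sketch of standardization, rescaling and row normalization with \texttt{numpy} (toy matrix; scikit-learn's \texttt{StandardScaler}, \texttt{MinMaxScaler} and \texttt{Normalizer} implement the same ideas):
\begin{verbatim}
import numpy as np

X = np.array([[1.0,  200.0],
              [2.0,  300.0],
              [3.0, 1000.0]])

# Standardization: zero mean and unit variance per feature (column)
X_std = (X - X.mean(axis=0)) / X.std(axis=0)

# Rescaling each feature to [0, 1]
X_minmax = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Normalization: each row scaled to unit L2 norm
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
\end{verbatim}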