Add ML/DM data preprocessing

This commit is contained in:
2023-12-02 18:20:18 +01:00
parent 0a7c8033e1
commit 1c9924dc72
5 changed files with 331 additions and 8 deletions

Binary image file not shown (35 KiB).

View File

@@ -29,7 +29,9 @@
\input{sections/_data_lake.tex}
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_data_prepro.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}
\input{sections/_clustering.tex}
\end{document}

View File

@@ -269,14 +269,6 @@ a macro (unweighted) average or a class-weighted average.
When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = 1$),
when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$) and
when $\kappa = 0$, there is random agreement.
\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
Assign a cost to the errors. This can be done by:
\begin{itemize}
\item Altering the proportions of the dataset by duplicating samples to reduce its misclassification.
\item Weighting the classes (possible in some algorithms).
\end{itemize}
\end{description}
@@ -317,6 +309,35 @@ a macro (unweighted) average or a class-weighted average.
\end{description}
\subsection{Data imbalance}
A classifier may perform poorly when predicting a class that is underrepresented (a minority class) in the training data.
Possible solutions are:
\begin{descriptionlist}
\item[Undersampling] \marginnote{Undersampling}
Randomly reduce the number of examples of the majority classes.
\item[Oversampling] \marginnote{Oversampling}
Increase the number of examples of the minority classes.
\begin{description}
\item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
\begin{enumerate}
\item Randomly select an example $x$ belonging to the minority class.
\item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
\item Synthesize a new example at a random point on the segment between $x$ and $z_i$ in feature space.
\end{enumerate}
A code sketch of this procedure is given after this list of solutions.
\end{description}
\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
Assign a cost to misclassification errors, with higher weights for the minority classes.
This can be done by:
\begin{itemize}
\item Altering the class proportions of the dataset by duplicating minority-class samples, so that their misclassification is penalized more.
\item Weighting the classes (possible in some algorithms).
\end{itemize}
\end{descriptionlist}
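A minimal Python/NumPy sketch of the SMOTE sampling step described above (the function name \texttt{smote\_sample} and its parameters are illustrative choices):
\begin{verbatim}
import numpy as np

def smote_sample(X_minority, k=5, seed=0):
    """Generate one synthetic minority-class example (illustrative sketch)."""
    rng = np.random.default_rng(seed)
    # 1. Randomly select an example x from the minority class.
    x = X_minority[rng.integers(len(X_minority))]
    # 2. Pick a random neighbor z among the k nearest neighbors of x
    #    (index 0 is x itself, at distance 0, and is skipped).
    dists = np.linalg.norm(X_minority - x, axis=1)
    neighbors = X_minority[np.argsort(dists)[1:k + 1]]
    z = neighbors[rng.integers(len(neighbors))]
    # 3. Synthesize a new example on the segment between x and z.
    return x + rng.random() * (z - x)
\end{verbatim}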
\section{Decision trees}

View File

@@ -0,0 +1,137 @@
\chapter{Clustering}
\section{Similarity and dissimilarity}
\begin{description}
\item[Similarity] \marginnote{Similarity}
Measures how alike two objects are.
Often defined in the range $[0, 1]$.
\item[Dissimilarity] \marginnote{Dissimilarity}
Measures how two objects differ.
A value of 0 indicates no difference, while the upper bound depends on the measure.
\end{description}
\begin{table}[ht]
\centering
\renewcommand{\arraystretch}{2}
\begin{tabular}{c | c | c}
\textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
\hline
Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
\hline
Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
\hline
Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
\end{tabular}
\caption{Similarity and dissimilarity by attribute type}
\end{table}
\begin{description}
\item[Similarity properties] \phantom{}
\begin{enumerate}
\item $\texttt{sim}(p, q) = 1$ iff $p = q$.
\item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
\end{enumerate}
\end{description}
\subsection{Distance}
Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are:
\begin{descriptionlist}
\item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
\[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
where $r$ is a parameter.
Common values for $r$ are:
\begin{descriptionlist}
\item[$r = 1$]
Corresponds to the Manhattan distance ($L_1$ norm).
It is useful to distinguish zero from near-zero distances, since an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
\item[$r = 2$]
Corresponds to the Euclidean distance or $L_2$ norm.
\item[$r = \infty$]
Corresponds to the $L_\infty$ norm.
Considers only the dimension with the maximum difference.
\end{descriptionlist}
\item[Mahalanobis distance] \marginnote{Mahalanobis distance}
\[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
where $\matr{\Sigma}$ is the covariance matrix of the dataset.
For the same Euclidean distance, the Mahalanobis distance between $p$ and $q$ is smaller when the segment connecting them points towards a direction of greater variation of the data, and larger along directions of low variation.
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
\caption{The Mahalanobis distance between $(A, B)$ is greater than $(A, C)$, while the Euclidean distance is the same.}
\end{figure}
\end{descriptionlist}
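A minimal NumPy sketch of these two distance metrics (function names are illustrative):
\begin{verbatim}
import numpy as np

def minkowski(p, q, r=2):
    """Minkowski (L_r) distance: r=1 Manhattan, r=2 Euclidean, np.inf for L_inf."""
    if np.isinf(r):
        return np.max(np.abs(p - q))
    return np.sum(np.abs(p - q) ** r) ** (1.0 / r)

def mahalanobis(p, q, X):
    """Mahalanobis distance of p and q w.r.t. the covariance of the dataset X."""
    cov_inv = np.linalg.inv(np.cov(X, rowvar=False))  # rows of X are samples
    diff = p - q
    return np.sqrt(diff @ cov_inv @ diff)
\end{verbatim}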
\subsubsection{Distance properties}
\begin{descriptionlist}
\item[Positive definiteness]
$\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
\item[Symmetry]
$\texttt{dist}(p, q) = \texttt{dist}(q, p)$
\item[Triangle inequality]
$\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
\end{descriptionlist}
\subsection{Vector similarity}
\begin{description}
\item[Binary vectors]
Given two examples $p$ and $q$ with binary features, we can compute the following values:
\[
\begin{split}
M_{00} &= \text{ number of features that are 0 in both $p$ and $q$} \\
M_{01} &= \text{ number of features that are 0 in $p$ and 1 in $q$} \\
M_{10} &= \text{ number of features that are 1 in $p$ and 0 in $q$} \\
M_{11} &= \text{ number of features that are 1 in both $p$ and $q$}
\end{split}
\]
Possible similarity measures are:
\begin{descriptionlist}
\item[Simple matching coefficient] \marginnote{Simple matching coefficient}
$\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
\item[Jaccard coefficient] \marginnote{Jaccard coefficient}
$\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
\end{descriptionlist}
\item[Cosine similarity] \marginnote{Cosine similarity}
Cosine of the angle between two vectors:
\[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]
\item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
Variation of the Jaccard coefficient for continuous values:
\[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
\end{description}
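A minimal NumPy sketch of these similarity measures, with binary vectors encoded as 0/1 arrays (function names are illustrative):
\begin{verbatim}
import numpy as np

def smc(p, q):
    """Simple matching coefficient: fraction of positions where p and q agree."""
    return np.mean(p == q)

def jaccard(p, q):
    """Jaccard coefficient: ignores the positions that are 0 in both vectors."""
    m11 = np.sum((p == 1) & (q == 1))
    mismatches = np.sum(p != q)          # M_01 + M_10
    return m11 / (m11 + mismatches)

def cosine(p, q):
    """Cosine of the angle between two vectors."""
    return (p @ q) / (np.linalg.norm(p) * np.linalg.norm(q))

def tanimoto(p, q):
    """Extended Jaccard coefficient for continuous vectors."""
    pq = p @ q
    return pq / (p @ p + q @ q - pq)
\end{verbatim}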
\subsection{Correlation}
\begin{description}
\item[Pearson's correlation] \marginnote{Pearson's correlation}
Measure of linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
To compute Pearson's correlation over $n$ samples, the values of $e_1$ and $e_2$ are first standardized and collected into the vectors $\vec{e}_1$ and $\vec{e}_2$.
The correlation is then the dot product between $\vec{e}_1$ and $\vec{e}_2$, averaged over the samples:
\[ \texttt{corr}(e_1, e_2) = \frac{1}{n} \langle \vec{e}_1, \vec{e}_2 \rangle \]
(with $\frac{1}{n-1}$ when the sample standard deviation is used for standardization).
Pearson's correlation has the following properties:
\begin{itemize}
\item If the variables are independent, then the correlation is 0 (but not vice versa).
\item If the correlation is 0, then there is no linear relationship between the variables.
\item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
\end{itemize}
\item[Symmetric uncertainty]
Measure of correlation for nominal attributes:
\[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
where $H$ is the entropy.
\end{description}
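A minimal NumPy sketch of these two correlation measures, using the population standard deviation so the dot product is averaged over $n$ (function names are illustrative):
\begin{verbatim}
import numpy as np
from collections import Counter

def pearson(x, y):
    """Pearson's correlation: averaged dot product of the standardized vectors."""
    x_std = (x - x.mean()) / x.std()
    y_std = (y - y.mean()) / y.std()
    return np.mean(x_std * y_std)

def entropy(values):
    """Shannon entropy of a sequence of nominal values."""
    counts = np.array(list(Counter(values).values()), dtype=float)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def symmetric_uncertainty(a, b):
    """U(a, b) = 2 * (H(a) + H(b) - H(a, b)) / (H(a) + H(b))."""
    h_a, h_b = entropy(a), entropy(b)
    h_ab = entropy(list(zip(a, b)))      # joint entropy H(a, b)
    return 2 * (h_a + h_b - h_ab) / (h_a + h_b)
\end{verbatim}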

View File

@@ -0,0 +1,163 @@
\chapter{Data preprocessing}
\section{Aggregation}
\marginnote{Aggregation}
Combining multiple attributes into a single one.
Useful for:
\begin{descriptionlist}
\item[Data reduction]
Reduce the number of attributes.
\item[Change of scale]
View the data at a more general level of detail (e.g. from cities and regions to countries).
\item[Data stability]
Aggregated data tend to have less variability.
\end{descriptionlist}
\section{Sampling}
\marginnote{Sampling}
Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
Obviously a sample has to be representative.
Types of sampling techniques are:
\begin{descriptionlist}
\item[Simple random] \marginnote{Simple random}
Extraction of a single element following a given probability distribution.
\item[With replacement] \marginnote{With replacement}
Multiple extractions with repetitions following a given probability distribution
(i.e. multiple simple random extractions).
If the population is small, the sample may contain duplicates and underrepresent the actual population.
\item[Without replacement] \marginnote{Without replacement}
Multiple extractions without repetitions following a given probability distribution.
\item[Stratified] \marginnote{Stratified}
Split the data and sample from each partition.
Useful when the partitions are homogeneous.
\end{descriptionlist}
\begin{description}
\item[Sample size]
The sample size represents a trade-off between data reduction and precision.
In a labeled dataset, it is important to consider the probability of sampling data of all the possible classes.
\end{description}
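As an illustration, a minimal NumPy sketch of stratified sampling without replacement on a labeled dataset (the function name and parameters are illustrative):
\begin{verbatim}
import numpy as np

def stratified_sample(X, y, fraction, seed=0):
    """Sample the same fraction from each class partition defined by the labels y."""
    rng = np.random.default_rng(seed)
    chosen = []
    for label in np.unique(y):
        idx = np.where(y == label)[0]
        n = max(1, int(round(fraction * len(idx))))
        chosen.append(rng.choice(idx, size=n, replace=False))  # without replacement
    chosen = np.concatenate(chosen)
    return X[chosen], y[chosen]
\end{verbatim}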
\section{Dimensionality reduction}
\begin{description}
\item[Curse of dimensionality] \marginnote{Curse of dimensionality}
Data with a high number of dimensions result in a sparse feature space
where distance metrics are ineffective.
\item[Dimensionality reduction] \marginnote{Dimensionality reduction}
Useful to:
\begin{itemize}
\item Avoid the curse of dimensionality.
\item Reduce noise.
\item Reduce the time and space complexity of mining and learning algorithms.
\item Visualize multi-dimensional data.
\end{itemize}
\end{description}
\subsection{Principal component analysis} \marginnote{PCA}
Projection of the data into a lower-dimensional space that maximizes the variance of the data.
It can be proven that the optimal projection directions are the eigenvectors of the covariance matrix of the data associated with the largest eigenvalues.
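A minimal NumPy sketch of PCA via the eigendecomposition of the covariance matrix (the function name is illustrative):
\begin{verbatim}
import numpy as np

def pca(X, n_components):
    """Project X onto the n_components directions of maximum variance."""
    X_centered = X - X.mean(axis=0)
    cov = np.cov(X_centered, rowvar=False)
    eigvals, eigvecs = np.linalg.eigh(cov)        # eigenvalues in ascending order
    top = eigvecs[:, np.argsort(eigvals)[::-1][:n_components]]
    return X_centered @ top                       # lower-dimensional projection
\end{verbatim}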
\subsection{Feature subset selection} \marginnote{Feature subset selection}
Local technique to reduce dimensionality by:
\begin{itemize}
\item Removing redundant attributes.
\item Removing irrelevant attributes.
\end{itemize}
This can be achieved by:
\begin{descriptionlist}
\item[Brute force]
Try all the possible subsets of the dataset.
\item[Embedded approach]
Feature selection is naturally done by the learning algorithm (e.g. decision trees).
\item[Filter approach]
Features are filtered using domain-specific knowledge.
\item[Wrapper approach]
A mining algorithm is used to select the best features.
\end{descriptionlist}
\section{Feature creation}
\marginnote{Feature creation}
Useful to help a learning algorithm capture data characteristics.
Possible approaches are:
\begin{descriptionlist}
\item[Feature extraction]
New features are extracted from the existing ones (e.g. from a picture of a face, the distance between the eyes can be a new feature).
\item[Mapping]
Projecting the data into a new feature space.
\item[New features]
Add new, possibly redundant, features.
\end{descriptionlist}
\section{Data type conversion}
\subsection{One-hot encoding} \marginnote{One-hot encoding}
A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
$n$ new binary features $H_{e_1}, \dots, H_{e_n}$ each corresponding to a value of $E$.
For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the rest are \texttt{false}.
\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
A feature whose values have an ordering can be converted into a sequence of consecutive integers
(e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).
\subsection{Discretization} \marginnote{Discretization}
Convert a continuous feature to a discrete one.
\begin{description}
\item[Binarization] \marginnote{Binarization}
Given a continuous feature and a threshold,
it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.
\item[Thresholding] \marginnote{Thresholding}
Same as binarization but using multiple thresholds.
\item[K-bins] \marginnote{K-bins}
A continuous feature is discretized into $k$ bins, each mapped to an integer from $0$ to $k-1$.
\end{description}
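A minimal NumPy sketch of these conversions (function names and the equal-width binning choice are illustrative):
\begin{verbatim}
import numpy as np

def one_hot(values):
    """One-hot encode a 1-D array of discrete values."""
    categories = np.unique(values)
    return (values[:, None] == categories[None, :]).astype(int), categories

def ordinal_encode(values, ordering):
    """Map ordered categories to consecutive integers,
    e.g. ordering ['bad', 'neutral', 'good'] -> 0, 1, 2."""
    lookup = {v: i for i, v in enumerate(ordering)}
    return np.array([lookup[v] for v in values])

def k_bins(x, k):
    """Discretize a continuous feature into k equal-width bins labeled 0 to k-1."""
    edges = np.linspace(x.min(), x.max(), k + 1)
    return np.clip(np.digitize(x, edges[1:-1]), 0, k - 1)
\end{verbatim}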
\section{Attribute transformation}
Useful for normalizing features that have different scales or contain outliers.
\begin{description}
\item[Mapping] \marginnote{Mapping}
Map the domain of a feature into a new set of values (i.e. apply a function).
\item[Standardization] \marginnote{Standardization}
Transform a feature so that it has zero mean and unit variance (a standard normal distribution if the feature is Gaussian):
\[ x' = \frac{x - \mu}{\sigma} \]
\item[Rescaling] \marginnote{Rescaling}
Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).
\item[Affine transformation] \marginnote{Affine transformation}
Apply a linear transformation to a feature before rescaling it.
This method is more robust to outliers.
\item[Normalization] \marginnote{Normalization}
Normalize each data row to unit norm.
\end{description}
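A minimal NumPy sketch of standardization, rescaling, and row normalization (function names are illustrative):
\begin{verbatim}
import numpy as np

def standardize(x):
    """Zero mean and unit variance."""
    return (x - x.mean()) / x.std()

def rescale(x, low=0.0, high=1.0):
    """Min-max rescaling of a feature into the range [low, high]."""
    x01 = (x - x.min()) / (x.max() - x.min())
    return low + x01 * (high - low)

def normalize_rows(X):
    """Scale each data row to unit L2 norm."""
    return X / np.linalg.norm(X, axis=1, keepdims=True)
\end{verbatim}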