Mirror of https://github.com/NotXia/unibo-ai-notes.git
Synced 2025-12-15 02:52:22 +01:00

Commit: Add ML/DM data preprocessing

New file (binary): src/machine-learning-and-data-mining/img/mahalanobis.png (35 KiB)
@@ -29,7 +29,9 @@
\input{sections/_data_lake.tex}
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_data_prepro.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}
\input{sections/_clustering.tex}

\end{document}
@@ -269,14 +269,6 @@ a macro (unweighted) average or a class-weighted average.
When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = 1$),
when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$) and
when $\kappa = 0$, there is random agreement.

\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
    Assign a cost to the errors. This can be done by:
    \begin{itemize}
        \item Altering the proportions of the dataset by duplicating samples to reduce its misclassification.
        \item Weighting the classes (possible in some algorithms).
    \end{itemize}
\end{description}

@@ -317,6 +309,35 @@
\end{description}


\subsection{Data imbalance}
A classifier may not perform well when predicting a minority class of the training data.
Possible solutions are:
\begin{descriptionlist}
    \item[Undersampling] \marginnote{Undersampling}
        Randomly reduce the number of examples of the majority classes.

    \item[Oversampling] \marginnote{Oversampling}
        Increase the number of examples of the minority classes.

        \begin{description}
            \item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
                \begin{enumerate}
                    \item Randomly select an example $x$ belonging to the minority class.
                    \item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
                    \item Synthesize a new example by selecting a random point of the feature space between $x$ and $z_i$ (see the sketch after this list).
                \end{enumerate}
        \end{description}

    \item[Cost sensitive learning] \marginnote{Cost sensitive learning}
        Assign a cost to the errors, with higher costs assigned to the minority classes.
        This can be done by:
        \begin{itemize}
            \item Altering the class proportions of the dataset by duplicating samples of the classes whose misclassification is more costly.
            \item Weighting the classes (possible in some algorithms).
        \end{itemize}
\end{descriptionlist}
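
A minimal sketch of the SMOTE interpolation step, assuming numeric features and using \texttt{numpy} (function and variable names are illustrative, not taken from a specific library):
\begin{verbatim}
import numpy as np

def smote_sample(X_minority, k=5, rng=np.random.default_rng(0)):
    # 1. randomly select a minority example x
    x = X_minority[rng.integers(len(X_minority))]
    # 2. pick a random neighbor z among the k nearest neighbors of x
    dists = np.linalg.norm(X_minority - x, axis=1)
    z = X_minority[rng.choice(np.argsort(dists)[1:k + 1])]
    # 3. interpolate a new synthetic example between x and z
    alpha = rng.random()
    return x + alpha * (z - x)

X_min = np.array([[1.0, 2.0], [1.5, 1.8], [2.0, 2.2], [1.2, 2.1]])
synthetic = smote_sample(X_min, k=3)
\end{verbatim}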


\section{Decision trees}
src/machine-learning-and-data-mining/sections/_clustering.tex (new file, 137 lines)
@@ -0,0 +1,137 @@
\chapter{Clustering}


\section{Similarity and dissimilarity}

\begin{description}
    \item[Similarity] \marginnote{Similarity}
        Measures how alike two objects are.
        It is often defined in the range $[0, 1]$.

    \item[Dissimilarity] \marginnote{Dissimilarity}
        Measures how much two objects differ.
        A value of 0 indicates no difference, while the upper bound varies.
\end{description}

\begin{table}[ht]
    \centering
    \renewcommand{\arraystretch}{2}
    \begin{tabular}{c | c | c}
        \textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
        \hline
        Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
        \hline
        Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
        \hline
        Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
    \end{tabular}
    \caption{Similarity and dissimilarity by attribute type}
\end{table}

\begin{description}
    \item[Similarity properties] \phantom{}
        \begin{enumerate}
            \item $\texttt{sim}(p, q) = 1$ iff $p = q$.
            \item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
        \end{enumerate}
\end{description}


\subsection{Distance}

Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are (a numerical sketch follows the list):
\begin{descriptionlist}
    \item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
        \[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
        where $r$ is a parameter.

        Common values for $r$ are:
        \begin{descriptionlist}
            \item[$r = 1$]
                Corresponds to the $L_1$ norm.
                It is useful for discriminating between zero and near-zero distances, as
                an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
            \item[$r = 2$]
                Corresponds to the Euclidean distance or $L_2$ norm.
            \item[$r = \infty$]
                Corresponds to the $L_\infty$ norm.
                It considers only the dimension with the maximum difference.
        \end{descriptionlist}

    \item[Mahalanobis distance] \marginnote{Mahalanobis distance}
        \[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
        where $\matr{\Sigma}$ is the covariance matrix of the dataset.
        The Mahalanobis distance between $p$ and $q$ increases when the segment connecting them
        points towards a direction of greater variation of the data.

        \begin{figure}[h]
            \centering
            \includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
            \caption{The Mahalanobis distance between $A$ and $B$ is greater than the one between $A$ and $C$, while the Euclidean distances are the same.}
        \end{figure}
\end{descriptionlist}
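
As a rough numerical sketch of the metrics above (using \texttt{numpy}; the data points and dataset are made up for illustration):
\begin{verbatim}
import numpy as np

p = np.array([1.0, 2.0, 3.0])
q = np.array([2.0, 0.0, 3.5])

def minkowski(p, q, r):
    return np.sum(np.abs(p - q) ** r) ** (1.0 / r)

l1   = minkowski(p, q, 1)        # L1 (Manhattan) distance
l2   = minkowski(p, q, 2)        # L2 (Euclidean) distance
linf = np.max(np.abs(p - q))     # L_infinity (limit for r -> infinity)

# Mahalanobis distance with respect to a toy dataset X (rows = examples)
X = np.array([[1.0, 2.0, 3.0], [2.0, 1.0, 4.0], [3.0, 3.0, 2.0],
              [0.0, 1.0, 1.0], [2.0, 4.0, 5.0]])
Sigma_inv = np.linalg.inv(np.cov(X, rowvar=False))
diff = p - q
mahalanobis = np.sqrt(diff @ Sigma_inv @ diff)
\end{verbatim}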

\subsubsection{Distance properties}
\begin{descriptionlist}
    \item[Positive definiteness]
        $\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
    \item[Symmetry]
        $\texttt{dist}(p, q) = \texttt{dist}(q, p)$
    \item[Triangle inequality]
        $\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
\end{descriptionlist}



\subsection{Vector similarity}

\begin{description}
    \item[Binary vectors]
        Given two examples $p$ and $q$ with binary features, we can compute the following values:
        \[
            \begin{split}
                M_{00} &= \text{ number of features that are 0 for both $p$ and $q$} \\
                M_{01} &= \text{ number of features that are 0 for $p$ and 1 for $q$} \\
                M_{10} &= \text{ number of features that are 1 for $p$ and 0 for $q$} \\
                M_{11} &= \text{ number of features that are 1 for both $p$ and $q$}
            \end{split}
        \]
        Possible similarity measures are (see the sketch after this list):
        \begin{descriptionlist}
            \item[Simple matching coefficient] \marginnote{Simple matching coefficient}
                $\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
            \item[Jaccard coefficient] \marginnote{Jaccard coefficient}
                $\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
        \end{descriptionlist}

    \item[Cosine similarity] \marginnote{Cosine similarity}
        Cosine of the angle between two vectors:
        \[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]

    \item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
        Variation of the Jaccard coefficient for continuous values:
        \[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
\end{description}
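
A small illustrative computation of these measures with \texttt{numpy} (the vectors are made up):
\begin{verbatim}
import numpy as np

# Binary vectors
p = np.array([1, 0, 0, 1, 1, 0])
q = np.array([1, 1, 0, 0, 1, 0])
m00 = np.sum((p == 0) & (q == 0))
m01 = np.sum((p == 0) & (q == 1))
m10 = np.sum((p == 1) & (q == 0))
m11 = np.sum((p == 1) & (q == 1))
smc = (m00 + m11) / (m00 + m01 + m10 + m11)  # simple matching coefficient
jc  = m11 / (m01 + m10 + m11)                # Jaccard coefficient

# Continuous vectors
a = np.array([3.0, 2.0, 0.0, 5.0])
b = np.array([1.0, 0.0, 0.0, 2.0])
cosine   = a @ b / (np.linalg.norm(a) * np.linalg.norm(b))
tanimoto = a @ b / (a @ a + b @ b - a @ b)   # extended Jaccard
\end{verbatim}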


\subsection{Correlation}

\begin{description}
    \item[Pearson's correlation] \marginnote{Pearson's correlation}
        Measure of the linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
        To compute Pearson's correlation, the values of $e_1$ and $e_2$ are first standardized and then arranged into the vectors $\vec{e}_1$ and $\vec{e}_2$.
        The correlation is then computed as the dot product between $\vec{e}_1$ and $\vec{e}_2$ (see the sketch after this list):
        \[ \texttt{corr}(e_1, e_2) = \langle \vec{e}_1, \vec{e}_2 \rangle \]

        Pearson's correlation has the following properties:
        \begin{itemize}
            \item If the variables are independent, then the correlation is 0 (but not vice versa).
            \item If the correlation is 0, then there is no linear relationship between the variables.
            \item A value of $+1$ implies a perfect positive linear relationship, while $-1$ implies a perfect negative linear relationship.
        \end{itemize}

    \item[Symmetric uncertainty]
        Measure of correlation for nominal attributes:
        \[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
        where $H$ is the entropy.
\end{description}
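
A sketch of both measures with \texttt{numpy} (the data is illustrative and the \texttt{entropy} helper is hypothetical, not from a library):
\begin{verbatim}
import numpy as np

# Pearson's correlation: standardize, then average the products
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])
xs = (x - x.mean()) / x.std()
ys = (y - y.mean()) / y.std()
pearson = (xs @ ys) / len(x)      # same value as np.corrcoef(x, y)[0, 1]

# Symmetric uncertainty for nominal attributes
def entropy(values):
    _, counts = np.unique(values, return_counts=True)
    probs = counts / counts.sum()
    return -np.sum(probs * np.log2(probs))

e1 = ["a", "a", "b", "b", "c"]
e2 = ["x", "x", "y", "y", "y"]
h1, h2 = entropy(e1), entropy(e2)
h12 = entropy([u + "|" + v for u, v in zip(e1, e2)])  # joint entropy
sym_unc = 2 * (h1 + h2 - h12) / (h1 + h2)
\end{verbatim}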

src/machine-learning-and-data-mining/sections/_data_prepro.tex (new file, 163 lines)
@@ -0,0 +1,163 @@
\chapter{Data preprocessing}


\section{Aggregation}
\marginnote{Aggregation}

Combining multiple attributes into a single one.
It is useful for:
\begin{descriptionlist}
    \item[Data reduction]
        Reduce the number of attributes.

    \item[Change of scale]
        View the data at a more general level of detail (e.g. from cities and regions to countries).

    \item[Data stability]
        Aggregated data tend to have less variability.
\end{descriptionlist}



\section{Sampling}
\marginnote{Sampling}
Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
Obviously, a sample has to be representative.

Possible sampling techniques are (a sketch follows the list):
\begin{descriptionlist}
    \item[Simple random] \marginnote{Simple random}
        Extraction of a single element following a given probability distribution.

    \item[With replacement] \marginnote{With replacement}
        Multiple extractions with repetitions following a given probability distribution
        (i.e. multiple simple random extractions).

        If the population is small, the sample may under-represent the actual population.

    \item[Without replacement] \marginnote{Without replacement}
        Multiple extractions without repetitions following a given probability distribution.

    \item[Stratified] \marginnote{Stratified}
        Split the data into partitions and sample from each partition.
        Useful when the partitions are homogeneous.
\end{descriptionlist}
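
A rough sketch of these schemes with \texttt{numpy} (toy data and illustrative names):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
data = np.arange(100)                       # toy population
labels = np.repeat(["a", "b"], [80, 20])    # toy partitions (strata)

simple = data[rng.integers(len(data))]                    # simple random
with_repl = rng.choice(data, size=10, replace=True)       # with replacement
without_repl = rng.choice(data, size=10, replace=False)   # without replacement

# Stratified: sample the same fraction from each partition
frac = 0.1
stratified = np.concatenate([
    rng.choice(data[labels == c],
               size=int(frac * np.sum(labels == c)),
               replace=False)
    for c in np.unique(labels)
])
\end{verbatim}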

\begin{description}
    \item[Sample size]
        The sample size represents a trade-off between data reduction and precision.
        In a labeled dataset, it is important to consider the probability of sampling data of all the possible classes.
\end{description}




\section{Dimensionality reduction}

\begin{description}
    \item[Curse of dimensionality] \marginnote{Curse of dimensionality}
        Data with a high number of dimensions result in a sparse feature space
        where distance metrics are ineffective.

    \item[Dimensionality reduction] \marginnote{Dimensionality reduction}
        Useful to:
        \begin{itemize}
            \item Avoid the curse of dimensionality.
            \item Reduce noise.
            \item Reduce the time and space complexity of mining and learning algorithms.
            \item Visualize multi-dimensional data.
        \end{itemize}
\end{description}


\subsection{Principal component analysis} \marginnote{PCA}
Projection of the data into a lower-dimensional space that maximizes the variance of the data.
It can be proven that this problem can be solved by finding the eigenvectors of the covariance matrix of the data.
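
A minimal sketch of this idea with \texttt{numpy} (random toy data; in practice a library routine such as \texttt{sklearn.decomposition.PCA} would typically be used):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))            # toy dataset: 100 examples, 5 features
X_centered = X - X.mean(axis=0)

# Eigendecomposition of the covariance matrix
cov = np.cov(X_centered, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)   # eigh since cov is symmetric

# Keep the k eigenvectors with the largest eigenvalues and project
k = 2
components = eigvecs[:, np.argsort(eigvals)[::-1][:k]]
X_reduced = X_centered @ components      # shape: (100, k)
\end{verbatim}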


\subsection{Feature subset selection} \marginnote{Feature subset selection}
Local technique to reduce dimensionality by:
\begin{itemize}
    \item Removing redundant attributes.
    \item Removing irrelevant attributes.
\end{itemize}

This can be achieved by:
\begin{descriptionlist}
    \item[Brute force]
        Try all the possible subsets of features.

    \item[Embedded approach]
        Feature selection is naturally done by the learning algorithm (e.g. decision trees).

    \item[Filter approach]
        Features are filtered using domain-specific knowledge.

    \item[Wrapper approach]
        A mining algorithm is used to select the best features.
\end{descriptionlist}




\section{Feature creation}
\marginnote{Feature creation}
Useful to help a learning algorithm capture data characteristics.
Possible approaches are:
\begin{descriptionlist}
    \item[Feature extraction]
        Features extracted from the existing ones (e.g. from a picture of a face, the eye distance can be a new feature).

    \item[Mapping]
        Projecting the data into a new feature space.

    \item[New features]
        Add new, possibly redundant, features.
\end{descriptionlist}



\section{Data type conversion}

\subsection{One-hot encoding} \marginnote{One-hot encoding}
A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
$n$ new binary features $H_{e_1}, \dots, H_{e_n}$, each corresponding to a value of $E$.
For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the rest are \texttt{false}.

\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
A feature whose values have an ordering can be converted into a sequence of consecutive integers
(e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).

\subsection{Discretization} \marginnote{Discretization}
Convert a continuous feature into a discrete one (a sketch of these conversions follows the list).
\begin{description}
    \item[Binarization] \marginnote{Binarization}
        Given a continuous feature and a threshold,
        it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.

    \item[Thresholding] \marginnote{Thresholding}
        Same as binarization but using multiple thresholds.

    \item[K-bins] \marginnote{K-bins}
        A continuous feature is discretized using $k$ bins, each representing an integer from $0$ to $k-1$.
\end{description}
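
A small sketch of these conversions with \texttt{numpy} (toy values; libraries such as scikit-learn provide \texttt{OneHotEncoder}, \texttt{OrdinalEncoder} and \texttt{KBinsDiscretizer} for the same purpose):
\begin{verbatim}
import numpy as np

# One-hot encoding: one binary column per unique value
colors = np.array(["red", "green", "blue", "green"])
values = np.unique(colors)                        # ['blue', 'green', 'red']
one_hot = (colors[:, None] == values[None, :]).astype(int)

# Ordinal encoding: map ordered values to consecutive integers
order = {"good": 1, "neutral": 0, "bad": -1}
ratings = ["good", "bad", "neutral"]
ordinal = np.array([order[r] for r in ratings])

# Discretization: binarization and k-bins (equal-width)
x = np.array([0.10, 0.40, 0.35, 0.80, 0.95])
binary = (x > 0.5).astype(int)                    # single threshold
k = 3
edges = np.linspace(x.min(), x.max(), k + 1)[1:-1]
bins = np.digitize(x, edges)                      # integers in 0 .. k-1
\end{verbatim}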



\section{Attribute transformation}
Useful for normalizing features that have different scales or contain outliers.
A sketch of the main transformations follows the list.

\begin{description}
    \item[Mapping] \marginnote{Mapping}
        Map the domain of a feature into a new set of values (i.e. apply a function).

    \item[Standardization] \marginnote{Standardization}
        Transform a feature with a Gaussian distribution into one with a standard normal distribution (zero mean and unit variance):
        \[ x' = \frac{x - \mu}{\sigma} \]

    \item[Rescaling] \marginnote{Rescaling}
        Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).

    \item[Affine transformation] \marginnote{Affine transformation}
        Apply a linear transformation to a feature before rescaling it.
        This method is more robust to outliers.

    \item[Normalization] \marginnote{Normalization}
        Normalize each data row to unit norm.
\end{description}
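
A sketch of standardization, rescaling and row normalization with \texttt{numpy} (toy matrix; scikit-learn's \texttt{StandardScaler}, \texttt{MinMaxScaler} and \texttt{Normalizer} implement the same ideas):
\begin{verbatim}
import numpy as np

X = np.array([[1.0,  200.0],
              [2.0,  300.0],
              [3.0, 1000.0]])

# Standardization: zero mean and unit variance per feature (column)
X_std = (X - X.mean(axis=0)) / X.std(axis=0)

# Rescaling each feature to [0, 1]
X_minmax = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))

# Normalization: each row scaled to unit L2 norm
X_norm = X / np.linalg.norm(X, axis=1, keepdims=True)
\end{verbatim}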