diff --git a/src/machine-learning-and-data-mining/img/mahalanobis.png b/src/machine-learning-and-data-mining/img/mahalanobis.png
new file mode 100644
index 0000000..fc5520d
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/mahalanobis.png differ
diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex
index 3593190..e25791d 100644
--- a/src/machine-learning-and-data-mining/main.tex
+++ b/src/machine-learning-and-data-mining/main.tex
@@ -29,7 +29,9 @@
 \input{sections/_data_lake.tex}
 \input{sections/_crisp.tex}
 \input{sections/_machine_learning.tex}
+ \input{sections/_data_prepro.tex}
 \input{sections/_classification.tex}
 \input{sections/_regression.tex}
+ \input{sections/_clustering.tex}

 \end{document}
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_classification.tex b/src/machine-learning-and-data-mining/sections/_classification.tex
index 01c8909..33310b2 100644
--- a/src/machine-learning-and-data-mining/sections/_classification.tex
+++ b/src/machine-learning-and-data-mining/sections/_classification.tex
@@ -269,14 +269,6 @@ a macro (unweighted) average or a class-weighted average.
     When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = 1$),
     when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$)
     and when $\kappa = 0$, there is random agreement.
-
-
-    \item[Cost sensitive learning] \marginnote{Cost sensitive learning}
-    Assign a cost to the errors. This can be done by:
-    \begin{itemize}
-        \item Altering the proportions of the dataset by duplicating samples to reduce its misclassification.
-        \item Weighting the classes (possible in some algorithms).
-    \end{itemize}
 \end{description}


@@ -317,6 +309,35 @@ a macro (unweighted) average or a class-weighted average.
 \end{description}


+\subsection{Data imbalance}
+A classifier may not perform well when predicting a minority class of the training data.
+Possible solutions are:
+\begin{descriptionlist}
+    \item[Undersampling] \marginnote{Undersampling}
+    Randomly reduce the number of examples of the majority classes.
+
+    \item[Oversampling] \marginnote{Oversampling}
+    Increase the number of examples of the minority classes.
+
+    \begin{description}
+        \item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
+        \begin{enumerate}
+            \item Randomly select an example $x$ belonging to the minority class.
+            \item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
+            \item Synthesize a new example by selecting a random point of the feature space between $x$ and $z_i$.
+        \end{enumerate}
+        A code sketch of this procedure is given at the end of this subsection.
+    \end{description}
+
+    \item[Cost sensitive learning] \marginnote{Cost sensitive learning}
+    Assign a cost to the errors; higher costs are assigned to the minority classes.
+    This can be done by:
+    \begin{itemize}
+        \item Altering the proportions of the dataset by duplicating minority samples to reduce their misclassification.
+        \item Weighting the classes (possible in some algorithms).
+    \end{itemize}
+\end{descriptionlist}
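+
+A minimal NumPy sketch of the SMOTE synthesis step (the function \texttt{smote\_sample} and its parameters are illustrative, not a reference implementation):
+\begin{verbatim}
+import numpy as np
+
+def smote_sample(X_min, k=5, rng=None):
+    """Synthesize one new example from the minority-class matrix X_min."""
+    rng = rng if rng is not None else np.random.default_rng()
+    x = X_min[rng.integers(len(X_min))]            # 1. random minority example
+    dists = np.linalg.norm(X_min - x, axis=1)      # distances to all minority examples
+    neighbors = X_min[np.argsort(dists)[1:k + 1]]  # 2. its k nearest neighbors
+    z = neighbors[rng.integers(len(neighbors))]    #    pick one neighbor at random
+    return x + rng.random() * (z - x)              # 3. random point between x and z
+\end{verbatim}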
+
+
 \section{Decision trees}


diff --git a/src/machine-learning-and-data-mining/sections/_clustering.tex b/src/machine-learning-and-data-mining/sections/_clustering.tex
new file mode 100644
index 0000000..3ba3d57
--- /dev/null
+++ b/src/machine-learning-and-data-mining/sections/_clustering.tex
@@ -0,0 +1,137 @@
+\chapter{Clustering}
+
+
+\section{Similarity and dissimilarity}
+
+\begin{description}
+    \item[Similarity] \marginnote{Similarity}
+    Measures how alike two objects are.
+    Often defined in the range $[0, 1]$.
+
+    \item[Dissimilarity] \marginnote{Dissimilarity}
+    Measures how much two objects differ.
+    0 indicates no difference, while the upper bound varies.
+\end{description}
+
+\begin{table}[ht]
+    \centering
+    \renewcommand{\arraystretch}{2}
+    \begin{tabular}{c | c | c}
+        \textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
+        \hline
+        Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
+        \hline
+        Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
+        \hline
+        Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
+    \end{tabular}
+    \caption{Similarity and dissimilarity by attribute type}
+\end{table}
+
+\begin{description}
+    \item[Similarity properties] \phantom{}
+    \begin{enumerate}
+        \item $\texttt{sim}(p, q) = 1$ iff $p = q$.
+        \item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
+    \end{enumerate}
+\end{description}
+
+
+\subsection{Distance}
+
+Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are (a code sketch follows the list):
+\begin{descriptionlist}
+    \item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
+    \[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
+    where $r$ is a parameter.
+
+    Common values for $r$ are:
+    \begin{descriptionlist}
+        \item[$r = 1$]
+        Corresponds to the $L_1$ norm.
+        It is useful for discriminating between zero and near-zero distances, as
+        an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
+        \item[$r = 2$]
+        Corresponds to the Euclidean distance or $L_2$ norm.
+        \item[$r = \infty$]
+        Corresponds to the $L_\infty$ norm.
+        Considers only the dimension with the maximum difference.
+    \end{descriptionlist}
+
+    \item[Mahalanobis distance] \marginnote{Mahalanobis distance}
+    \[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
+    where $\matr{\Sigma}$ is the covariance matrix of the dataset.
+    The Mahalanobis distance between $p$ and $q$ decreases when the segment connecting them
+    points towards a direction of greater variation of the data (and increases along directions of smaller variation).
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
+        \caption{The Mahalanobis distance between $A$ and $B$ is greater than the one between $A$ and $C$, while the Euclidean distances are the same.}
+    \end{figure}
+\end{descriptionlist}
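+
+The two metrics above can be computed with a few lines of NumPy; the following is a sketch that assumes the examples are row vectors and that \texttt{X} is the dataset matrix (one example per row):
+\begin{verbatim}
+import numpy as np
+
+def minkowski(p, q, r=2):
+    # L_r distance: r=1 is the L_1 norm, r=2 the Euclidean distance;
+    # the L_inf norm is simply np.max(np.abs(p - q)).
+    return np.sum(np.abs(p - q) ** r) ** (1.0 / r)
+
+def mahalanobis(p, q, X):
+    # The covariance matrix is estimated from the dataset X.
+    cov_inv = np.linalg.inv(np.cov(X, rowvar=False))
+    d = p - q
+    return np.sqrt(d @ cov_inv @ d)
+\end{verbatim}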
+
+\subsubsection{Distance properties}
+\begin{descriptionlist}
+    \item[Positive definiteness]
+    $\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
+    \item[Symmetry]
+    $\texttt{dist}(p, q) = \texttt{dist}(q, p)$
+    \item[Triangle inequality]
+    $\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
+\end{descriptionlist}
+
+
+
+\subsection{Vector similarity}
+
+\begin{description}
+    \item[Binary vectors]
+    Given two examples $p$ and $q$ with binary features, we can compute the following values:
+    \[
+        \begin{split}
+            M_{00} &= \text{ number of features equal to 0 for both $p$ and $q$} \\
+            M_{01} &= \text{ number of features equal to 0 for $p$ and 1 for $q$} \\
+            M_{10} &= \text{ number of features equal to 1 for $p$ and 0 for $q$} \\
+            M_{11} &= \text{ number of features equal to 1 for both $p$ and $q$}
+        \end{split}
+    \]
+    Possible similarity coefficients are:
+    \begin{descriptionlist}
+        \item[Simple matching coefficient] \marginnote{Simple matching coefficient}
+        $\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
+        \item[Jaccard coefficient] \marginnote{Jaccard coefficient}
+        $\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
+    \end{descriptionlist}
+
+    \item[Cosine similarity] \marginnote{Cosine similarity}
+    Cosine of the angle between two vectors:
+    \[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]
+
+    \item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
+    Variation of the Jaccard coefficient for continuous values:
+    \[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
+\end{description}
+
+
+\subsection{Correlation}
+
+\begin{description}
+    \item[Pearson's correlation] \marginnote{Pearson's correlation}
+    Measure of linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
+    To compute Pearson's correlation, the values of $e_1$ and $e_2$ are first standardized to obtain the vectors $\vec{e}_1$ and $\vec{e}_2$.
+    The correlation is then computed as the dot product between $\vec{e}_1$ and $\vec{e}_2$:
+    \[ \texttt{corr}(e_1, e_2) = \langle \vec{e}_1, \vec{e}_2 \rangle \]
+
+    Pearson's correlation has the following properties:
+    \begin{itemize}
+        \item If the variables are independent, then the correlation is 0 (but not vice versa).
+        \item If the correlation is 0, then there is no linear relationship between the variables.
+        \item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
+    \end{itemize}
+
+    \item[Symmetric uncertainty]
+    Measure of correlation for nominal attributes:
+    \[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
+    where $H$ is the entropy.
+\end{description}
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_data_prepro.tex b/src/machine-learning-and-data-mining/sections/_data_prepro.tex
new file mode 100644
index 0000000..95a1d88
--- /dev/null
+++ b/src/machine-learning-and-data-mining/sections/_data_prepro.tex
@@ -0,0 +1,163 @@
+\chapter{Data preprocessing}
+
+\section{Aggregation}
+\marginnote{Aggregation}
+
+Combining multiple attributes into a single one (a code sketch is given at the end of this section).
+Useful for:
+\begin{descriptionlist}
+    \item[Data reduction]
+    Reduce the number of attributes.
+
+    \item[Change of scale]
+    View the data at a more general level of detail (e.g. from cities and regions to countries).
+
+    \item[Data stability]
+    Aggregated data tend to have less variability.
+\end{descriptionlist}
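+
+As a sketch, aggregation with pandas on a made-up table (the columns are hypothetical); grouping cities into countries illustrates both the change of scale and the data reduction above:
+\begin{verbatim}
+import pandas as pd
+
+sales = pd.DataFrame({
+    "country": ["IT", "IT", "FR", "FR"],
+    "city":    ["Bologna", "Milano", "Paris", "Lyon"],
+    "revenue": [10.0, 20.0, 15.0, 5.0],
+})
+# Aggregate the revenue of the cities of each country (fewer rows, coarser scale).
+by_country = sales.groupby("country", as_index=False)["revenue"].sum()
+\end{verbatim}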
+
+
+
+\section{Sampling}
+\marginnote{Sampling}
+Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
+Obviously, a sample has to be representative.
+
+Types of sampling techniques are:
+\begin{descriptionlist}
+    \item[Simple random] \marginnote{Simple random}
+    Extraction of a single element following a given probability distribution.
+
+    \item[With replacement] \marginnote{With replacement}
+    Multiple extractions with repetitions following a given probability distribution
+    (i.e. multiple simple random extractions).
+
+    If the population is small, the sample may not represent the actual population well
+    (the same element can be extracted multiple times).
+
+    \item[Without replacement] \marginnote{Without replacement}
+    Multiple extractions without repetitions following a given probability distribution.
+
+    \item[Stratified] \marginnote{Stratified}
+    Split the data and sample from each partition.
+    Useful when the partitions are homogeneous.
+\end{descriptionlist}
+
+\begin{description}
+    \item[Sample size]
+    The sample size represents a tradeoff between data reduction and precision.
+    In a labeled dataset, it is important to consider the probability of sampling data from all the possible classes.
+\end{description}
+
+
+
+\section{Dimensionality reduction}
+
+\begin{description}
+    \item[Curse of dimensionality] \marginnote{Curse of dimensionality}
+    Data with a high number of dimensions result in a sparse feature space
+    where distance metrics are ineffective.
+
+    \item[Dimensionality reduction] \marginnote{Dimensionality reduction}
+    Useful to:
+    \begin{itemize}
+        \item Avoid the curse of dimensionality.
+        \item Reduce noise.
+        \item Reduce the time and space complexity of mining and learning algorithms.
+        \item Visualize multi-dimensional data.
+    \end{itemize}
+\end{description}
+
+\subsection{Principal component analysis} \marginnote{PCA}
+Projection of the data into a lower-dimensional space that maximizes the variance of the data.
+It can be proven that this problem can be solved by finding the eigenvectors of the covariance matrix of the data.
+
+\subsection{Feature subset selection} \marginnote{Feature subset selection}
+    Local technique to reduce dimensionality by:
+    \begin{itemize}
+        \item Removing redundant attributes.
+        \item Removing irrelevant attributes.
+    \end{itemize}
+
+    This can be achieved by:
+    \begin{descriptionlist}
+        \item[Brute force]
+        Try all the possible subsets of the dataset.
+
+        \item[Embedded approach]
+        Feature selection is naturally done by the learning algorithm (e.g. decision trees).
+
+        \item[Filter approach]
+        Features are filtered using domain-specific knowledge.
+
+        \item[Wrapper approach]
+        A mining algorithm is used to select the best features.
+    \end{descriptionlist}
+
+
+
+\section{Feature creation}
+\marginnote{Feature creation}
+Useful to help a learning algorithm capture data characteristics.
+Possible approaches are:
+\begin{descriptionlist}
+    \item[Feature extraction]
+    Features are extracted from the existing ones (e.g. from a picture of a face, the eye distance can be a new feature).
+
+    \item[Mapping]
+    Projecting the data into a new feature space.
+
+    \item[New features]
+    Add new, possibly redundant, features.
+\end{descriptionlist}
+
+
+
+\section{Data type conversion}
+
+\subsection{One-hot encoding} \marginnote{One-hot encoding}
+    A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
+    $n$ new binary features $H_{e_1}, \dots, H_{e_n}$, each corresponding to a value of $E$.
+    For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the rest are \texttt{false}.
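+
+    As a sketch, one-hot encoding of a hypothetical \texttt{color} feature with pandas:
+\begin{verbatim}
+import pandas as pd
+
+df = pd.DataFrame({"color": ["red", "green", "red", "blue"]})
+# One binary column per unique value of the original feature.
+one_hot = pd.get_dummies(df, columns=["color"])
+\end{verbatim}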
+
+\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
+    A feature whose values have an ordering can be converted into a sequence of consecutive integers
+    (e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).
+
+\subsection{Discretization} \marginnote{Discretization}
+    Convert a continuous feature to a discrete one.
+    \begin{description}
+        \item[Binarization] \marginnote{Binarization}
+        Given a continuous feature and a threshold,
+        it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.
+
+        \item[Thresholding] \marginnote{Thresholding}
+        Same as binarization but using multiple thresholds.
+
+        \item[K-bins] \marginnote{K-bins}
+        A continuous feature is discretized using $k$ bins, each representing an integer from $0$ to $k-1$.
+    \end{description}
+
+
+
+\section{Attribute transformation}
+Useful for normalizing features with different scales or with outliers (a code sketch follows the list).
+
+\begin{description}
+    \item[Mapping] \marginnote{Mapping}
+    Map the domain of a feature into a new set of values (i.e. apply a function).
+
+    \item[Standardization] \marginnote{Standardization}
+    Transform a feature with a Gaussian distribution into one with a standard normal distribution:
+    \[ x' = \frac{x - \mu}{\sigma} \]
+
+    \item[Rescaling] \marginnote{Rescaling}
+    Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).
+
+    \item[Affine transformation] \marginnote{Affine transformation}
+    Apply a linear transformation to a feature before rescaling it.
+    This method is more robust to outliers.
+
+    \item[Normalization] \marginnote{Normalization}
+    Normalize each data row to unit norm.
+\end{description}
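+
+A NumPy sketch of the most common transformations above (\texttt{X} is a made-up matrix with one row per data entry):
+\begin{verbatim}
+import numpy as np
+
+X = np.array([[1.0, 200.0],
+              [2.0, 300.0],
+              [3.0, 400.0]])
+X_std  = (X - X.mean(axis=0)) / X.std(axis=0)                   # standardization
+X_01   = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))  # rescaling to [0, 1]
+X_unit = X / np.linalg.norm(X, axis=1, keepdims=True)           # row-wise normalization
+\end{verbatim}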