diff --git a/src/machine-learning-and-data-mining/img/_iris_boxplot_general.pdf b/src/machine-learning-and-data-mining/img/_iris_boxplot_general.pdf
new file mode 100644
index 0000000..9367ac7
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_iris_boxplot_general.pdf differ
diff --git a/src/machine-learning-and-data-mining/img/_iris_boxplot_inside.pdf b/src/machine-learning-and-data-mining/img/_iris_boxplot_inside.pdf
new file mode 100644
index 0000000..13f2070
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_iris_boxplot_inside.pdf differ
diff --git a/src/machine-learning-and-data-mining/img/_iris_decision_tree_example.pdf b/src/machine-learning-and-data-mining/img/_iris_decision_tree_example.pdf
new file mode 100644
index 0000000..0a84c24
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_iris_decision_tree_example.pdf differ
diff --git a/src/machine-learning-and-data-mining/img/_iris_histogram.pdf b/src/machine-learning-and-data-mining/img/_iris_histogram.pdf
new file mode 100644
index 0000000..d42ac96
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_iris_histogram.pdf differ
diff --git a/src/machine-learning-and-data-mining/img/_iris_pairplot.pdf b/src/machine-learning-and-data-mining/img/_iris_pairplot.pdf
new file mode 100644
index 0000000..53f3ee2
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_iris_pairplot.pdf differ
diff --git a/src/machine-learning-and-data-mining/img/binary_entropy.png b/src/machine-learning-and-data-mining/img/binary_entropy.png
new file mode 100644
index 0000000..ce1bd36
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/binary_entropy.png differ
diff --git a/src/machine-learning-and-data-mining/img/impurity_comparison.png b/src/machine-learning-and-data-mining/img/impurity_comparison.png
new file mode 100644
index 0000000..edb12a2
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/impurity_comparison.png differ
diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex
index 1dd47b7..d9d0867 100644
--- a/src/machine-learning-and-data-mining/main.tex
+++ b/src/machine-learning-and-data-mining/main.tex
@@ -29,5 +29,6 @@
     \input{sections/_data_lake.tex}
     \input{sections/_crisp.tex}
     \input{sections/_machine_learning.tex}
+    \input{sections/_classification.tex}
 \end{document}
\ No newline at end of file
diff --git a/src/machine-learning-and-data-mining/sections/_classification.tex b/src/machine-learning-and-data-mining/sections/_classification.tex
new file mode 100644
index 0000000..43b5851
--- /dev/null
+++ b/src/machine-learning-and-data-mining/sections/_classification.tex
@@ -0,0 +1,239 @@
+\chapter{Classification}
+
+\begin{description}
+    \item[(Supervised) classification] \marginnote{Classification}
+    Given a finite set of classes $C$ and a dataset $\matr{X}$ of $N$ individuals,
+    each associated with a class $y(\vec{x}) \in C$,
+    we want to learn a model $\mathcal{M}$ able to
+    predict the value of $y(\bar{\vec{x}})$ for unseen individuals.
+
+    Classification can be:
+    \begin{descriptionlist}
+        \item[Crisp] \marginnote{Crisp classification}
+        Each individual has one and only one label.
+        \item[Probabilistic] \marginnote{Probabilistic classification}
+        Each individual is assigned a probability for each label.
+    \end{descriptionlist}
+
+    \item[Classification model] \marginnote{Classification model}
+    A classification model (classifier) predicts the class of a data element $\vec{x}$
+    through a decision function $y_\vec{\uptheta}$ parametrized by $\vec{\uptheta}$:
+    \[ \mathcal{M}(\vec{x}, \vec{\uptheta}) = y_\vec{\uptheta}(\vec{x}) \]
+
+    \item[Vapnik-Chervonenkis dimension] \marginnote{Vapnik-Chervonenkis dimension}
+    A dataset with $N$ elements defines $2^N$ binary learning problems.
+    A model $\mathcal{M}$ has Vapnik-Chervonenkis (VC) dimension $N$ if
+    $N$ is the largest number of elements for which it can solve
+    all the possible learning problems (i.e. it shatters the dataset).
+
+    \begin{example}
+        A straight line in the plane has VC dimension 3:
+        it can separate any labeling of 3 non-collinear points,
+        but not every labeling of 4 points (e.g. the XOR configuration).
+    \end{example}
+
+    \item[Data exploration] \marginnote{Data exploration}
+    Before training, plots such as boxplots, histograms, and pairplots
+    help to understand the distribution of the features and how well they separate the classes.
+    \begin{figure}[ht]
+        \begin{subfigure}{.5\textwidth}
+            \centering
+            \includegraphics[width=\linewidth]{img/_iris_boxplot_general.pdf}
+            \caption{Iris dataset general boxplot}
+        \end{subfigure}%
+        \begin{subfigure}{.5\textwidth}
+            \centering
+            \includegraphics[width=\linewidth]{img/_iris_boxplot_inside.pdf}
+            \caption{Iris dataset class boxplot}
+        \end{subfigure}
+        \begin{subfigure}{.5\textwidth}
+            \centering
+            \includegraphics[width=\linewidth]{img/_iris_histogram.pdf}
+            \caption{Iris dataset histograms}
+        \end{subfigure}%
+        \begin{subfigure}{.5\textwidth}
+            \centering
+            \includegraphics[width=\linewidth]{img/_iris_pairplot.pdf}
+            \caption{Iris dataset pairplots}
+        \end{subfigure}
+    \end{figure}
+
+    \item[Dataset split]
+    A supervised dataset can be randomly split into:
+    \begin{descriptionlist}
+        \item[Train set] \marginnote{Train set}
+        Used to learn the model. Usually the largest split.
+        \item[Test set] \marginnote{Test set}
+        Used to evaluate the trained model.
+        \item[Validation set] \marginnote{Validation set}
+        Used to evaluate the model during training.
+    \end{descriptionlist}
+    It is assumed that the splits have similar characteristics
+    (a splitting sketch is given at the end of this list).
+
+    \item[Overfitting] \marginnote{Overfitting}
+    Given a dataset $\matr{X}$, a model $\mathcal{M}$ is overfitting if
+    there exists another model $\mathcal{M}'$ such that:
+    \[
+        \begin{split}
+            \texttt{error}_\text{train}(\mathcal{M}) &< \texttt{error}_\text{train}(\mathcal{M}') \\
+            \texttt{error}_\matr{X}(\mathcal{M}) &> \texttt{error}_\matr{X}(\mathcal{M}') \\
+        \end{split}
+    \]
+
+    Possible causes of overfitting are:
+    \begin{itemize}
+        \item Noisy data.
+        \item Lack of representative instances.
+    \end{itemize}
+\end{description}
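+
+A minimal sketch of the split above, assuming \texttt{scikit-learn} and a dataset
+already loaded into a feature matrix \texttt{X} with labels \texttt{y}
+(the proportions are arbitrary):
+\begin{lstlisting}
+    from sklearn.model_selection import train_test_split
+
+    # Hold out 20% of the data as test set (stratify preserves
+    # the class distribution across the splits).
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, stratify=y, random_state=42)
+
+    # Carve a validation set out of the remaining training data.
+    X_train, X_val, y_train, y_val = train_test_split(
+        X_train, y_train, test_size=0.25, stratify=y_train, random_state=42)
+\end{lstlisting}
+
+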
+\section{Decision trees}
+
+\subsection{Information theory} \label{sec:information_theory}
+
+\begin{description}
+    \item[Shannon theorem] \marginnote{Shannon theorem}
+    Let $\matr{X} = \{ \vec{v}_1, \dots, \vec{v}_V \}$ be a data source where
+    each possible value $\vec{v}_i$ has probability $p_i = \prob{\vec{v}_i}$.
+    The best encoding makes it possible to transmit $\matr{X}$ with
+    an average number of bits given by the \textbf{entropy} of $\matr{X}$: \marginnote{Entropy}
+    \[ H(\matr{X}) = - \sum_j p_j \log_2(p_j) \]
+    $H(\matr{X})$ can be seen as a weighted sum of the surprise factors $-\log_2(p_j)$.
+    If $p_j \sim 1$, the surprise of observing $\vec{v}_j$ is low;
+    vice versa, if $p_j \sim 0$, the surprise of observing $\vec{v}_j$ is high.
+
+    Therefore, when $H(\matr{X})$ is high, $\matr{X}$ is close to a uniform distribution.
+    When $H(\matr{X})$ is low, $\matr{X}$ is close to a constant.
+
+    \begin{example}[Binary source] \phantom{}\\
+        \begin{minipage}{.50\linewidth}
+            The two values of a binary source $\matr{X}$ have probabilities $p$ and $(1-p)$, respectively.\\
+            When $p \sim 0$ or $p \sim 1$, $H(\matr{X}) \sim 0$.\\
+            When $p = 0.5$, $H(\matr{X}) = \log_2(2) = 1$.
+        \end{minipage}
+        \begin{minipage}{.45\linewidth}
+            \centering
+            \includegraphics[width=\linewidth]{img/binary_entropy.png}
+        \end{minipage}
+    \end{example}
+
+    \item[Entropy threshold split] \marginnote{Entropy threshold split}
+    Given a dataset $\matr{D}$,
+    a real-valued attribute $d \in \matr{D}$,
+    a threshold $t$ in the domain of $d$, and
+    the class attribute $c$ of $\matr{D}$,
+    the entropy of the class $c$ after splitting $\matr{D}$ with threshold $t$ on $d$ is the weighted sum:
+    \[ H(c \,\vert\, d \,:\, t) = \prob{d < t}H(c \,\vert\, d < t) + \prob{d \geq t}H(c \,\vert\, d \geq t) \]
+
+    \item[Information gain] \marginnote{Information gain}
+    Information gain measures the reduction in entropy after applying a split.
+    It is computed as:
+    \[ IG(c \,\vert\, d \,:\, t) = H(c) - H(c \,\vert\, d \,:\, t) \]
+    When $H(c \,\vert\, d \,:\, t)$ is low, $IG(c \,\vert\, d \,:\, t)$ is high,
+    as splitting with threshold $t$ results in purer groups.
+    Vice versa, when $H(c \,\vert\, d \,:\, t)$ is high, $IG(c \,\vert\, d \,:\, t)$ is low,
+    as splitting with threshold $t$ is not very useful.
+
+    The information gain of a class $c$ split on a feature $d$ is given by
+    the best threshold (a computational sketch follows this list):
+    \[ IG(c \,\vert\, d) = \max_t IG(c \,\vert\, d \,:\, t) \]
+\end{description}
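+
+A minimal \texttt{NumPy} sketch of these quantities, assuming \texttt{feature} and
+\texttt{labels} are NumPy arrays (the function names are illustrative, not from a specific library):
+\begin{lstlisting}
+    import numpy as np
+
+    def entropy(labels):
+        # H = -sum_j p_j * log2(p_j), with p_j the relative class frequencies.
+        _, counts = np.unique(labels, return_counts=True)
+        p = counts / counts.sum()
+        return -np.sum(p * np.log2(p))
+
+    def information_gain(feature, labels, t):
+        # IG(c | d : t) = H(c) - [P(d<t) H(c | d<t) + P(d>=t) H(c | d>=t)]
+        def weighted(part):
+            return len(part) / len(labels) * entropy(part) if len(part) > 0 else 0.0
+        below, above = labels[feature < t], labels[feature >= t]
+        return entropy(labels) - weighted(below) - weighted(above)
+\end{lstlisting}
+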
+\subsection{Tree construction}
+
+\begin{description}
+    \item[Decision tree (C4.5)] \marginnote{Decision tree}
+    Tree-shaped classifier where leaves are class predictions and
+    inner nodes represent conditions that route an individual to a leaf.
+    This type of classifier is non-linear (i.e. it does not correspond to a linear separation of the feature space).
+
+    Each node of the tree contains:
+    \begin{itemize}
+        \item The applied splitting criterion (i.e. feature and threshold).
+              Leaves do not have this value.
+        \item The entropy of the current split.
+        \item The dataset coverage of the current split.
+        \item The class distribution.
+    \end{itemize}
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.5\textwidth]{img/_iris_decision_tree_example.pdf}
+        \caption{Example of decision tree}
+    \end{figure}
+
+    Note: the weighted sum of the entropies of the children is never greater than the entropy of the parent.
+
+    Possible stopping conditions are:
+    \begin{itemize}
+        \item When most of the leaves are pure (i.e. there is nothing useful left to split).
+        \item When some leaves are impure but none of the possible splits has a positive $IG$.
+              Impure leaves are labeled with the majority class.
+    \end{itemize}
+
+    \item[Purity] \marginnote{Purity}
+    Value to maximize when splitting a node of a decision tree.
+
+    Nodes with uniformly distributed classes have a low purity.
+    Nodes with a single class have the highest purity.
+
+    Possible impurity measures are:
+    \begin{descriptionlist}
+        \item[Entropy/Information gain] See \Cref{sec:information_theory}.
+
+        \item[Gini index] \marginnote{Gini index}
+        Let $\matr{X}$ be a dataset with classes $C$.
+        The Gini index measures how often an element of $\matr{X}$ would be misclassified
+        if the labels were randomly assigned based on the frequencies of the classes in $\matr{X}$.
+
+        Given a class $i \in C$, $p_i$ is the probability (i.e. frequency) of classifying an element with $i$ and
+        $(1 - p_i)$ is the probability of classifying it with a different label.
+        The Gini index is given by:
+        \[
+            \begin{split}
+                GINI(\matr{X}) = \sum_{i \in C} p_i (1-p_i) &= \sum_{i \in C} p_i - \sum_{i \in C} p_i^2 \\
+                &= 1 - \sum_{i \in C} p_i^2
+            \end{split}
+        \]
+        When $\matr{X}$ is uniformly distributed, $GINI(\matr{X}) = 1-\frac{1}{\vert C \vert}$ (its maximum).
+        When $\matr{X}$ is constant, $GINI(\matr{X}) = 0$.
+
+        Given a node $v$ split into $n$ children $v_1, \dots, v_n$,
+        the Gini gain of the split is given by (a computational sketch follows this list):
+        \[ GINI_\text{gain} = GINI(v) - \sum_{i=1}^n \frac{\vert v_i \vert}{\vert v \vert} GINI(v_i) \]
+
+        \item[Misclassification error] \marginnote{Misclassification error}
+        Skipped.
+    \end{descriptionlist}
+
+    \begin{figure}[h]
+        \centering
+        \includegraphics[width=0.35\textwidth]{img/impurity_comparison.png}
+        \caption{Comparison of impurity measures}
+    \end{figure}
+
+    Compared to the Gini index, entropy is more robust to noise.
+\end{description}
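+
+A minimal \texttt{NumPy} sketch of the Gini index and Gini gain, assuming \texttt{labels}
+is a NumPy array (the function names are illustrative):
+\begin{lstlisting}
+    import numpy as np
+
+    def gini(labels):
+        # GINI = 1 - sum_i p_i^2, with p_i the relative class frequencies.
+        _, counts = np.unique(labels, return_counts=True)
+        p = counts / counts.sum()
+        return 1.0 - np.sum(p ** 2)
+
+    def gini_gain(parent, children):
+        # Gini gain = GINI(parent) - weighted GINI of the children.
+        return gini(parent) - sum(len(c) / len(parent) * gini(c) for c in children)
+\end{lstlisting}
+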
+\begin{algorithm}[H]
+\caption{Decision tree construction using information gain as impurity measure}
+\begin{lstlisting}
+    def buildTree(split):
+        node = Node()
+        if len(split.classes) == 1: # Pure split: predict its only class
+            node.label = split.classes[0]
+            node.isLeaf = True
+        else:
+            # Best (feature, threshold) pair over all possible splits
+            ig, attribute, threshold = getMaxInformationGain(split)
+            if ig <= 0: # No useful split: predict the majority class
+                node.label = split.majorityClass()
+                node.isLeaf = True
+            else: # Store the criterion and recurse on the two halves
+                node.attribute, node.threshold = attribute, threshold
+                node.left = buildTree(split[attribute < threshold])
+                node.right = buildTree(split[attribute >= threshold])
+        return node
+\end{lstlisting}
+\end{algorithm}
+
+
+\begin{description}
+    \item[Pruning] \marginnote{Pruning}
+    Remove branches to reduce overfitting.
+\end{description}
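+
+As a usage sketch, assuming \texttt{scikit-learn} and the splits defined earlier.
+Note that \texttt{scikit-learn} implements CART rather than C4.5, but
+\texttt{criterion="entropy"} makes its splits information-gain based, and
+\texttt{ccp\_alpha} enables cost-complexity pruning (the value here is arbitrary):
+\begin{lstlisting}
+    from sklearn.tree import DecisionTreeClassifier
+
+    # Entropy-based splits with cost-complexity pruning.
+    tree = DecisionTreeClassifier(criterion="entropy", ccp_alpha=0.01)
+    tree.fit(X_train, y_train)
+
+    print("validation accuracy:", tree.score(X_val, y_val))
+\end{lstlisting}
\ No newline at end of file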