diff --git a/src/machine-learning-and-data-mining/img/elbow_method.png b/src/machine-learning-and-data-mining/img/elbow_method.png
new file mode 100644
index 0000000..cadb3ab
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/elbow_method.png differ
diff --git a/src/machine-learning-and-data-mining/img/kernel_density_estimation.png b/src/machine-learning-and-data-mining/img/kernel_density_estimation.png
new file mode 100644
index 0000000..0940435
Binary files /dev/null and b/src/machine-learning-and-data-mining/img/kernel_density_estimation.png differ
diff --git a/src/machine-learning-and-data-mining/sections/_clustering.tex b/src/machine-learning-and-data-mining/sections/_clustering.tex
index 3ba3d57..7474ec0 100644
--- a/src/machine-learning-and-data-mining/sections/_clustering.tex
+++ b/src/machine-learning-and-data-mining/sections/_clustering.tex
@@ -130,8 +130,425 @@ Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics ar
     \item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
     \end{itemize}
 
-    \item[Symmetric uncertainty]
+    \item[Symmetric uncertainty] \marginnote{Symmetric uncertainty}
     Measure of correlation for nominal attributes:
     \[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
     where $H$ is the entropy.
+\end{description}
+
+
+
+
+\section{Clustering definitions}
+
+\begin{description}
+    \item[Clustering] \marginnote{Clustering}
+    Given a set of $D$-dimensional objects $\vec{x}_i$,
+    we want to partition them into $K$ clusters (and potentially recognize outliers).
+    In other words, we are looking for a mapping:
+    \[ \texttt{cluster}(\vec{x}_i) \in \{ 1, \dots, K \} \]
+    such that objects in the same cluster are similar.
+
+    \item[Centroid] \marginnote{Centroid}
+    Average of the coordinates of the points in a cluster.
+    For a cluster $K_i$, the $d$-th coordinate of its centroid is given by:
+    \[
+        \texttt{centroid}(K_i)\texttt{[$d$]}
+        = \frac{1}{\vert K_i \vert}
+        \sum_{\vec{x} \in K_i} \vec{x}\texttt{[$d$]}
+    \]
+
+    \item[Medoid] \marginnote{Medoid}
+    Element of the cluster with minimum average dissimilarity to all other points.
+    Unlike the centroid, the medoid must be an existing point of the dataset.
+
+    \item[Proximity functions] \marginnote{Proximity function}
+    Measures to determine the similarity of two data points:
+    \begin{descriptionlist}
+        \item[Euclidean distance]
+        $\texttt{dist}(\vec{p}, \vec{q}) = \sqrt{\sum_{d=1}^{D} \big( \vec{p}\texttt{[$d$]} - \vec{q}\texttt{[$d$]} \big)^2}$
+    \end{descriptionlist}
+\end{description}
+
+
+\section{Metrics}
+
+\begin{description}
+    \item[Cohesion] \marginnote{Cohesion}
+    Measures the similarity (proximity) of the objects in the same cluster.
+    Given a cluster $K_i$, cohesion is computed as:
+    \[ \texttt{cohesion}(K_i) = \sum_{\vec{x} \in K_i} \texttt{dist}(\vec{x}, \vec{c}_i) \]
+    where $\vec{c}_i$ can be the centroid or medoid
+    and \texttt{dist} is a proximity function.
+
+    \item[Separation] \marginnote{Separation}
+    Measures the distance between two clusters.
+    Given two clusters $K_i$ and $K_j$, their separation is:
+    \[ \texttt{separation}(K_i, K_j) = \texttt{dist}(\vec{c}_i, \vec{c}_j) \]
+    where $\vec{c}_i$ and $\vec{c}_j$ are respectively the centroids of $K_i$ and $K_j$, and \texttt{dist} is a proximity function.
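+
+    As a small illustrative example (toy values chosen here for clarity), consider a cluster
+    $K_i = \{ (0, 0), (2, 0) \}$ with centroid $\vec{c}_i = (1, 0)$ and a second cluster $K_j$
+    with centroid $\vec{c}_j = (4, 4)$. Using the Euclidean distance:
+    \[ \texttt{cohesion}(K_i) = 1 + 1 = 2
+       \qquad
+       \texttt{separation}(K_i, K_j) = \sqrt{(4 - 1)^2 + (4 - 0)^2} = 5 \]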
+
+    \item[Sum of squared errors] \marginnote{Sum of squared errors}
+    Measures, for each cluster, the squared distances of its points from its centroid.
+    Can be seen as the application of distortion (\Cref{desc:distortion}) to clustering:
+    \[ \texttt{SSE}_j = \sum_{\vec{x}_i \in K_j} \texttt{dist}(\vec{x}_i, \vec{c}_j)^2 \]
+    where $K_j$ is the $j$-th cluster and $\vec{c}_j$ is its centroid.
+
+    If $\texttt{SSE}_j$ is high, the cluster has low quality.
+    If $\texttt{SSE}_j = 0$, all points in the cluster coincide with the centroid.
+
+    The sum of squared errors of $K$ clusters is:
+    \[ \texttt{SSE} = \sum_{j=1}^{K} \texttt{SSE}_j \]
+
+    \item[Sum of squares between clusters] \marginnote{Sum of squares between clusters}
+    Given the global centroid of the dataset $\vec{c}$ and
+    $K$ clusters each with $N_i$ objects,
+    the sum of squares between clusters is given by:
+    \[ \texttt{SSB} = \sum_{i=1}^{K} N_i \texttt{dist}(\vec{c}_i, \vec{c})^2 \]
+
+    \item[Total sum of squares] \marginnote{Total sum of squares}
+    Sum of the squared distances between the points of the dataset and the global centroid.
+    It can be shown that the total sum of squares can be computed as:
+    \[ \texttt{TSS} = \texttt{SSE} + \texttt{SSB} \]
+
+    \begin{theorem}
+        Since \texttt{TSS} is fixed for a given dataset,
+        minimizing \texttt{SSE} $\iff$ maximizing \texttt{SSB}.
+    \end{theorem}
+
+    \item[Silhouette score] \marginnote{Silhouette score}
+    The Silhouette score of a data point $\vec{x}_i$ belonging to a cluster $K_i$ is given by two components:
+    \begin{description}
+        \item[Sparsity contribution]
+        The average distance of $\vec{x}_i$ to all other points in $K_i$:
+        \[ a(\vec{x}_i) = \frac{1}{\vert K_i \vert - 1} \sum_{\vec{x}_j \in K_i, \vec{x}_j \neq \vec{x}_i} \texttt{dist}(\vec{x}_i, \vec{x}_j) \]
+
+        \item[Separation contribution]
+        The average distance of $\vec{x}_i$ to the points in the nearest cluster:
+        \[ b(\vec{x}_i) = \min_{K_j, K_j \neq K_i} \left( \frac{1}{\vert K_j \vert} \sum_{\vec{w} \in K_j} \texttt{dist}(\vec{x}_i, \vec{w}) \right) \]
+    \end{description}
+    The Silhouette score of $\vec{x}_i$ is then computed as:
+    \[ s(\vec{x}_i) = \frac{b(\vec{x}_i) - a(\vec{x}_i)}{\max\{ a(\vec{x}_i), b(\vec{x}_i) \}} \in [-1, 1] \]
+
+    The Silhouette score $\mathcal{S}$ of $K$ clusters is given by the average of the Silhouette scores of the data points.
+    $\mathcal{S} \rightarrow 1$ indicates correct clusters, $\mathcal{S} \rightarrow -1$ indicates incorrect clusters.
+
+    \item[Gold standard] \marginnote{Gold standard}
+    Evaluation using a labeled dataset.
+    Consider the elements of the same cluster as labeled with the same class.
+
+    \begin{description}
+        \item[Classification-oriented]
+        Traditional classification metrics such as accuracy, recall, precision, \dots
+
+        \item[Similarity-oriented]
+        Given a learnt clustering scheme $y_K(\cdot)$ and the gold standard scheme $y_G(\cdot)$, where
+        $y_K(\vec{x})$ and $y_G(\vec{x})$ indicate the cluster/label of $\vec{x}$,
+        each pair of data points $(\vec{x}_1, \vec{x}_2)$ can be labeled as follows (a toy example is given after the definitions):
+        \begin{descriptionlist}
+            \item[\texttt{SGSK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
+            \item[\texttt{SGDK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
+            \item[\texttt{DGSK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
+            \item[\texttt{DGDK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
+        \end{descriptionlist}
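+        As a toy example (assignments chosen here only for illustration), suppose
+        $y_G(\vec{x}_1) = y_G(\vec{x}_2) = A$, $y_G(\vec{x}_3) = B$ and that the clustering places
+        $\vec{x}_1$, $\vec{x}_2$ and $\vec{x}_3$ in the same cluster:
+        the pair $(\vec{x}_1, \vec{x}_2)$ is labeled \texttt{SGSK}, while
+        $(\vec{x}_1, \vec{x}_3)$ and $(\vec{x}_2, \vec{x}_3)$ are labeled \texttt{DGSK}.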
+        Then, the following metrics can be computed:
+        \begin{descriptionlist}
+            \item[Rand score] $\frac{\texttt{SGSK} + \texttt{DGDK}}{\texttt{SGSK} + \texttt{SGDK} + \texttt{DGSK} + \texttt{DGDK}}$
+            \item[Adjusted Rand score] Modification of the Rand score to take into account that some agreements may happen by chance.
+            \item[Jaccard coefficient] For each class $c$, the Jaccard coefficient is given by:
+            \[ \frac{\texttt{SG$_c$SK$_c$}}{\texttt{SG$_c$SK$_c$} + \texttt{SG$_c$DK$_c$} + \texttt{DG$_c$SK$_c$}} \]
+        \end{descriptionlist}
+    \end{description}
+\end{description}
+
+
+
+\section{K-means}
+
+\begin{description}
+    \item[Algorithm] \marginnote{K-means}
+    Clustering algorithm that iteratively improves the centroids.
+    Given the desired number of clusters $K$, the algorithm works as follows:
+    \begin{enumerate}
+        \item Randomly choose $K$ initial centroids.
+        \item Assign each data point to the cluster represented by the nearest centroid.
+        \item Update each centroid as the centroid of its newly formed cluster. Go to point 2.
+    \end{enumerate}
+    A small worked example is given at the end of this section.
+
+    \item[Distortion] \label{desc:distortion} \marginnote{Distortion}
+    Given:
+    \begin{itemize}
+        \item a $D$-dimensional dataset of $N$ points $\vec{x}_i$;
+        \item an encoding function $\texttt{encode}: \mathbb{R}^D \rightarrow \{ 1, \dots, K \}$;
+        \item a decoding function $\texttt{decode}: \{ 1, \dots, K \} \rightarrow \mathbb{R}^D$.
+    \end{itemize}
+    Distortion (or inertia) is defined as:
+    \[ \texttt{distortion} = \sum_{i=1}^{N} \big\Vert \vec{x}_i - \texttt{decode}(\texttt{encode}(\vec{x}_i)) \big\Vert^2 \]
+
+    \begin{theorem}
+        To minimize the distortion, it is required that:
+        \begin{enumerate}
+            \item $\vec{x}_i$ is encoded with its nearest center.
+            \item The center of a point is the centroid of the cluster it belongs to.
+        \end{enumerate}
+
+        Note that k-means alternates points 1 and 2.
+
+        \begin{proof}
+            The second point is derived by setting the derivative of \texttt{distortion} with respect to the centers to 0.
+        \end{proof}
+    \end{theorem}
+
+    \item[Elbow method] \marginnote{Elbow method}
+    Inertia decreases monotonically as $K$ increases and can be used to determine a suitable number of clusters.
+    By computing the inertia for varying $K$, a plausible value is the one corresponding to the point where the slope flattens (the elbow).
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.4\textwidth]{img/elbow_method.png}
+        \caption{Plot of inertia. Plausible values for $K$ are around 3}
+    \end{figure}
+
+    The Silhouette score can also be used by selecting the $K$ corresponding to its maximum.
+    Note that, compared to inertia, the Silhouette score is computationally more expensive.
+
+    \item[Properties] \phantom{}
+    \begin{description}
+        \item[Termination]
+        There is a finite number of ways to cluster $N$ objects into $K$ clusters.
+        By construction, at each iteration the \texttt{distortion} is reduced.
+        Therefore, k-means is guaranteed to terminate.
+
+        \item[Non-optimality]
+        The solution found by k-means is not guaranteed to be a global optimum.
+        The choice of the starting points heavily influences the final result.
+        The starting centroids are usually chosen as far apart from each other as possible.
+
+        \item[Noise]
+        Outliers heavily influence the clustering result. Sometimes, it is useful to remove them.
+
+        \item[Complexity]
+        Given a $D$-dimensional dataset of $N$ points,
+        running k-means for $T$ iterations to find $K$ clusters has complexity $O(TKND)$.
+    \end{description}
+\end{description}
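+
+As a worked example of the algorithm (one-dimensional toy data chosen here for illustration),
+consider the points $\{ 1, 2, 8, 9 \}$ with $K = 2$ and initial centroids $c_1 = 1$ and $c_2 = 2$:
+\begin{enumerate}
+    \item Assignment: $\{ 1 \}$ is assigned to $c_1$; $\{ 2, 8, 9 \}$ are assigned to $c_2$.
+    \item Update: $c_1 = 1$, $c_2 = \frac{2 + 8 + 9}{3} \approx 6.3$.
+    \item Assignment: $\{ 1, 2 \}$ are assigned to $c_1$; $\{ 8, 9 \}$ are assigned to $c_2$.
+    \item Update: $c_1 = 1.5$, $c_2 = 8.5$. Further iterations do not change the assignment, so the algorithm stops.
+\end{enumerate}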
+
+
+
+\section{Hierarchical clustering}
+
+\begin{description}
+    \item[Dendrogram] \marginnote{Dendrogram}
+    Tree-like structure where the root is a cluster of all data points and
+    the leaves are clusters with a single data point.
+
+    \item[Agglomerative] \marginnote{Agglomerative}
+    Starts with a cluster per data point and iteratively merges them (leaves to root).
+    Uses cluster separation metrics.
+
+    \item[Divisive] \marginnote{Divisive}
+    Starts with a cluster containing all the data points and iteratively splits them (root to leaves).
+    Uses cluster cohesion metrics.
+
+    \item[Cluster separation measures]
+    Measure the distance between two clusters $K_i$ and $K_j$.
+    \begin{descriptionlist}
+        \item[Single link] \marginnote{Single link}
+        Minimum distance of the points in the two clusters:
+        \[ \texttt{sep}(K_i, K_j) = \min_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
+        Tends to create larger, elongated clusters.
+
+        \item[Complete link] \marginnote{Complete link}
+        Maximum distance of the points in the two clusters:
+        \[ \texttt{sep}(K_i, K_j) = \max_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
+        Tends to create more compact clusters.
+
+        \item[Average link] \marginnote{Average link}
+        Average distance of the points in the two clusters:
+        \[ \texttt{sep}(K_i, K_j) = \frac{1}{\vert K_i \vert \cdot \vert K_j \vert} \sum_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
+
+        \item[Centroid-based] \marginnote{Centroid-based}
+        Distance between the centroids of the two clusters.
+
+        \item[Ward's method] \marginnote{Ward's method}
+        Let $K_m$ be the cluster obtained by merging $K_i$ and $K_j$.
+        The distance between $K_i$ and $K_j$ is determined as:
+        \[ \texttt{sep}(K_i, K_j) = \texttt{SSE}(K_m) - \big( \texttt{SSE}(K_i) + \texttt{SSE}(K_j) \big) \]
+    \end{descriptionlist}
+\end{description}
+
+
+\subsection{Agglomerative clustering}
+
+\begin{description}
+    \item[Algorithm] \marginnote{Agglomerative clustering} \phantom{}
+    \begin{enumerate}
+        \item Initialize a cluster for each data point.
+        \item Compute the distance matrix between all clusters.
+        \item Merge the two clusters with lowest separation,
+              drop their values from the distance matrix and add a row/column for the newly created cluster.
+        \item Go to point 2. if the number of clusters is greater than one.
+    \end{enumerate}
+
+    After the construction of the dendrogram, a cut \marginnote{Cut} can be performed at a user-defined level.
+    A cut near the root will result in a few larger clusters.
+    A cut near the leaves will result in many smaller clusters.
+
+
+    \item[Properties] \phantom{}
+    \begin{description}
+        \item[Complexity]
+        Space complexity of $O(N^2)$ to store the distance matrix.
+
+        Time complexity of $O(N^3)$ ($O(N)$ iterations with an $O(N^2)$ search for the pair to merge and $O(N)$ to recompute the distance matrix)
+        that can be reduced to $O(N^2 \log(N))$ when using indexing.
+    \end{description}
+\end{description}
+
+
+
+\section{Density-based clustering}
+
+Density-based clustering considers as clusters the high-density areas of the data space.
+
+\begin{description}
+    \item[Grid-based]
+    Split the data space into a grid and count the number of points in each tile.
+
+    \item[Object-centered]
+    Count, for each point, the number of neighbors within a radius (see the example below).
+\end{description}
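+
+As a small example of the object-centered approach (toy values chosen here for illustration),
+consider the one-dimensional points $\{ 1, 2, 3, 10 \}$ with radius $\varepsilon = 1.5$:
+the point $2$ has two neighbors ($1$ and $3$), the points $1$ and $3$ have one neighbor each,
+and $10$ has none, so the high-density area is located around $\{ 1, 2, 3 \}$.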
+
+
+\subsection{DBSCAN}
+
+\begin{description}
+    \item[Neighborhood] \marginnote{Neighborhood}
+    Given a radius $\varepsilon$, the neighborhood of a point $\vec{x}$ is the set of points within an $\varepsilon$-sphere centered on $\vec{x}$.
+
+    \item[Core point] \marginnote{Core point}
+    Given a minimum number of neighbors $m$,
+    a point $\vec{x}$ is a core point if it has at least $m$ neighbors.
+
+    \item[Border point] \marginnote{Border point}
+    A point $\vec{x}$ is a border point if it is not a core point.
+
+    \item[Directly density reachable] \marginnote{Directly density reachable}
+    A point $\vec{p}$ is directly density reachable from $\vec{q}$ iff:
+    \begin{itemize}
+        \item $\vec{q}$ is a core point.
+        \item $\vec{p}$ is in the neighborhood of $\vec{q}$.
+    \end{itemize}
+
+    \item[Density reachable] \marginnote{Density reachable}
+    A point $\vec{p}$ is density reachable from $\vec{q}$ iff:
+    \begin{itemize}
+        \item $\vec{q}$ is a core point.
+        \item There exists a sequence of points $\vec{s}_1, \dots, \vec{s}_z$ such that:
+        \begin{itemize}
+            \item $\vec{s}_1$ is directly density reachable from $\vec{q}$.
+            \item $\vec{s}_{i+1}$ is directly density reachable from $\vec{s}_i$.
+            \item $\vec{p}$ is directly density reachable from $\vec{s}_z$.
+        \end{itemize}
+    \end{itemize}
+
+    \item[Density connected] \marginnote{Density connected}
+    A point $\vec{p}$ is density connected to $\vec{q}$ iff there exists a point $\vec{s}$
+    such that both $\vec{p}$ and $\vec{q}$ are density reachable from $\vec{s}$.
+
+    \item[Algorithm] \marginnote{DBSCAN}
+    Determine clusters as maximal sets of density connected points.
+    Border points not density connected to any core point are labeled as noise.
+
+    In other words, what happens is the following:
+    \begin{itemize}
+        \item Neighboring core points are part of the same cluster.
+        \item Border points are part of the cluster of their nearest core point neighbor.
+        \item Border points without a core point neighbor are noise.
+    \end{itemize}
+
+    \item[Properties] \phantom{}
+    \begin{description}
+        \item[Robustness]
+        Able to find clusters of any shape and detect noise.
+
+        \item[Hyperparameters]
+        Sensitive to the choice of the radius $\varepsilon$ and of the minimum number of neighbors $m$.
+
+        \begin{description}
+            \item[K-distance method] \phantom{}
+            \begin{enumerate}
+                \item Determine for each point its $k$-distance as the distance to its $k$-th nearest neighbor.
+                \item Sort the points by decreasing $k$-distance and plot them.
+                \item Use as possible $\varepsilon$ the values around the area where the slope decreases (similarly to the elbow method).
+            \end{enumerate}
+        \end{description}
+
+        \item[Complexity]
+        Complexity of $O(N^2)$, reduced to $O(N \log N)$ if using spatial indexing.
+    \end{description}
+\end{description}
+
+
+\subsection{DENCLUE}
+
+\begin{description}
+    \item[Kernel density estimation] \marginnote{Kernel density estimation}
+    Statistical method to estimate the distribution of a dataset through a density function.
+
+    \begin{description}
+        \item[Kernel function] \marginnote{Kernel function}
+        Symmetric and monotonically decreasing function that describes the influence of a data point on its neighbors.
+
+        A typical kernel function is the Gaussian.
+
+        \item[Overall density function]
+        The overall density of the dataset is obtained as the sum of the kernel function evaluated at each data point.
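+
+        With a Gaussian kernel of bandwidth $h$ (a common formulation; the bandwidth is a
+        hyperparameter and the normalization may vary), the estimated density at a point $\vec{x}$ is:
+        \[ \hat{f}(\vec{x})
+           = \frac{1}{N} \sum_{i=1}^{N} \frac{1}{(2 \pi h^2)^{D/2}}
+             \exp\left( - \frac{\Vert \vec{x} - \vec{x}_i \Vert^2}{2 h^2} \right) \]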
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.35\textwidth]{img/kernel_density_estimation.png}
+            \caption{Example of density function from a set of points (top right) using a Gaussian kernel}
+            \label{img:denclue}
+        \end{figure}
+    \end{description}
+
+    \item[Algorithm] \marginnote{DENCLUE}
+    Given a threshold $\xi$, DENCLUE works as follows:
+    \begin{enumerate}
+        \item Derive a density function of the dataset.
+        \item Identify local maxima and consider them as density attractors.
+        \item Associate each data point with the density attractor reached by moving in the direction of maximum density increase.
+        \item Points associated with the same density attractor are part of the same cluster.
+        \item Remove clusters whose density attractor has a density lower than $\xi$.
+        \item Merge clusters connected through a path of points whose density is greater than or equal to $\xi$
+              (e.g. in \Cref{img:denclue} the center area will result in many small clusters that can be merged with an appropriate $\xi$).
+    \end{enumerate}
+
+    \item[Properties] \phantom{}
+    \begin{description}
+        \item[Robustness]
+        Able to recognize clusters of different shapes and handle noise.
+
+        \item[High dimension weakness]
+        Does not perform well with high-dimensional data or when clusters have very different densities.
+
+        \item[Complexity]
+        Computational complexity of $O(N^2)$.
+    \end{description}
+\end{description}
+
+
+
+\section{Model-based clustering}
+
+Assuming that the attributes are independent random variables,
+model-based clustering finds a set of distributions (one per cluster) that describe the data.
+
+
+\subsection*{Gaussian mixture (expectation maximization)}
+
+\begin{description}
+    \item[Algorithm] \phantom{} \marginnote{Gaussian mixture}
+    \begin{enumerate}
+        \item Select an initial set of parameters for the distributions.
+        \item Expectation step: for each data point, compute the probability that it belongs to each distribution.
+        \item Maximization step: update the parameters to maximize the likelihood (i.e. move each Gaussian towards the center of its cluster).
+        \item Go to point 2. until convergence.
+    \end{enumerate}
 \end{description}
\ No newline at end of file
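+
+As a concrete instance of the two steps (standard EM updates for a mixture of $K$ Gaussians;
+the symbols $\pi_j$, $\vec{\mu}_j$, $\Sigma_j$ and $\gamma_{ij}$ are introduced here for illustration),
+the expectation step computes the responsibility of each Gaussian $j$ for each point $\vec{x}_i$:
+\[ \gamma_{ij} = \frac{\pi_j \, \mathcal{N}(\vec{x}_i \mid \vec{\mu}_j, \Sigma_j)}{\sum_{k=1}^{K} \pi_k \, \mathcal{N}(\vec{x}_i \mid \vec{\mu}_k, \Sigma_k)} \]
+and the maximization step updates the parameters as:
+\[ \vec{\mu}_j = \frac{\sum_{i=1}^{N} \gamma_{ij} \vec{x}_i}{\sum_{i=1}^{N} \gamma_{ij}}
+   \qquad
+   \Sigma_j = \frac{\sum_{i=1}^{N} \gamma_{ij} (\vec{x}_i - \vec{\mu}_j)(\vec{x}_i - \vec{\mu}_j)^T}{\sum_{i=1}^{N} \gamma_{ij}}
+   \qquad
+   \pi_j = \frac{1}{N} \sum_{i=1}^{N} \gamma_{ij} \]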