\item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
\end{itemize}

\item[Symmetric uncertainty] \marginnote{Symmetric uncertainty}
Measure of correlation for nominal attributes:
\[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
where $H$ is the entropy.
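As a minimal sketch in Python, the symmetric uncertainty of two nominal attributes can be estimated from their empirical frequencies; the arrays \texttt{e1} and \texttt{e2} are hypothetical.
\begin{verbatim}
import numpy as np
from collections import Counter

def entropy(values):
    # Empirical entropy (in bits) of a sequence of nominal values.
    counts = np.array(list(Counter(values).values()), dtype=float)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def symmetric_uncertainty(e1, e2):
    # U(e1, e2) = 2 * (H(e1) + H(e2) - H(e1, e2)) / (H(e1) + H(e2))
    h1, h2 = entropy(e1), entropy(e2)
    h12 = entropy(list(zip(e1, e2)))  # joint entropy of the pair
    return 2 * (h1 + h2 - h12) / (h1 + h2)

# Hypothetical nominal attributes.
e1 = ["a", "a", "b", "b", "a", "b"]
e2 = ["x", "x", "y", "y", "x", "x"]
print(symmetric_uncertainty(e1, e2))  # value in [0, 1]
\end{verbatim}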
\end{description}



\section{Clustering definitions}

\begin{description}
\item[Clustering] \marginnote{Clustering}
Given a set of $D$-dimensional objects $\vec{x}_i$,
we want to partition them into $K$ clusters (and potentially recognize outliers).
In other words, we are looking for a mapping:
\[ \texttt{cluster}(\vec{x}_i) \in \{ 1, \dots, K \} \]
such that objects in the same cluster are similar.

\item[Centroid] \marginnote{Centroid}
Average of the coordinates of the points in a cluster.
For a cluster $K_i$, the $d$-th coordinate of its centroid is given by:
\[
\texttt{centroid}(K_i)\texttt{[$d$]}
= \frac{1}{\vert K_i \vert}
\sum_{\vec{x} \in K_i} \vec{x}\texttt{[$d$]}
\]

\item[Medoid] \marginnote{Medoid}
Element of the cluster with minimum average dissimilarity to all other points.
Unlike the centroid, the medoid must be an existing point of the dataset.
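As a minimal sketch in Python, assuming Euclidean distances, the centroid and the medoid of a cluster can be computed as follows; the cluster \texttt{K\_i} is hypothetical.
\begin{verbatim}
import numpy as np

def centroid(cluster):
    # Mean of the coordinates, dimension by dimension.
    return cluster.mean(axis=0)

def medoid(cluster):
    # Existing point with minimum average distance to the other points.
    dists = np.linalg.norm(cluster[:, None, :] - cluster[None, :, :], axis=-1)
    return cluster[dists.mean(axis=1).argmin()]

# Hypothetical 2-dimensional cluster.
K_i = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [5.0, 5.0]])
print(centroid(K_i))  # [1.5, 1.5], not necessarily an existing point
print(medoid(K_i))    # always one of the original points
\end{verbatim}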

\item[Proximity functions] \marginnote{Proximity function}
Measures to determine the similarity of two data points:
\begin{descriptionlist}
\item[Euclidean distance]
\end{descriptionlist}
\end{description}



\section{Metrics}

\begin{description}
\item[Cohesion] \marginnote{Cohesion}
Measures the similarity (proximity) of the objects in the same cluster.
Given a cluster $K_i$, cohesion is computed as:
\[ \texttt{cohesion}(K_i) = \sum_{\vec{x} \in K_i} \texttt{dist}(\vec{x}, \vec{c}_i) \]
where $\vec{c}_i$ can be the centroid or medoid
and \texttt{dist} is a proximity function.

\item[Separation] \marginnote{Separation}
Measures the distance between two clusters.
Given two clusters $K_i$ and $K_j$, their separation is:
\[ \texttt{separation}(K_i, K_j) = \texttt{dist}(\vec{c}_i, \vec{c}_j) \]
where $\vec{c}_i$ and $\vec{c}_j$ are respectively the centroids of $K_i$ and $K_j$, and \texttt{dist} is a proximity function.

\item[Sum of squared errors] \marginnote{Sum of squared errors}
Measures, for each cluster, the distance between its points and its centroid.
Can be seen as the application of distortion (\Cref{desc:distortion}) to clustering:
\[ \texttt{SSE}_j = \sum_{\vec{x}_i \in K_j} \texttt{dist}(\vec{x}_i, \vec{c}_j)^2 \]
where $K_j$ is the $j$-th cluster and $\vec{c}_j$ is its centroid.

If $\texttt{SSE}_j$ is high, the cluster has low quality.
If $\texttt{SSE}_j = 0$, all points in the cluster coincide with the centroid.

The sum of squared errors of $K$ clusters is:
\[ \texttt{SSE} = \sum_{j=1}^{K} \texttt{SSE}_j \]

\item[Sum of squares between clusters] \marginnote{Sum of squares between clusters}
Given the global centroid of the dataset $\vec{c}$ and
$K$ clusters each with $N_i$ objects,
the sum of squares between clusters is given by:
\[ \texttt{SSB} = \sum_{i=1}^{K} N_i \texttt{dist}(\vec{c}_i, \vec{c})^2 \]

\item[Total sum of squares] \marginnote{Total sum of squares}
Sum of the squared distances between the points of the dataset and the global centroid.
It can be shown that the total sum of squares can be computed as:
\[ \texttt{TSS} = \texttt{SSE} + \texttt{SSB} \]

\begin{theorem}
Minimizing \texttt{SSE} $\iff$ maximizing \texttt{SSB}.
\end{theorem}
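As a minimal sketch in Python, assuming Euclidean distances and a hypothetical clustering, \texttt{SSE}, \texttt{SSB} and \texttt{TSS} can be computed as follows, checking numerically that $\texttt{TSS} = \texttt{SSE} + \texttt{SSB}$.
\begin{verbatim}
import numpy as np

# Hypothetical clustering: a list of clusters, each an array of points.
clusters = [np.array([[0.0, 0.0], [0.0, 2.0]]),
            np.array([[5.0, 5.0], [7.0, 5.0], [6.0, 6.0]])]

data = np.vstack(clusters)
global_centroid = data.mean(axis=0)

sse = sum(((K - K.mean(axis=0)) ** 2).sum() for K in clusters)
ssb = sum(len(K) * ((K.mean(axis=0) - global_centroid) ** 2).sum()
          for K in clusters)
tss = ((data - global_centroid) ** 2).sum()

print(sse, ssb, tss)  # tss equals sse + ssb (up to rounding)
\end{verbatim}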

\item[Silhouette score] \marginnote{Silhouette score}
The Silhouette score of a data point $\vec{x}_i$ belonging to a cluster $K_i$ is given by two components:
\begin{description}
\item[Sparsity contribution]
The average distance of $\vec{x}_i$ to all other points in $K_i$:
\[ a(\vec{x}_i) = \frac{1}{\vert K_i \vert - 1} \sum_{\vec{x}_j \in K_i, \vec{x}_j \neq \vec{x}_i} \texttt{dist}(\vec{x}_i, \vec{x}_j) \]

\item[Separation contribution]
The average distance of $\vec{x}_i$ to the points in the nearest cluster:
\[ b(\vec{x}_i) = \min_{K_j, K_j \neq K_i} \left( \frac{1}{\vert K_j \vert} \sum_{\vec{w} \in K_j} \texttt{dist}(\vec{x}_i, \vec{w}) \right) \]
\end{description}
The Silhouette score of $\vec{x}_i$ is then computed as:
\[ s(\vec{x}_i) = \frac{b(\vec{x}_i) - a(\vec{x}_i)}{\max\{ a(\vec{x}_i), b(\vec{x}_i) \}} \in [-1, 1] \]

The Silhouette score $\mathcal{S}$ of $K$ clusters is the average of the Silhouette scores of the data points.
$\mathcal{S} \rightarrow 1$ indicates correct clusters, $\mathcal{S} \rightarrow -1$ indicates incorrect clusters.
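As a minimal sketch in Python, assuming Euclidean distances, the per-point and average Silhouette scores can be computed as follows; the dataset \texttt{X} and the labels are hypothetical.
\begin{verbatim}
import numpy as np

def silhouette(X, labels):
    # Average Silhouette score over all points (Euclidean distances).
    D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    n = len(X)
    scores = []
    for i, k in enumerate(labels):
        a = D[i, (labels == k) & (np.arange(n) != i)].mean()  # sparsity
        b = min(D[i, labels == c].mean()                      # separation
                for c in set(labels) if c != k)
        scores.append((b - a) / max(a, b))
    return np.mean(scores)

# Hypothetical dataset with two well-separated clusters.
X = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [9.0, 9.0], [9.0, 10.0]])
labels = np.array([0, 0, 0, 1, 1])
print(silhouette(X, labels))  # close to 1
\end{verbatim}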

\item[Gold standard] \marginnote{Gold standard}
Evaluation using a labeled dataset.
The elements of the same cluster are considered as labeled with the same class.

\begin{description}
\item[Classification-oriented]
Traditional classification metrics such as accuracy, recall, precision, \dots

\item[Similarity-oriented]
Given a learnt clustering scheme $y_K(\cdot)$ and the gold standard scheme $y_G(\cdot)$, where
$y_i(\vec{x})$ indicates the label/cluster of $\vec{x}$, each pair of data points $(\vec{x}_1, \vec{x}_2)$ can be labeled with:
\begin{descriptionlist}
\item[\texttt{SGSK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
\item[\texttt{SGDK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
\item[\texttt{DGSK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
\item[\texttt{DGDK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
\end{descriptionlist}
Then, the following metrics can be computed:
\begin{descriptionlist}
\item[Rand score] $\frac{\texttt{SGSK} + \texttt{DGDK}}{\texttt{SGSK} + \texttt{SGDK} + \texttt{DGSK} + \texttt{DGDK}}$
\item[Adjusted Rand score] Modification of the Rand score to take into account that some agreements may happen by chance.
\item[Jaccard coefficient] For each class $c$, the Jaccard coefficient is given by:
\[ \frac{\texttt{SG$_c$SK$_c$}}{\texttt{SG$_c$SK$_c$} + \texttt{SG$_c$DK$_c$} + \texttt{DG$_c$SK$_c$}} \]
\end{descriptionlist}
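As a minimal sketch in Python, the pair counts and the Rand score above can be computed as follows; the label arrays \texttt{y\_G} and \texttt{y\_K} are hypothetical.
\begin{verbatim}
from itertools import combinations

def rand_score(y_G, y_K):
    # Count how each pair of points is labeled by the gold standard
    # and by the learnt clustering.
    sgsk = sgdk = dgsk = dgdk = 0
    for i, j in combinations(range(len(y_G)), 2):
        same_g = y_G[i] == y_G[j]
        same_k = y_K[i] == y_K[j]
        if same_g and same_k:
            sgsk += 1
        elif same_g:
            sgdk += 1
        elif same_k:
            dgsk += 1
        else:
            dgdk += 1
    return (sgsk + dgdk) / (sgsk + sgdk + dgsk + dgdk)

# Hypothetical gold standard labels and learnt cluster assignments.
y_G = [0, 0, 0, 1, 1]
y_K = [1, 1, 2, 2, 2]
print(rand_score(y_G, y_K))
\end{verbatim}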
\end{description}
\end{description}



\section{K-means}

\begin{description}
\item[Algorithm] \marginnote{K-means}
Clustering algorithm that iteratively improves the centroids.
Given the desired number of clusters $K$, the algorithm works as follows:
\begin{enumerate}
\item Randomly choose $K$ initial centroids.
\item Assign each data point to the cluster of the nearest centroid.
\item Recompute each centroid as the centroid of the newly formed cluster. Go to step 2.
\end{enumerate}
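As a minimal sketch of these steps in Python, assuming Euclidean distances and a hypothetical dataset \texttt{X}:
\begin{verbatim}
import numpy as np

def kmeans(X, K, iters=100, seed=0):
    # 1. Randomly choose K initial centroids among the data points.
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(len(X), size=K, replace=False)]
    for _ in range(iters):
        # 2. Assign each point to the cluster of the nearest centroid.
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=-1)
        labels = dists.argmin(axis=1)
        # 3. Recompute each centroid as the mean of its assigned points
        #    (assuming no cluster becomes empty).
        new_centroids = np.array([X[labels == k].mean(axis=0)
                                  for k in range(K)])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return labels, centroids

# Hypothetical 2-dimensional dataset with two groups.
rng = np.random.default_rng(42)
X = np.vstack([rng.normal(0, 1, (20, 2)), rng.normal(8, 1, (20, 2))])
labels, centroids = kmeans(X, K=2)
print(centroids)
\end{verbatim}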

\item[Distortion] \label{desc:distortion} \marginnote{Distortion}
Given:
\begin{itemize}
\item a $D$-dimensional dataset of $N$ points $\vec{x}_i$;
\item an encoding function $\texttt{encode}: \mathbb{R}^D \rightarrow [1, K]$;
\item a decoding function $\texttt{decode}: [1, K] \rightarrow \mathbb{R}^D$.
\end{itemize}
Distortion (or inertia) is defined as:
\[ \texttt{distortion} = \sum_{i=1}^{N} \big\Vert \vec{x}_i - \texttt{decode}(\texttt{encode}(\vec{x}_i)) \big\Vert^2 \]

\begin{theorem}
To minimize the distortion, it is required that:
\begin{enumerate}
\item $\vec{x}_i$ is encoded with its nearest center.
\item The center associated with a point is the centroid of the cluster the point belongs to.
\end{enumerate}

Note that k-means alternates steps 1 and 2.

\begin{proof}
The second point is derived by setting the derivative of \texttt{distortion} to zero.
\end{proof}
\end{theorem}

\item[Elbow method] \marginnote{Elbow method}
Inertia decreases monotonically as $K$ increases and can be used to determine an ideal number of clusters.
By computing the inertia for varying $K$, a plausible value is the one around which the curve flattens (i.e. where the slope decreases).
\begin{figure}[H]
\centering
\includegraphics[width=0.4\textwidth]{img/elbow_method.png}
\caption{Plot of inertia. Possibly good values for $K$ are around 3}
\end{figure}

The Silhouette score can also be used by selecting the $K$ corresponding to its maximum.
Note that, compared to inertia, the Silhouette score is computationally more expensive.
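As a minimal sketch in Python, assuming scikit-learn is available, the inertia can be computed for increasing $K$ and inspected for an elbow; the dataset is hypothetical.
\begin{verbatim}
import numpy as np
from sklearn.cluster import KMeans

# Hypothetical dataset with three groups.
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(c, 1, (30, 2)) for c in (0, 6, 12)])

for K in range(1, 8):
    km = KMeans(n_clusters=K, n_init=10, random_state=0).fit(X)
    print(K, km.inertia_)  # inertia (SSE) decreases monotonically with K
# Plot these values and pick the K where the curve bends (the elbow).
\end{verbatim}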

\item[Properties] \phantom{}
\begin{description}
\item[Termination]
There is a finite number of ways to cluster $N$ objects into $K$ clusters.
By construction, at each iteration the \texttt{distortion} is reduced.
Therefore, k-means is guaranteed to terminate.

\item[Non-optimality]
The solution found by k-means is not guaranteed to be a global optimum.
The choice of starting points heavily influences the final result.
The starting configuration is usually composed of points that are as far apart as possible.

\item[Noise]
Outliers heavily influence the clustering result. Sometimes, it is useful to remove them.

\item[Complexity]
Given a $D$-dimensional dataset of $N$ points,
running k-means for $T$ iterations to find $K$ clusters has complexity $O(TKND)$.
\end{description}
\end{description}



\section{Hierarchical clustering}

\begin{description}
\item[Dendrogram] \marginnote{Dendrogram}
Tree-like structure where the root is a cluster of all data points and
the leaves are clusters with a single data point.

\item[Agglomerative] \marginnote{Agglomerative}
Starts with a cluster per data point and iteratively merges them (leaves to root).
Uses cluster separation metrics.

\item[Divisive] \marginnote{Divisive}
Starts with a cluster containing all the data points and iteratively splits them (root to leaves).
Uses cluster cohesion metrics.

\item[Cluster separation measures]
Measure the distance between two clusters $K_i$ and $K_j$.
\begin{descriptionlist}
\item[Single link] \marginnote{Single link}
Minimum distance between points of the two clusters:
\[ \texttt{sep}(K_i, K_j) = \min_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
Tends to create larger clusters.

\item[Complete link] \marginnote{Complete link}
Maximum distance between points of the two clusters:
\[ \texttt{sep}(K_i, K_j) = \max_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
Tends to create more compact clusters.

\item[Average link] \marginnote{Average link}
Average distance between points of the two clusters:
\[ \texttt{sep}(K_i, K_j) = \frac{1}{\vert K_i \vert \cdot \vert K_j \vert} \sum_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]

\item[Centroid-based] \marginnote{Centroid-based}
Distance between the centroids of the two clusters.

\item[Ward's method] \marginnote{Ward's method}
Let $K_m$ be the cluster obtained by merging $K_i$ and $K_j$.
The distance between $K_i$ and $K_j$ is determined as:
\[ \texttt{sep}(K_i, K_j) = \texttt{SSE}(K_m) - \big( \texttt{SSE}(K_i) + \texttt{SSE}(K_j) \big) \]
\end{descriptionlist}
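As a minimal sketch in Python, assuming Euclidean distances, the single, complete and average link measures can be computed as follows; the clusters are hypothetical.
\begin{verbatim}
import numpy as np

def pairwise(K_i, K_j):
    # All Euclidean distances between points of the two clusters.
    return np.linalg.norm(K_i[:, None, :] - K_j[None, :, :], axis=-1)

def single_link(K_i, K_j):
    return pairwise(K_i, K_j).min()

def complete_link(K_i, K_j):
    return pairwise(K_i, K_j).max()

def average_link(K_i, K_j):
    return pairwise(K_i, K_j).mean()

# Hypothetical clusters.
K_i = np.array([[0.0, 0.0], [1.0, 0.0]])
K_j = np.array([[4.0, 0.0], [6.0, 0.0]])
print(single_link(K_i, K_j), complete_link(K_i, K_j), average_link(K_i, K_j))
\end{verbatim}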
\end{description}



\subsection{Agglomerative clustering}

\begin{description}
\item[Algorithm] \marginnote{Agglomerative clustering} \phantom{}
\begin{enumerate}
\item Initialize a cluster for each data point.
\item Compute the distance matrix between the clusters.
\item Merge the two clusters with the lowest separation,
drop their values from the distance matrix and add a row/column for the newly created cluster.
\item Go to step 2 if the number of clusters is greater than one.
\end{enumerate}

After the construction of the dendrogram, a cut \marginnote{Cut} can be performed at a user-defined level.
A cut near the root will result in a few larger clusters.
A cut near the leaves will result in many smaller clusters.
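As a minimal sketch of the merging loop in Python, the following naive version recomputes the distances at every step instead of maintaining the distance matrix, and uses the single link as separation; the dataset is hypothetical.
\begin{verbatim}
import numpy as np

def single_link(A, B):
    return np.linalg.norm(A[:, None, :] - B[None, :, :], axis=-1).min()

def agglomerative(X, linkage):
    # Start with a cluster per point and merge the closest pair each step.
    clusters = [[i] for i in range(len(X))]
    merges = []
    while len(clusters) > 1:
        best = None
        for a in range(len(clusters)):
            for b in range(a + 1, len(clusters)):
                d = linkage(X[clusters[a]], X[clusters[b]])
                if best is None or d < best[0]:
                    best = (d, a, b)
        d, a, b = best
        merges.append((clusters[a], clusters[b], d))
        clusters[a] = clusters[a] + clusters[b]
        del clusters[b]
    return merges  # bottom-up sequence of merges (the dendrogram)

# Hypothetical 2-dimensional dataset.
X = np.array([[0.0, 0.0], [0.0, 1.0], [5.0, 5.0], [5.0, 6.0]])
for merge in agglomerative(X, single_link):
    print(merge)
\end{verbatim}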

\item[Properties] \phantom{}
\begin{description}
\item[Complexity]
Space complexity of $O(N^2)$ to store the distance matrix.

Time complexity of $O(N^3)$ ($O(N)$ iterations with an $O(N^2)$ search for the pair to merge and $O(N)$ to recompute the distance matrix)
that can be reduced to $O(N^2\log(N))$ when using indexing.
\end{description}
\end{description}



\section{Density-based clustering}

Density-based clustering considers the high-density areas of the data space as clusters.

\begin{description}
\item[Grid-based]
Split the data space into a grid and count the number of points in each tile.

\item[Object-centered]
Count, for each point, the number of neighbors within a radius.
\end{description}



\subsection{DBSCAN}

\begin{description}
\item[Neighborhood] \marginnote{Neighborhood}
Given a radius $\varepsilon$, the neighborhood of a point $\vec{x}$ is the set of points within an $\varepsilon$-sphere centered on $\vec{x}$.

\item[Core point] \marginnote{Core point}
Given a minimum number of neighbors $m$,
a point $\vec{x}$ is a core point if it has at least $m$ neighbors.

\item[Border point] \marginnote{Border point}
A point $\vec{x}$ is a border point if it is not a core point.

\item[Directly density reachable] \marginnote{Directly density reachable}
A point $\vec{p}$ is directly density reachable from $\vec{q}$ iff:
\begin{itemize}
\item $\vec{q}$ is a core point.
\item $\vec{p}$ is in the neighborhood of $\vec{q}$.
\end{itemize}

\item[Density reachable] \marginnote{Density reachable}
A point $\vec{p}$ is density reachable from $\vec{q}$ iff:
\begin{itemize}
\item $\vec{q}$ is a core point.
\item There exists a sequence of points $\vec{s}_1, \dots, \vec{s}_z$ such that:
\begin{itemize}
\item $\vec{s}_1$ is directly density reachable from $\vec{q}$.
\item $\vec{s}_{i+1}$ is directly density reachable from $\vec{s}_i$.
\item $\vec{p}$ is directly density reachable from $\vec{s}_z$.
\end{itemize}
\end{itemize}

\item[Density connected] \marginnote{Density connected}
A point $\vec{p}$ is density connected to $\vec{q}$ iff there exists a point $\vec{s}$
such that both $\vec{p}$ and $\vec{q}$ are density reachable from $\vec{s}$.

\item[Algorithm] \marginnote{DBSCAN}
Determine clusters as maximal sets of density connected points.
Border points not density connected to any core point are labeled as noise.

In other words, what happens is the following:
\begin{itemize}
\item Neighboring core points are part of the same cluster.
\item Border points are part of the cluster of their nearest core point neighbor.
\item Border points without a core point neighbor are noise.
\end{itemize}
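As a minimal sketch of these rules in Python, assuming Euclidean distances and a hypothetical dataset; points labeled $-1$ are noise.
\begin{verbatim}
import numpy as np

def dbscan(X, eps, m):
    N = len(X)
    D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    neighbors = [np.flatnonzero(D[i] <= eps) for i in range(N)]
    core = np.array([len(nb) >= m for nb in neighbors])
    labels = np.full(N, -1)  # -1 marks unvisited points and, at the end, noise
    cluster = 0
    for i in range(N):
        if not core[i] or labels[i] != -1:
            continue
        # Grow a new cluster from this unvisited core point.
        labels[i] = cluster
        queue = list(neighbors[i])
        while queue:
            j = queue.pop()
            if labels[j] == -1:
                labels[j] = cluster             # core or border point joins
                if core[j]:
                    queue.extend(neighbors[j])  # expand only through cores
        cluster += 1
    return labels

# Hypothetical dataset: two dense groups and an isolated point.
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.3, (20, 2)),
               rng.normal(5, 0.3, (20, 2)),
               [[10.0, 10.0]]])
print(dbscan(X, eps=1.0, m=4))
\end{verbatim}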

\item[Properties] \phantom{}
\begin{description}
\item[Robustness]
Able to find clusters of any shape and detect noise.

\item[Hyperparameters]
Sensitive to the choice of the radius $\varepsilon$ and the minimum number of neighbors $m$.

\begin{description}
\item[K-distance method] \phantom{}
\begin{enumerate}
\item Determine for each point its $k$-distance as the distance to its $k$-th nearest neighbor.
\item Sort the points by decreasing $k$-distance and plot them.
\item Use as possible $\varepsilon$ the values around the area where the slope decreases (similarly to the elbow method).
\end{enumerate}
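As a minimal sketch in Python, assuming Euclidean distances, the sorted $k$-distances can be computed as follows; the dataset is hypothetical and the returned values are meant to be plotted.
\begin{verbatim}
import numpy as np

def k_distances(X, k):
    # Distance of each point to its k-th nearest neighbor (excluding itself),
    # sorted in decreasing order.
    D = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    return np.sort(np.sort(D, axis=1)[:, k])[::-1]

# Hypothetical dataset; plot the values and look for the "elbow".
X = np.random.default_rng(0).normal(0, 1, (100, 2))
print(k_distances(X, k=4)[:10])
\end{verbatim}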
\end{description}

\item[Complexity]
Time complexity of $O(N^2)$, reduced to $O(N \log N)$ when using spatial indexing.
\end{description}
\end{description}



\subsection{DENCLUE}

\begin{description}
\item[Kernel density estimation] \marginnote{Kernel density estimation}
Statistical method to estimate the distribution of a dataset through a function.

\begin{description}
\item[Kernel function] \marginnote{Kernel function}
Symmetric and monotonically decreasing function that describes the influence of a data point on its neighbors.

A typical kernel function is the Gaussian.

\item[Overall density function]
The overall density of the dataset is obtained as the sum of the kernel function evaluated at each data point.

\begin{figure}[H]
\centering
\includegraphics[width=0.35\textwidth]{img/kernel_density_estimation.png}
\caption{Example of density function from a set of points (top right) using a Gaussian kernel}
\label{img:denclue}
\end{figure}
\end{description}
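As a minimal sketch in Python, the overall density at a point can be estimated with an unnormalized Gaussian kernel; the dataset and the bandwidth \texttt{h} are hypothetical.
\begin{verbatim}
import numpy as np

def density(x, data, h=1.0):
    # Overall density at x: sum of Gaussian kernels centered on the points.
    d2 = np.linalg.norm(data - x, axis=1) ** 2
    return np.sum(np.exp(-d2 / (2 * h ** 2)))

# Hypothetical 2-dimensional dataset.
data = np.random.default_rng(0).normal(0, 1, (50, 2))
print(density(np.array([0.0, 0.0]), data))    # high density near the points
print(density(np.array([10.0, 10.0]), data))  # low density far away
\end{verbatim}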

\item[Algorithm] \marginnote{DENCLUE}
Given a threshold $\xi$, DENCLUE works as follows:
\begin{enumerate}
\item Derive a density function of the dataset.
\item Identify the local maxima and consider them as density attractors.
\item Associate each data point with the density attractor reached by moving in the direction of maximum density increase.
\item Points associated with the same density attractor are part of the same cluster.
\item Remove clusters whose density attractor has a density lower than $\xi$.
\item Merge clusters connected through a path of points whose density is greater than or equal to $\xi$
(e.g. in \Cref{img:denclue} the center area will result in many small clusters that can be merged with an appropriate $\xi$).
\end{enumerate}

\item[Properties] \phantom{}
\begin{description}
\item[Robustness]
Able to recognize clusters of different shapes and handle noise.

\item[High dimension weakness]
Does not perform well with high-dimensional data and with clusters of different densities.

\item[Complexity]
Computational complexity of $O(N^2)$.
\end{description}
\end{description}



\section{Model-based clustering}

Assuming that the attributes are independent random variables,
model-based clustering finds a set of distributions (one per cluster) that describe the data.


\subsection*{Gaussian mixture (expectation maximization)}

\begin{description}
\item[Algorithm] \phantom{} \marginnote{Gaussian mixture}
\begin{enumerate}
\item Select an initial set of parameters for the distributions.
\item Expectation step: for each data point, compute the probability that it belongs to each distribution.
\item Maximization step: update the parameters to maximize the likelihood (i.e. move each Gaussian towards the center of its cluster).
\item Repeat from step 2 until convergence.
\end{enumerate}
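As a minimal sketch of these steps in Python for one-dimensional data, using Gaussians with means, variances and mixing weights; the dataset is hypothetical.
\begin{verbatim}
import numpy as np

def em_gmm_1d(x, K, iters=50, seed=0):
    # 1. Initial parameters: random means, unit variances, uniform weights.
    rng = np.random.default_rng(seed)
    mu = rng.choice(x, size=K, replace=False)
    sigma = np.ones(K)
    w = np.full(K, 1.0 / K)
    for _ in range(iters):
        # 2. Expectation: probability of each point under each Gaussian.
        resp = np.array([w[k]
                         * np.exp(-(x - mu[k]) ** 2 / (2 * sigma[k] ** 2))
                         / (sigma[k] * np.sqrt(2 * np.pi))
                         for k in range(K)])
        resp /= resp.sum(axis=0)
        # 3. Maximization: update weights, means and standard deviations.
        Nk = resp.sum(axis=1)
        w = Nk / len(x)
        mu = (resp * x).sum(axis=1) / Nk
        sigma = np.sqrt((resp * (x - mu[:, None]) ** 2).sum(axis=1) / Nk)
    return w, mu, sigma

# Hypothetical 1-dimensional dataset with two groups.
rng = np.random.default_rng(1)
x = np.concatenate([rng.normal(0, 1, 200), rng.normal(6, 1, 200)])
print(em_gmm_1d(x, K=2))
\end{verbatim}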
\end{description}