Moved ML in year1
1
src/year1/machine-learning-and-data-mining/ainotes.cls
Symbolic link
@@ -0,0 +1 @@
../../ainotes.cls
41
src/year1/machine-learning-and-data-mining/dm-ml.tex
Normal file
@@ -0,0 +1,41 @@
\documentclass[11pt]{ainotes}

\title{Machine Learning and Data Mining}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}

\DeclareAcronym{oltp}{short=OLTP, long=Online Transaction Processing}
\DeclareAcronym{erp}{short=ERP, long=Enterprise Resource Planning}
\DeclareAcronym{mis}{short=MIS, long=Management Information System}
\DeclareAcronym{dss}{short=DSS, long=Decision Support System}
\DeclareAcronym{eis}{short=EIS, long=Executive Information System}
\DeclareAcronym{olap}{short=OLAP, long=Online Analytical Processing}
\DeclareAcronym{bi}{short=BI, long=Business Intelligence}
\DeclareAcronym{dwh}{short=DWH, long=Data Warehouse}
\DeclareAcronym{dm}{short=DM, long=Data Mart}
\DeclareAcronym{etl}{short=ETL, long=Extraction{,} Transformation{,} Loading}
\DeclareAcronym{dfm}{short=DFM, long=Dimensional Fact Model}
\DeclareAcronym{cdc}{short=CDC, long=Change Data Capture}
\DeclareAcronym{crisp}{short=CRISP-DM, long=Cross Industry Standard Process for Data Mining}


\begin{document}

\makenotesfront
\printacronyms
\newpage

\input{sections/_intro.tex}
\input{sections/_data_warehouse.tex}
\input{sections/_data_lake.tex}
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_data_prepro.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}
\input{sections/_clustering.tex}
\input{sections/_association_rules.tex}

\eoc

\end{document}
BIN
src/year1/machine-learning-and-data-mining/img/_1layer_dwh.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/_2layer_dwh.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/_3layer_dwh.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/_olap_cube.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/_perceptron.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/_storage.pdf
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/crisp.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/delta_lake.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/dfm.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/dfm_events.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/elbow_method.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/kappa_lake.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/lambda_lake.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/lift_chart.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/mahalanobis.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/olap_pivot.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/olap_rollup.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/roc_curve.png
Normal file
BIN
src/year1/machine-learning-and-data-mining/img/rules_apriori.png
Normal file
@ -0,0 +1,31 @@
|
||||
<mxfile host="app.diagrams.net" modified="2023-10-13T17:44:38.951Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0" etag="0k6DN-mG6fDlB8POdY3R" version="22.0.4" type="device">
|
||||
<diagram name="Pagina-1" id="Obl2eNAEIfPRNowj_f7H">
|
||||
<mxGraphModel dx="1195" dy="622" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-1" value="" style="endArrow=classic;html=1;rounded=0;strokeWidth=2;startArrow=classic;startFill=1;fontSize=20;" edge="1" parent="1">
|
||||
<mxGeometry width="50" height="50" relative="1" as="geometry">
|
||||
<mxPoint x="180" y="400" as="sourcePoint" />
|
||||
<mxPoint x="680" y="400" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-2" value="Data warehouse" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
|
||||
<mxGeometry x="180" y="360" width="150" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-3" value="<div align="right" style="font-size: 20px;">Data lake<br style="font-size: 20px;"></div>" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
|
||||
<mxGeometry x="530" y="360" width="150" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-4" value="Data hub" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
|
||||
<mxGeometry x="360" y="360" width="150" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-5" value="Hot" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
|
||||
<mxGeometry x="180" y="410" width="60" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="j0uoPLtJFFh1yWsyPPyp-6" value="Cold" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
|
||||
<mxGeometry x="620" y="410" width="60" height="30" as="geometry" />
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
||||
BIN
src/year1/machine-learning-and-data-mining/img/svm.png
Normal file
11
src/year1/machine-learning-and-data-mining/metadata.json
Normal file
@@ -0,0 +1,11 @@
{
    "name": "Machine Learning and Data Mining",
    "year": 1,
    "semester": 1,
    "pdfs": [
        {
            "name": null,
            "path": "dm-ml.pdf"
        }
    ]
}
@ -0,0 +1,353 @@
|
||||
\chapter{Association rules}
|
||||
|
||||
|
||||
\section{Frequent itemset}
|
||||
|
||||
\begin{description}
|
||||
\item[Itemset] \marginnote{Itemset}
|
||||
Collection of one or more items (e.g. $\{ \text{milk}, \text{bread}, \text{diapers} \}$).
|
||||
|
||||
\item[K-itemset] \marginnote{K-itemset}
|
||||
Itemset with $k$ items.
|
||||
|
||||
\item[Support count] \marginnote{Support count}
|
||||
Number of occurrences of an itemset in a dataset.
|
||||
\begin{example}
|
||||
\phantom{}\\
|
||||
\begin{minipage}{0.4\textwidth}
|
||||
Given the following transactions:
|
||||
\begin{center}
|
||||
\begin{tabular}{|c|l|}
|
||||
\hline
|
||||
1 & bread, milk \\
|
||||
2 & beer, bread, diaper, eggs \\
|
||||
3 & beer, coke, diaper, milk \\
|
||||
\textbf{4} & \textbf{beer, bread, diaper, milk} \\
|
||||
\textbf{5} & \textbf{bread, coke, diaper, milk} \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
\begin{minipage}{0.5\textwidth}
|
||||
The support count of the itemset containing bread, diapers and milk is:
|
||||
\[ \sigma(\{ \text{bread}, \text{diapers}, \text{milk} \}) = 2 \]
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
|
||||
\item[Association rule] \marginnote{Association rule}
|
||||
Given two itemsets $A$ and $C$, an association rule has form:
|
||||
\[ A \rightarrow C \]
|
||||
It means that there are transactions in the dataset where $A$ and $C$ co-occur.
|
||||
Note that it is not strictly a logical implication.
|
||||
|
||||
\item[Metrics] \phantom{}
|
||||
\begin{description}
|
||||
\item[Support] \marginnote{Support}
|
||||
Given $N$ transactions, the support of an itemset $A$ is:
|
||||
\[ \texttt{sup}(A) = \frac{\sigma(A)}{N} \]
|
||||
The support of an association rule $A \rightarrow C$ is:
|
||||
\[ \texttt{sup}(A \rightarrow C) = \texttt{sup}(A \cup C) = \frac{\sigma(A \cup C)}{N} \]
|
||||
|
||||
A low support suggests that the association may be due to chance rather than to a real pattern.
|
||||
|
||||
\begin{description}
|
||||
\item[Frequent itemset] \marginnote{Frequent itemset}
|
||||
Itemset whose support is at least a given threshold.
|
||||
\end{description}
|
||||
|
||||
\item[Confidence] \marginnote{Confidence}
|
||||
Given an association rule $A \rightarrow C$, its confidence is given by:
|
||||
\[ \texttt{conf}(A \rightarrow C) = \frac{\sigma(A \cup C)}{\sigma(A)} \in [0, 1] \]
|
||||
|
||||
Low confidence implies low reliability.
|
||||
|
||||
\begin{theorem}
|
||||
The confidence of $A \rightarrow C$ can be computed given the supports of $A \rightarrow C$ and $A$:
|
||||
\[ \texttt{conf}(A \rightarrow C) = \frac{\texttt{sup}(A \rightarrow C)}{\texttt{sup}(A)} \]
|
||||
\end{theorem}
|
||||
\end{description}
|
||||
|
||||
\item[Association rule mining] \marginnote{Association rule mining}
|
||||
Given $N$ transactions and two thresholds \texttt{min\_sup} and \texttt{min\_conf},
|
||||
association rule mining finds all the rules $A \rightarrow C$ such that:
|
||||
\[ \begin{split}
|
||||
\texttt{sup}(A \rightarrow C) &\geq \texttt{min\_sup} \\
|
||||
\texttt{conf}(A \rightarrow C) &\geq \texttt{min\_conf}
|
||||
\end{split} \]
|
||||
|
||||
This can be done in two steps:
|
||||
\begin{enumerate}
|
||||
\item \marginnote{Frequent itemset generation}
|
||||
Determine the itemsets with $\text{support} \geq \texttt{min\_sup}$ (frequent itemsets).
|
||||
\item \marginnote{Rule generation}
|
||||
Determine the association rules with $\text{confidence} \geq \texttt{min\_conf}$.
|
||||
\end{enumerate}
|
||||
\end{description}
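
As a concrete reference, the following Python sketch (illustrative, not part of the course material) computes the support and confidence of a rule over the example transactions above.
\begin{lstlisting}
transactions = [
    {"bread", "milk"},
    {"beer", "bread", "diaper", "eggs"},
    {"beer", "coke", "diaper", "milk"},
    {"beer", "bread", "diaper", "milk"},
    {"bread", "coke", "diaper", "milk"},
]

def support_count(itemset, transactions):
    # sigma(itemset): number of transactions containing the itemset
    return sum(1 for t in transactions if itemset <= t)

def support(itemset, transactions):
    return support_count(itemset, transactions) / len(transactions)

def confidence(antecedent, consequent, transactions):
    # conf(A -> C) = sup(A u C) / sup(A)
    return support(antecedent | consequent, transactions) / support(antecedent, transactions)

print(support({"bread", "diaper", "milk"}, transactions))  # 0.4
print(confidence({"diaper"}, {"milk"}, transactions))      # 0.75
\end{lstlisting}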
|
||||
|
||||
|
||||
|
||||
\section{Frequent itemset generation}
|
||||
|
||||
\subsection{Brute force}
|
||||
Given $D$ items, there are $2^D$ possible itemsets.
|
||||
To compute the support of a single itemset, the complexity is $O(NW)$ where
|
||||
$N$ is the number of transactions and $W$ is the width of the largest transaction.
|
||||
Listing all the itemsets and computing their supports has an exponential complexity of $O(NW2^D)$.
|
||||
|
||||
|
||||
\subsection{Apriori principle}
|
||||
|
||||
\begin{theorem} \marginnote{Apriori principle}
|
||||
If an itemset is frequent, then all of its subsets are frequent.
|
||||
|
||||
\begin{proof}
|
||||
By the definition of support, it holds that:
|
||||
\[ \forall X, Y: (X \subseteq Y) \Rightarrow (\texttt{sup}(X) \geq \texttt{sup}(Y)) \]
|
||||
|
||||
In other words, the support metric is anti-monotone.
|
||||
\end{proof}
|
||||
\end{theorem}
|
||||
|
||||
\begin{corollary}
|
||||
If an itemset is infrequent, then all of its supersets are infrequent.
|
||||
\end{corollary}
|
||||
|
||||
\begin{example} \phantom{}
|
||||
\begin{center}
|
||||
\includegraphics[width=0.6\textwidth]{img/itemset_apriori.png}
|
||||
\end{center}
|
||||
\end{example}
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\caption{Apriori principle}
|
||||
\begin{lstlisting}[mathescape=true]
|
||||
def candidatesGeneration(freq_itemsets$_k$):
    candidate_itemsets$_{k+1}$ = selfJoin(freq_itemsets$_k$)
    for itemset in candidate_itemsets$_{k+1}$:
        for sub in subsetsOfSize($k$, itemset):
            if sub not in freq_itemsets$_k$:
                candidate_itemsets$_{k+1}$.remove(itemset)
    return candidate_itemsets$_{k+1}$

def aprioriItemsetGeneration(transactions, min_sup):
    freq_itemsets$_1$ = itemsetsOfSize(1, transactions)
    k = 1
    while freq_itemsets$_k$ is not empty:
        candidate_itemsets$_{k+1}$ = candidatesGeneration(freq_itemsets$_k$)
        freq_itemsets$_{k+1}$ = $\{ c \in \texttt{candidate\_itemsets}_{k+1} \mid \texttt{sup(}c\texttt{)} \geq \texttt{min\_sup} \}$
        k += 1
    return freq_itemsets$_1 \cup \dots \cup$ freq_itemsets$_k$
|
||||
\end{lstlisting}
|
||||
\end{algorithm}
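
To complement the pseudocode, the following is a compact runnable Python version of the same idea (an illustrative sketch, not the reference implementation): itemsets are represented as frozensets and \texttt{min\_sup} is a relative threshold.
\begin{lstlisting}
from itertools import combinations

def apriori_itemsets(transactions, min_sup):
    n = len(transactions)
    sup = lambda s: sum(1 for t in transactions if s <= t) / n
    items = {i for t in transactions for i in t}
    # Frequent 1-itemsets
    frequent = [{frozenset([i]) for i in items if sup(frozenset([i])) >= min_sup}]
    k = 1
    while frequent[-1]:
        # Self-join: merge pairs of frequent k-itemsets into (k+1)-itemsets
        candidates = {a | b for a in frequent[-1] for b in frequent[-1] if len(a | b) == k + 1}
        # Prune candidates having an infrequent k-subset (apriori principle)
        candidates = {c for c in candidates
                      if all(frozenset(s) in frequent[-1] for s in combinations(c, k))}
        frequent.append({c for c in candidates if sup(c) >= min_sup})
        k += 1
    return set().union(*frequent)  # Union of all the frequent itemsets found
\end{lstlisting}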
|
||||
|
||||
\begin{description}
|
||||
\item[Complexity]
|
||||
The complexity of the Apriori algorithm depends on:
|
||||
\begin{itemize}
|
||||
\item The choice of the support threshold.
|
||||
\item The number of unique items.
|
||||
\item The number and the width of the transactions.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Rule generation}
|
||||
|
||||
\subsection{Brute force}
|
||||
Given a frequent $k$-itemset $L$, there are $2^k-2$ possible association rules ($-2$ as $L \rightarrow \varnothing$ and $\varnothing \rightarrow L$ can be ignored).
|
||||
For each possible rule, it is necessary to compute the confidence. The overall complexity is exponential.
|
||||
|
||||
\subsection{Apriori principle}
|
||||
|
||||
\begin{theorem} \marginnote{Apriori principle}
|
||||
Without loss of generality, consider an itemset $\{ A, B, C, D \}$.
|
||||
It holds that:
|
||||
\[ \texttt{conf}(ABC \rightarrow D) \geq \texttt{conf}(AB \rightarrow CD) \geq \texttt{conf}(A \rightarrow BCD) \]
|
||||
\end{theorem}
|
||||
|
||||
\begin{example} \phantom{}
|
||||
\begin{center}
|
||||
\includegraphics[width=0.5\textwidth]{img/rules_apriori.png}
|
||||
\end{center}
|
||||
\end{example}
|
||||
|
||||
|
||||
|
||||
\section{Interestingness measures}
|
||||
|
||||
\begin{description}
|
||||
\item[Contingency table] \marginnote{Contingency table}
|
||||
Given an association rule $A \rightarrow C$, its contingency table is defined as:
|
||||
\begin{center}
|
||||
\def\arraystretch{1.1}
|
||||
\begin{tabular}{c|c|c|c}
|
||||
& $C$ & $\overline{C}$ & \\
|
||||
\hline
|
||||
$A$ & $\prob{A \land C}$ & $\prob{A \land \overline{C}}$ & $\prob{A}$ \\
|
||||
\hline
|
||||
$\overline{A}$ & $\prob{\overline{A} \land C}$ & $\prob{\overline{A} \land \overline{C}}$ & $\prob{\overline{A}}$ \\
|
||||
\hline
|
||||
& $\prob{C}$ & $\prob{\overline{C}}$ & 100 \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{description}
|
||||
|
||||
\begin{remark}
|
||||
Confidence can be misleading.
|
||||
\begin{example} \phantom{}\\
|
||||
\begin{minipage}[t]{0.36\textwidth}
|
||||
Given the following contingency table:
|
||||
\begin{center}
|
||||
\begin{tabular}{c|c|c|c}
|
||||
& coffee & $\overline{\text{coffee}}$ & \\
|
||||
\hline
|
||||
tea & 15 & 5 & 20 \\
|
||||
\hline
|
||||
$\overline{\text{tea}}$ & 75 & 5 & 80 \\
|
||||
\hline
|
||||
& 90 & 10 & 100 \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
\hspace{0.5cm}
|
||||
\begin{minipage}[t]{0.6\textwidth}
|
||||
We have that:
|
||||
\[ \texttt{conf}(\text{tea} \rightarrow \text{coffee}) = \frac{\texttt{sup}(\text{tea}, \text{coffee})}{\texttt{sup}(\text{tea})} = \frac{15}{20} = 0.75 \]
|
||||
But, we also have that:
|
||||
\[ \prob{\text{coffee}} = 0.9 \hspace*{1cm} \prob{\text{coffee} \mid \overline{\text{tea}}} = \frac{75}{80} = 0.9375 \]
|
||||
So, despite the high confidence of $(\text{tea} \rightarrow \text{coffee})$,
|
||||
the probability of coffee increases in absence of tea.
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
\end{remark}
|
||||
|
||||
|
||||
\subsection{Statistical-based measures}
|
||||
|
||||
Measures that take into account the statistical independence of the items.
|
||||
|
||||
\begin{description}
|
||||
\item[Lift] \marginnote{Lift}
|
||||
\[ \texttt{lift}(A \rightarrow C) = \frac{\texttt{conf}(A \rightarrow C)}{\texttt{sup}(C)} = \frac{\prob{A \land C}}{\prob{A}\prob{C}} \]
|
||||
|
||||
If $\texttt{lift}(A \rightarrow C) = 1$, then $A$ and $C$ are independent.
|
||||
|
||||
\item[Leverage] \marginnote{Leverage}
|
||||
\[ \texttt{leve}(A \rightarrow C) = \texttt{sup}(A \cup C) - \texttt{sup}(A)\texttt{sup}(C) = \prob{A \land C} - \prob{A}\prob{C} \]
|
||||
|
||||
If $\texttt{leve}(A \rightarrow C) = 0$, then $A$ and $C$ are independent.
|
||||
|
||||
\item[Conviction] \marginnote{Conviction}
|
||||
\[ \texttt{conv}(A \rightarrow C) = \frac{1 - \texttt{sup}(C)}{1 - \texttt{conf}(A \rightarrow C)} = \frac{\prob{A}(1-\prob{C})}{\prob{A}-\prob{A \land C}} \]
|
||||
\end{description}
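
For illustration (not from the course material), the measures can be computed directly from the probabilities of the contingency table; the values below refer to the tea/coffee example of the previous remark.
\begin{lstlisting}
def lift(p_ac, p_a, p_c):
    # P(A and C) / (P(A) * P(C))
    return p_ac / (p_a * p_c)

def leverage(p_ac, p_a, p_c):
    return p_ac - p_a * p_c

def conviction(p_ac, p_a, p_c):
    # (1 - sup(C)) / (1 - conf(A -> C))
    return (1 - p_c) / (1 - p_ac / p_a)

# tea -> coffee: P(tea and coffee) = 0.15, P(tea) = 0.2, P(coffee) = 0.9
print(lift(0.15, 0.2, 0.9))        # ~0.83 (< 1: negatively correlated)
print(leverage(0.15, 0.2, 0.9))    # -0.03
print(conviction(0.15, 0.2, 0.9))  # 0.4
\end{lstlisting}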
|
||||
|
||||
\begin{table}[H]
|
||||
\centering
|
||||
\begin{tabular}{c|p{10cm}}
|
||||
\hline
|
||||
\textbf{Metric} & \textbf{Interpretation} \\
|
||||
\hline
|
||||
High support & The rule applies to many transactions. \\
|
||||
\hline
|
||||
High confidence & When the antecedent occurs, the consequent also occurs with high probability. \\
|
||||
\hline
|
||||
High lift & Low chance that the rule is just a coincidence. \\
|
||||
\hline
|
||||
High conviction & The rule is violated less often compared to the case when the antecedent and consequent are independent. \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\caption{Intuitive interpretation of the measures}
|
||||
\end{table}
|
||||
|
||||
|
||||
|
||||
\section{Multi-dimensional association rules}
|
||||
|
||||
\begin{description}
|
||||
\item[Mono-dimensional events] \marginnote{Mono-dimensional events}
|
||||
Represented as transactions. Each event contains items that appear together.
|
||||
|
||||
\item[Multi-dimensional events] \marginnote{Multi-dimensional events}
|
||||
Represented as tuples. Each event contains the values of its attributes.
|
||||
|
||||
\item[Mono/Multi-dimensional equivalence] \marginnote{Equivalence}
|
||||
Mono-dimensional events can be converted into multi-dimensional events and vice versa.
|
||||
|
||||
To transform quantitative attributes, it is usually useful to discretize them.
|
||||
|
||||
\begin{example}[Multi to mono] \phantom{}\\
|
||||
\begin{minipage}{0.35\textwidth}
|
||||
\begin{center}
|
||||
\begin{tabular}{c|c|c}
|
||||
\textbf{Id} & \textbf{co2} & \textbf{tin\_oxide} \\
|
||||
\hline
|
||||
1 & high & medium \\
|
||||
2 & medium & low \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
$\rightarrow$
|
||||
\begin{minipage}{0.48\textwidth}
|
||||
\begin{center}
|
||||
\begin{tabular}{c|c}
|
||||
\textbf{Id} & \textbf{Transaction} \\
|
||||
\hline
|
||||
1 & $\{ \text{co2/high}, \text{tin\_oxide/medium} \}$ \\
|
||||
2 & $\{ \text{co2/medium}, \text{tin\_oxide/low} \}$ \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
|
||||
\begin{example}[Mono to multi] \phantom{}\\
|
||||
\begin{minipage}{0.35\textwidth}
|
||||
\begin{center}
|
||||
\begin{tabular}{c|c|c|c|c}
|
||||
\textbf{Id} & \textbf{a} & \textbf{b} & \textbf{c} & \textbf{d} \\
|
||||
\hline
|
||||
1 & yes & yes & no & no \\
|
||||
2 & yes & no & yes & no \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
$\leftarrow$
|
||||
\begin{minipage}{0.30\textwidth}
|
||||
\begin{center}
|
||||
\begin{tabular}{c|c}
|
||||
\textbf{Id} & \textbf{Transaction} \\
|
||||
\hline
|
||||
1 & $\{ \text{a}, \text{b} \}$ \\
|
||||
2 & $\{ \text{a}, \text{c} \}$ \\
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Multi-level association rules}
|
||||
Organize items into a hierarchy.
|
||||
|
||||
\begin{description}
|
||||
\item[Specialized to general] \marginnote{Specialized to general}
|
||||
Generally, the support of the rule increases.
|
||||
\begin{example}
|
||||
From $(\text{apple} \rightarrow \text{milk})$ to $(\text{fruit} \rightarrow \text{dairy})$
|
||||
\end{example}
|
||||
|
||||
\item[General to specialized] \marginnote{General to specialized}
|
||||
Generally, the support of the rule decreases.
|
||||
\begin{example}
|
||||
From $(\text{fruit} \rightarrow \text{dairy})$ to $(\text{apple} \rightarrow \text{milk})$
|
||||
\end{example}
|
||||
|
||||
\item[Redundant level] \marginnote{Redundant level}
|
||||
A more specialized rule in the hierarchy is redundant if its confidence is similar to the one of a more general rule.
|
||||
|
||||
\item[Multi-level association rule mining] \marginnote{Multi-level association rule mining}
|
||||
Run association rule mining on different levels of abstraction (general to specialized).
|
||||
At each level, the support threshold is decreased.
|
||||
\end{description}
|
||||
@ -0,0 +1,889 @@
|
||||
\chapter{Classification}
|
||||
|
||||
\begin{description}
|
||||
\item[(Supervised) classification] \marginnote{Classification}
|
||||
Given a finite set of classes $C$ and a dataset $\matr{X}$ of $N$ individuals,
|
||||
each associated to a class $y(\vec{x}) \in C$,
|
||||
we want to learn a model $\mathcal{M}$ able to
|
||||
guess the value of $y(\bar{\vec{x}})$ for unseen individuals.
|
||||
|
||||
Classification can be:
|
||||
\begin{descriptionlist}
|
||||
\item[Crisp] \marginnote{Crisp classification}
|
||||
Each individual has one and only one label.
|
||||
\item[Probabilistic] \marginnote{Probabilistic classification}
|
||||
Each individual is assigned to a label with a certain probability.
|
||||
\end{descriptionlist}
|
||||
|
||||
\item[Classification model] \marginnote{Classification model}
|
||||
A classification model (classifier) makes a prediction by taking as input
|
||||
a data element $\vec{x}$ and a decision function $y_\vec{\uptheta}$ parametrized on $\vec{\uptheta}$:
|
||||
\[ \mathcal{M}(\vec{x}, \vec{\uptheta}) = y_\vec{\uptheta}(\vec{x}) \]
|
||||
|
||||
\item[Vapnik-Chervonenkis dimension] \marginnote{Vapnik-Chervonenkis dimension}
|
||||
A dataset with $N$ elements defines $2^N$ learning problems.
|
||||
A model $\mathcal{M}$ has Vapnik-Chervonenkis (VC) dimension $N$ if
$N$ is the largest number of elements for which it is able to solve all the $2^N$ possible learning problems.
|
||||
|
||||
\begin{example}
|
||||
A straight line has VC dimension 3.
|
||||
\end{example}
|
||||
|
||||
\item[Data exploration] \marginnote{Data exploration}
|
||||
\begin{figure}[ht]
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_iris_boxplot_general.pdf}
|
||||
\caption{Iris dataset general boxplot}
|
||||
\end{subfigure}%
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_iris_boxplot_inside.pdf}
|
||||
\caption{Iris dataset class boxplot}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_iris_histogram.pdf}
|
||||
\caption{Iris dataset histograms}
|
||||
\end{subfigure}%
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_iris_pairplot.pdf}
|
||||
\caption{Iris dataset pairplots}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\item[Hyperparameters]
|
||||
Parameters of the model that have to be manually chosen.
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Evaluation}
|
||||
|
||||
\begin{description}
|
||||
\item[Dataset split]
|
||||
A supervised dataset can be randomly split into:
|
||||
\begin{descriptionlist}
|
||||
\item[Train set] \marginnote{Train set}
|
||||
Used to learn the model. Usually the largest split. Can be seen as an upper bound of the model performance.
|
||||
\item[Test set] \marginnote{Test set}
|
||||
Used to evaluate the trained model. Can be seen as a lower bound of the model performance.
|
||||
\item[Validation set] \marginnote{Validation set}
|
||||
Used to evaluate the model during training and/or for tuning parameters.
|
||||
\end{descriptionlist}
|
||||
It is assumed that the splits have similar characteristics.
|
||||
|
||||
\item[Overfitting] \marginnote{Overfitting}
|
||||
Given a dataset $\matr{X}$, a model $\mathcal{M}$ is overfitting if
|
||||
there exists another model $\mathcal{M}'$ such that:
|
||||
\[
|
||||
\begin{split}
|
||||
\texttt{error}_\text{train}(\mathcal{M}) &< \texttt{error}_\text{train}(\mathcal{M}') \\
|
||||
\texttt{error}_\matr{X}(\mathcal{M}) &> \texttt{error}_\matr{X}(\mathcal{M}') \\
|
||||
\end{split}
|
||||
\]
|
||||
|
||||
Possible causes of overfitting are:
|
||||
\begin{itemize}
|
||||
\item Noisy data.
|
||||
\item Lack of representative instances.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Test set error}
|
||||
\textbf{\underline{Disclaimer: I'm very unsure about this part}}\\
|
||||
The error on the test set can be seen as a lower bound error of the model.
|
||||
If the test set error ratio is $x$, we can expect an error of $(x \pm \text{confidence interval})$.
|
||||
|
||||
Predicting the elements of the test set can be seen as a binomial process (i.e. a series of $N$ Bernoulli trials).
|
||||
We can therefore compute the empirical frequency of success as $f = (\text{correct predictions}/N)$.
|
||||
We want to estimate the probability of success $p$.
|
||||
|
||||
We assume that the deviation between the empirical frequency and the true frequency is due to a
|
||||
normal noise around the true probability (i.e. the true probability $p$ is the mean).
|
||||
Fixed a confidence level $\alpha$ (i.e. the probability of a wrong estimate),
|
||||
we want that:
|
||||
\[ \prob{ z_{\frac{\alpha}{2}} \leq \frac{f-p}{\sqrt{\frac{1}{N}p(1-p)}} \leq z_{(1-\frac{\alpha}{2})} } = 1 - \alpha \]
|
||||
In other words, we want the middle term to have a high probability to
|
||||
be between the $\frac{\alpha}{2}$ and $(1-\frac{\alpha}{2})$ quantiles of the gaussian.
|
||||
\begin{center}
|
||||
\includegraphics[width=0.4\textwidth]{img/normal_quantile_test_error.png}
|
||||
\end{center}
|
||||
|
||||
We can estimate $p$ using the Wilson score interval\footnote{\url{https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval}}:
|
||||
\[ p = \frac{1}{1+\frac{1}{N}z^2} \left( f + \frac{1}{2N}z^2 \pm z\sqrt{\frac{1}{N}f(1-f) + \frac{z^2}{4N^2}} \right) \]
|
||||
where $z$ depends on the value of $\alpha$.
|
||||
For a pessimistic estimate, $\pm$ becomes a $+$. Vice versa, for an optimistic estimate, $\pm$ becomes a $-$.
|
||||
|
||||
As $N$ appears in the denominator, the uncertainty becomes smaller for large values of $N$.
|
||||
\begin{center}
|
||||
\includegraphics[width=0.45\textwidth]{img/confidence_interval.png}
|
||||
\end{center}
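
The following Python sketch (mine, using \texttt{scipy} only for the normal quantile) makes the Wilson score interval concrete.
\begin{lstlisting}
from scipy.stats import norm

def wilson_interval(f, N, alpha=0.05):
    # f: empirical frequency of success, N: number of test samples
    z = norm.ppf(1 - alpha / 2)  # quantile of the standard normal
    center = (f + z**2 / (2 * N)) / (1 + z**2 / N)
    half = z * (f * (1 - f) / N + z**2 / (4 * N**2)) ** 0.5 / (1 + z**2 / N)
    return center - half, center + half  # lower and upper estimate of p

# 85% accuracy measured on 200 test samples
print(wilson_interval(0.85, 200))  # roughly (0.79, 0.89)
\end{lstlisting}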
|
||||
|
||||
\subsection{Dataset splits}
|
||||
|
||||
\begin{description}
|
||||
\item[Holdout] \marginnote{Holdout}
|
||||
The dataset is split into train, test and, if needed, validation.
|
||||
|
||||
\item[Cross-validation] \marginnote{Cross-validation}
|
||||
The training data is partitioned into $k$ chunks.
|
||||
For $k$ iterations, one of the chunks is used to test and the others to train a new model.
|
||||
The overall error is obtained as the average of the errors of the $k$ iterations.
|
||||
|
||||
In the end, the final model is still trained on the entire training data,
|
||||
while cross-validation results are used as an evaluation and comparison metric.
|
||||
Note that cross-validation is done on the training set, so a final test set can still be used to
|
||||
evaluate the resulting model.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.6\textwidth]{img/cross_validation.png}
|
||||
\caption{Cross-validation example}
|
||||
\end{figure}
|
||||
|
||||
\item[Leave-one-out] \marginnote{Leave-one-out}
|
||||
Extreme case of cross-validation with $k=N$, the size of the training set.
|
||||
In this case, the whole dataset but one element is used for training and the remaining entry for testing.
|
||||
|
||||
\item[Bootstrap] \marginnote{Bootstrap}
|
||||
Statistical sampling of the dataset with replacement (i.e. an entry can be selected multiple times).
|
||||
The selected entries form the training set while the elements that have never been selected are used for testing.
|
||||
\end{description}
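
A minimal Python sketch of $k$-fold cross-validation (illustrative; \texttt{train\_model} and \texttt{error} are assumed to be provided by the user):
\begin{lstlisting}
import random

def cross_validation(data, k, train_model, error):
    # data: list of (x, y) pairs; train_model and error are user-provided
    data = data[:]
    random.shuffle(data)
    folds = [data[i::k] for i in range(k)]  # k chunks of (almost) equal size
    errors = []
    for i in range(k):
        test = folds[i]
        train = [s for j, fold in enumerate(folds) if j != i for s in fold]
        model = train_model(train)
        errors.append(error(model, test))
    return sum(errors) / k  # average error over the k iterations
\end{lstlisting}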
|
||||
|
||||
|
||||
\subsection{Binary classification performance measures}
|
||||
|
||||
In binary classification, the two classes can be distinguished as the positive and negative labels.
|
||||
The prediction of a classifier can be a:
|
||||
\begin{center}
|
||||
True positive ($TP$) $\cdot$ False positive ($FP$) $\cdot$ True negative ($TN$) $\cdot$ False negative ($FN$)
|
||||
\end{center}
|
||||
|
||||
\begin{center}
|
||||
\begin{tabular}{|c|c|c|c|}
|
||||
\cline{3-4}
|
||||
\multicolumn{2}{c|}{} & \multicolumn{2}{c|}{Predicted} \\
|
||||
\cline{3-4}
|
||||
\multicolumn{2}{c|}{} & Pos & Neg \\
|
||||
\hline
|
||||
\multirow{2}{*}{\rotatebox[origin=c]{90}{True}} & Pos & $TP$ & $FN$ \\
|
||||
\cline{2-4}
|
||||
& Neg & $FP$ & $TN$ \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
|
||||
Given a test set of $N$ element, possible metrics are:
|
||||
\begin{descriptionlist}
|
||||
\item[Accuracy] \marginnote{Accuracy}
|
||||
Fraction of correct predictions.
|
||||
\[ \text{accuracy} = \frac{TP + TN}{N} \]
|
||||
|
||||
\item[Error rate] \marginnote{Error rate}
|
||||
Fraction of incorrect predictions.
|
||||
\[ \text{error rate} = 1 - \text{accuracy} \]
|
||||
|
||||
\item[Precision] \marginnote{Precision}
|
||||
Fraction of true positives among what the model classified as positive
(i.e. how many of the samples classified as positive are real positives).
|
||||
\[ \text{precision} = \frac{TP}{TP + FP} \]
|
||||
|
||||
\item[Recall/Sensitivity] \marginnote{Recall}
|
||||
Fraction of true positives among the real positives
(i.e. how many of the real positives the model retrieved).
|
||||
\[ \text{recall} = \frac{TP}{TP + FN} \]
|
||||
|
||||
\item[Specificity] \marginnote{Specificity}
|
||||
Fraction of true negatives among the real negatives
|
||||
(i.e. recall for negative labels).
|
||||
\[ \text{specificity} = \frac{TN}{TN + FP} \]
|
||||
|
||||
\item[F1 score] \marginnote{F1 score}
|
||||
Harmonic mean of precision and recall
|
||||
(i.e. measure of balance between precision and recall).
|
||||
\[ \text{F1} = 2 \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} \]
|
||||
\end{descriptionlist}
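
For reference, a small Python sketch (not from the course material) computing the measures above from the entries of the confusion matrix:
\begin{lstlisting}
def binary_metrics(tp, fn, fp, tn):
    n = tp + fn + fp + tn
    accuracy = (tp + tn) / n
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    specificity = tn / (tn + fp)
    f1 = 2 * precision * recall / (precision + recall)
    return {"accuracy": accuracy, "error rate": 1 - accuracy,
            "precision": precision, "recall": recall,
            "specificity": specificity, "f1": f1}

print(binary_metrics(tp=40, fn=10, fp=5, tn=45))
\end{lstlisting}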
|
||||
|
||||
|
||||
\subsection{Multi-class classification performance measures}
|
||||
|
||||
\begin{descriptionlist}
|
||||
\item[Confusion matrix] \marginnote{Confusion matrix}
|
||||
Matrix to correlate the predictions of $n$ classes:
|
||||
\begin{center}
|
||||
\begin{tabular}{|c|c|c|c|c|c|}
|
||||
\cline{3-6}
|
||||
\multicolumn{2}{c|}{} & \multicolumn{4}{c|}{Predicted} \\
|
||||
\cline{3-6}
|
||||
\multicolumn{2}{c|}{} & a & b & c & Total \\
|
||||
\hline
|
||||
\multirow{4}{*}{\rotatebox[origin=c]{90}{True}}
|
||||
& a & $TP_a$ & $FP_{a-b}$ & $FP_{a-c}$ & $T_a$ \\
|
||||
\cline{2-6}
|
||||
& b & $FP_{b-a}$ & $TP_b$ & $FP_{b-c}$ & $T_b$ \\
|
||||
\cline{2-6}
|
||||
& c & $FP_{c-a}$ & $FP_{c-b}$ & $TP_c$ & $T_c$ \\
|
||||
\cline{2-6}
|
||||
& Total & $P_a$ & $P_b$ & $P_c$ & $N$ \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
where:
|
||||
\begin{itemize}
|
||||
\item $a$, $b$ and $c$ are the classes.
|
||||
\item $T_x$ is the true number of labels of class $x$ in the dataset.
|
||||
\item $P_x$ is the predicted number of labels of class $x$ in the dataset.
|
||||
\item $TP_x$ is the number of times a class $x$ was correctly predicted (true predictions).
|
||||
\item $FP_{i-j}$ is the number of times a class $i$ was predicted as $j$ (false predictions).
|
||||
\end{itemize}
|
||||
|
||||
\item[Accuracy] \marginnote{Accuracy}
|
||||
Accuracy is extended from the binary case as:
|
||||
\[ \text{accuracy} = \frac{\sum_i TP_i}{N} \]
|
||||
|
||||
\item[Precision] \marginnote{Precision}
|
||||
Precision is defined w.r.t. a single class:
|
||||
\[ \text{precision}_i = \frac{TP_i}{P_i} \]
|
||||
|
||||
\item[Recall] \marginnote{Recall}
|
||||
Recall is defined w.r.t. a single class:
|
||||
\[ \text{recall}_i = \frac{TP_i}{T_i} \]
|
||||
\end{descriptionlist}
|
||||
|
||||
If a single value of precision or recall is needed, the mean can be used by computing
|
||||
a macro (unweighted) average or a class-weighted average.
|
||||
|
||||
\begin{description}
|
||||
\item[$\kappa$-statistic] \marginnote{$\kappa$-statistic}
|
||||
Evaluates the concordance between two classifiers (in our case, the predictor and the ground truth).
|
||||
It is based on two probabilities:
|
||||
\begin{descriptionlist}
|
||||
\item[Probability of concordance] $\prob{c} = \frac{\sum_{i}^{\texttt{classes}} TP_i}{N}$
|
||||
\item[Probability of random concordance] $\prob{r} = \frac{\sum_{i}^{\texttt{classes}} T_i P_i}{N^2}$
|
||||
\end{descriptionlist}
|
||||
|
||||
$\kappa$-statistic is given by:
|
||||
\[ \kappa = \frac{\prob{c} - \prob{r}}{1 - \prob{r}} \in [-1, 1] \]
|
||||
When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = N$),
when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$) and
when $\kappa = 0$, the agreement is the one expected by chance.
|
||||
\end{description}
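
As an example (mine), the $\kappa$-statistic can be computed from a confusion matrix where rows are the true classes and columns the predicted ones:
\begin{lstlisting}
def kappa_statistic(confusion):
    # confusion[i][j]: number of elements of true class i predicted as class j
    n = sum(sum(row) for row in confusion)
    p_c = sum(confusion[i][i] for i in range(len(confusion))) / n
    p_r = sum(sum(confusion[i]) * sum(row[i] for row in confusion)
              for i in range(len(confusion))) / n**2
    return (p_c - p_r) / (1 - p_r)

confusion = [[50, 3, 2],
             [4, 40, 6],
             [1, 5, 39]]
print(kappa_statistic(confusion))  # ~0.79
\end{lstlisting}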
|
||||
|
||||
|
||||
\subsection{Probabilistic classifier performance measures}
|
||||
|
||||
\begin{description}
|
||||
\item[Lift chart] \marginnote{Lift chart}
|
||||
Used in binary classification.
|
||||
Given the resulting probabilities of the positive class of a classifier,
|
||||
sort them in decreasing order and plot a 2d-chart with
|
||||
increasing sample size on the x-axis and the number of positive samples on the y-axis.
|
||||
|
||||
Then, plot a straight line to represent a baseline classifier that makes random choices.
|
||||
As the probabilities are sorted in decreasing order, it is expected a high concentration of
|
||||
positive labels on the right side.
|
||||
When the area between the two curves is large and the curve is above the random classifier,
|
||||
the model can be considered a good classifier.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.5\textwidth]{img/lift_chart.png}
|
||||
\caption{Example of lift chart}
|
||||
\end{figure}
|
||||
|
||||
\item[ROC curve] \marginnote{ROC curve}
|
||||
The ROC curve can be seen as a way to represent multiple confusion matrices of a classifier
|
||||
that uses different thresholds.
|
||||
The x-axis of a ROC curve represents the false positive rate while the y-axis represents the true positive rate.
|
||||
|
||||
A straight line is used to represent a random classifier.
|
||||
A threshold can be considered good if it is high on the y-axis and low on the x-axis.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.35\textwidth]{img/roc_curve.png}
|
||||
\caption{Example of ROC curves}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Data imbalance}
|
||||
A classifier may not perform well when predicting a minority class of the training data.
|
||||
Possible solutions are:
|
||||
\begin{descriptionlist}
|
||||
\item[Undersampling] \marginnote{Undersampling}
|
||||
Randomly reduce the number of examples of the majority classes.
|
||||
|
||||
\item[Oversampling] \marginnote{Oversampling}
|
||||
Increase the examples of the minority classes.
|
||||
|
||||
\begin{description}
|
||||
\item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
|
||||
\begin{enumerate}
|
||||
\item Randomly select an example $x$ belonging to the minority class.
|
||||
\item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
|
||||
\item Synthesize a new example by selecting a random point of the feature space between $x$ and $z_i$.
|
||||
\end{enumerate}
|
||||
\end{description}
|
||||
|
||||
\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
|
||||
Assign a cost to the errors. Higher weights are assigned to minority classes.
|
||||
This can be done by:
|
||||
\begin{itemize}
|
||||
\item Altering the proportions of the dataset by duplicating the samples whose misclassification should weigh more.
|
||||
\item Weighting the classes (possible in some algorithms).
|
||||
\end{itemize}
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
\section{Decision trees}
|
||||
|
||||
\subsection{Information theory} \label{sec:information_theory}
|
||||
|
||||
\begin{description}
|
||||
\item[Shannon theorem] \marginnote{Shannon theorem}
|
||||
Let $\matr{X} = \{ \vec{v}_1, \dots, \vec{v}_V \}$ be a data source where
|
||||
each of the possible values has probability $p_i = \prob{\vec{v}_i}$.
|
||||
The best encoding allows to transmit $\matr{X}$ with
|
||||
an average number of bits given by the \textbf{entropy} of $X$: \marginnote{Entropy}
|
||||
\[ H(\matr{X}) = - \sum_j p_j \log_2(p_j) \]
|
||||
$H(\matr{X})$ can be seen as a weighted sum of the surprise factor $-\log_2(p_j)$.
|
||||
If $p_j \sim 1$, then the surprise of observing $\vec{v}_j$ is low, vice versa,
|
||||
if $p_j \sim 0$, the surprise of observing $\vec{v}_j$ is high.
|
||||
|
||||
Therefore, when $H(\matr{X})$ is high, $\matr{X}$ is close to a uniform distribution.
|
||||
When $H(\matr{X})$ is low, $\matr{X}$ is close to a constant.
|
||||
|
||||
\begin{example}[Binary source] \phantom{}\\
|
||||
\begin{minipage}{.50\linewidth}
|
||||
The two values of a binary source $\matr{X}$ have respectively probability $p$ and $(1-p)$.
|
||||
When $p \sim 0$ or $p \sim 1$, $H(\matr{X}) \sim 0$.\\
|
||||
When $p \sim 0.5$, $H(\matr{X}) \sim \log_2(2)=1$
|
||||
\end{minipage}
|
||||
\begin{minipage}{.45\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/binary_entropy.png}
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
|
||||
\item[Entropy threshold split] \marginnote{Entropy threshold split}
|
||||
Given a dataset $\matr{D}$,
|
||||
a real-valued attribute $d \in \matr{D}$,
|
||||
a threshold $t$ in the domain of $d$ and
|
||||
the class attribute $c$ of $\matr{D}$.
|
||||
The entropy of the class $c$ of the dataset $\matr{D}$ split with threshold $t$ on $d$ is a weighted sum:
|
||||
\[ H(c \,\vert\, d \,:\, t) = \prob{d < t}H(c \,\vert\, d < t) + \prob{d \geq t}H(c \,\vert\, d \geq t) \]
|
||||
|
||||
\item[Information gain] \marginnote{Information gain}
|
||||
Information gain measures the reduction in entropy after applying a split.
|
||||
It is computed as:
|
||||
\[ IG(c \,\vert\, d \,:\, t) = H(c) - H(c \,\vert\, d \,:\, t) \]
|
||||
When $H(c \,\vert\, d \,:\, t)$ is low, $IG(c \,\vert\, d \,:\, t)$ is high
|
||||
as splitting with threshold $t$ results in purer groups.
|
||||
Vice versa, when $H(c \,\vert\, d \,:\, t)$ is high, $IG(c \,\vert\, d \,:\, t)$ is low
|
||||
as splitting with threshold $t$ is not very useful.
|
||||
|
||||
The information gain of a class $c$ split on a feature $d$ is given by:
|
||||
\[ IG(c \,\vert\, d) = \max_t IG(c \,\vert\, d \,:\, t) \]
|
||||
\end{description}
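
To make the definitions concrete, a Python sketch (mine) of the entropy of a class distribution and of the information gain of a threshold split:
\begin{lstlisting}
from math import log2
from collections import Counter

def entropy(labels):
    counts = Counter(labels)
    n = len(labels)
    return -sum(c / n * log2(c / n) for c in counts.values())

def information_gain(values, labels, threshold):
    # values: numeric attribute d, labels: class c, split on d < t / d >= t
    left = [y for x, y in zip(values, labels) if x < threshold]
    right = [y for x, y in zip(values, labels) if x >= threshold]
    n = len(labels)
    h_split = len(left) / n * entropy(left) + len(right) / n * entropy(right)
    return entropy(labels) - h_split

values = [1.0, 1.5, 2.0, 3.0, 3.5, 4.0]
labels = ["a", "a", "a", "b", "b", "b"]
print(information_gain(values, labels, threshold=2.5))  # 1.0: the split is pure
\end{lstlisting}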
|
||||
|
||||
|
||||
\subsection{Tree construction}
|
||||
|
||||
\begin{description}
|
||||
\item[Decision tree (C4.5)] \marginnote{Decision tree}
|
||||
Tree-shaped classifier where leaves are class predictions and
|
||||
inner nodes represent conditions that guide to a leaf.
|
||||
This type of classifier is non-linear (i.e. does not represent a linear separation).
|
||||
|
||||
Each node of the tree contains:
|
||||
\begin{itemize}
|
||||
\item The applied splitting criteria (i.e. feature and threshold).
|
||||
Leaves do not have this value.
|
||||
\item The purity (e.g. entropy) of the current split.
|
||||
\item Dataset coverage of the current split.
|
||||
\item Classes distribution.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.5\textwidth]{img/_iris_decision_tree_example.pdf}
|
||||
\caption{Example of decision tree}
|
||||
\end{figure}
|
||||
|
||||
Note: the weighted sum of the entropies of the children is always smaller than the entropy of the parent.
|
||||
|
||||
Possible stopping conditions are:
|
||||
\begin{itemize}
|
||||
\item When most of the leaves are pure (i.e. nothing useful to split).
|
||||
\item When some leaves are impure but none of the possible splits have positive $IG$.
|
||||
Impure leaves are labeled with the majority class.
|
||||
\end{itemize}
|
||||
|
||||
\item[Purity] \marginnote{Purity}
|
||||
Value to maximize when splitting a node of a decision tree.
|
||||
|
||||
Nodes with uniformly distributed classes have a low purity.
|
||||
Nodes with a single class have the highest purity.
|
||||
|
||||
Possible impurity measures are:
|
||||
\begin{descriptionlist}
|
||||
\item[Entropy/Information gain] See \Cref{sec:information_theory}.
|
||||
|
||||
\item[Gini index] \marginnote{Gini index}
|
||||
Let $\matr{X}$ be a dataset with classes $C$.
|
||||
The Gini index measures how often an element of $\matr{X}$ would be misclassified
|
||||
if the labels were randomly assigned based on the frequencies of the classes in $\matr{X}$.
|
||||
|
||||
Given a class $i \in C$, $p_i$ is the probability (i.e. frequency) of classifying an element with $i$ and
|
||||
$(1 - p_i)$ is the probability of classifying it with a different label.
|
||||
The Gini index is given by:
|
||||
\[
|
||||
\begin{split}
|
||||
GINI(\matr{X}) = \sum_i^C p_i (1-p_i) &= \sum_i^C p_i - \sum_i^C p_i^2 \\
|
||||
&= 1 - \sum_i^C p_i^2
|
||||
\end{split}
|
||||
\]
|
||||
When $\matr{X}$ is uniformly distributed, $GINI(\matr{X}) \sim (1-\frac{1}{\vert C \vert})$.
|
||||
When $\matr{X}$ is constant, $GINI(\matr{X}) \sim 0$.
|
||||
|
||||
Given a node $x$ split in $n$ children $x_1, \dots, x_n$,
|
||||
the Gini gain of the split is given by:
|
||||
\[ GINI_\text{gain} = GINI(x) - \sum_{i=1}^n \frac{\vert x_i \vert}{\vert x \vert} GINI(x_i) \]
|
||||
|
||||
\item[Misclassification error] \marginnote{Misclassification error}
|
||||
Skipped.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.35\textwidth]{img/impurity_comparison.png}
|
||||
\caption{Comparison of impurity measures}
|
||||
\end{figure}
|
||||
|
||||
Compared to Gini index, entropy is more robust to noise.
|
||||
|
||||
Misclassification error has a bias toward the major class.
|
||||
\end{description}
|
||||
|
||||
\begin{algorithm}[H]
|
||||
\caption{Decision tree construction using information gain as impurity measure}
|
||||
\begin{lstlisting}
|
||||
def buildTree(split):
    node = Node()
    if len(split.classes) == 1: # Pure split
        node.label = split.classes[0]
        node.isLeaf = True
    else:
        ig, attribute, threshold = getMaxInformationGain(split)
        if ig <= 0: # No split with positive information gain
            node.label = split.majorityClass()
            node.isLeaf = True
        else:
            node.left = buildTree(split[attribute < threshold])
            node.right = buildTree(split[attribute >= threshold])
    return node
|
||||
\end{lstlisting}
|
||||
\end{algorithm}
|
||||
|
||||
\begin{description}
|
||||
\item[Pruning] \marginnote{Pruning}
|
||||
Remove branches to reduce overfitting.
|
||||
Different pruning techniques can be employed:
|
||||
\begin{descriptionlist}
|
||||
\item[Maximum depth]
|
||||
Maximum depth allowed for the tree.
|
||||
|
||||
\item[Minimum samples for split]
|
||||
Minimum number of samples a node is required to have to apply a split.
|
||||
|
||||
\item[Minimum samples for a leaf]
|
||||
Minimum number of samples a node is required to have to become a leaf.
|
||||
|
||||
\item[Minimum impurity decrease]
|
||||
Minimum decrease in impurity for a split to be made.
|
||||
|
||||
\item[Statistical pruning]
|
||||
Prune the children of a node if the weighted sum of the maximum errors of the children is greater than
|
||||
the maximum error of the node if it was a leaf.
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Complexity}
|
||||
Given a dataset $\matr{X}$ of $N$ instances and $D$ attributes,
|
||||
each level of the tree requires to evaluate the whole dataset and
|
||||
each node requires to process all the attributes.
|
||||
Assuming an average height of $O(\log N)$,
|
||||
the overall complexity for induction (parameters search) is $O(DN \log N)$.
|
||||
|
||||
Moreover, the other operations of a binary tree have complexity:
|
||||
\begin{itemize}
|
||||
\item Threshold search and binary split: $O(N \log N)$ (scan the dataset for the threshold).
|
||||
\item Pruning: $O(N \log N)$ (requires to scan the dataset).
|
||||
\end{itemize}
|
||||
|
||||
For inference, to classify a new instance it is sufficient to traverse the tree from the root to a leaf.
|
||||
This has complexity $O(h)$, with $h$ the height of the tree.
|
||||
|
||||
|
||||
\subsection{Characteristics}
|
||||
\begin{itemize}
|
||||
\item Decision trees are non-parametric in the sense that they do not require any assumption on the distribution of the data.
|
||||
\item Finding the best tree is an NP-complete problem.
|
||||
\item Decision trees are robust to noise if appropriate overfitting methods are applied.
|
||||
\item Decision trees are robust to redundant attributes (correlated attributes are very unlikely to be chosen for multiple splits).
|
||||
\item In practice, the impurity measure has a low impact on the final result, while the pruning strategy is more relevant.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
|
||||
\section{Naive Bayes}
|
||||
|
||||
\begin{description}
|
||||
\item[Bayes' theorem]
|
||||
Given a class $c$ and the evidence $\vec{e}$, we have that:
|
||||
\[ \prob{c \mid \vec{e}} = \frac{\prob{\vec{e} \mid c} \prob{c}}{\prob{\vec{e}}} \]
|
||||
|
||||
\item[Naive Bayes classifier] \marginnote{Naive Bayes classifier}
|
||||
Classifier that uses the Bayes' theorem assuming that the attributes are independent given the class.
|
||||
Given a class $c$ and the evidence $\vec{e} = \langle e_1, e_2, \dots, e_n \rangle$, the probability that
|
||||
the observation $\vec{e}$ is of class $c$ is given by:
|
||||
\[
|
||||
\prob{c \mid \vec{e}} = \frac{\prod_{i=1}^{n}\prob{e_i \mid c} \cdot \prob{c}}{\prob{\vec{e}}}
|
||||
\]
|
||||
As the denominator is the same for all classes, it can be omitted.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Training and inference}
|
||||
\begin{description}
|
||||
\item[Training] \marginnote{Training}
|
||||
Given the classes $C$ and the features $E$,
|
||||
to train the classifier the following priors need to be estimated:
|
||||
\begin{itemize}
|
||||
\item $\forall c \in C:\, \prob{c}$
|
||||
\item $\forall e_{ij} \in E, \forall c \in C:\, \prob{e_{ij} \mid c}$,
|
||||
where $e_{ij}$ is the $j$-th value of the domain of the $i$-th feature $E_i$.
|
||||
\end{itemize}
|
||||
|
||||
\item[Inference] \marginnote{Inference}
|
||||
Given a new observation $\vec{x}_\text{new} = \langle x_1, x_2, \dots, x_n \rangle$,
|
||||
its class is determined by computing the likelihood:
|
||||
\[
|
||||
c_\text{new} = \arg\max_{c \in C} \prob{c} \prod_{i=1}^{n}\prob{x_i \mid c}
|
||||
\]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Problems}
|
||||
\begin{description}
|
||||
\item[Smoothing]
|
||||
If the value $e_{ij}$ of the domain of a feature $E_i$ never appears in the dataset,
|
||||
its probability $\prob{e_{ij} \mid c}$ will be 0 for all classes.
|
||||
This nullifies all the probabilities that use this feature when
|
||||
computing the chain of products during inference.
|
||||
Smoothing methods can be used to avoid this problem.
|
||||
|
||||
\begin{description}
|
||||
\item[Laplace smoothing] \marginnote{Laplace smoothing}
|
||||
Given:
|
||||
\begin{descriptionlist}
|
||||
\item[$\alpha$] The smoothing factor.
|
||||
\item[\normalfont$\text{af}_{e_{ij}, c}$] The absolute frequency of the value $e_{ij}$ of the feature $E_i$ over the class $c$.
|
||||
\item[$\vert \mathbb{D}_{E_i} \vert$] The number of distinct values in the domain of $E_i$.
|
||||
\item[\normalfont$\text{af}_{c}$] The absolute frequency of the class $c$.
|
||||
\end{descriptionlist}
|
||||
The smoothed frequency is computed as:
|
||||
\[
|
||||
\prob{e_{ij} \mid c} = \frac{\text{af}_{e_{ij}, c} + \alpha}{\text{af}_{c} + \alpha \vert \mathbb{D}_{E_i} \vert}
|
||||
\]
|
||||
|
||||
A common value of $\alpha$ is 1.
|
||||
When $\alpha = 0$, there is no smoothing.
|
||||
For higher values of $\alpha$, the smoothed feature gains more importance when computing the priors.
|
||||
\end{description}
|
||||
|
||||
\item[Missing values] \marginnote{Missing values}
|
||||
Naive Bayes is robust to missing values.
|
||||
|
||||
During training, the record is ignored in the frequency count of the missing feature.
|
||||
|
||||
During inference, the missing feature can be simply excluded in the computation of the likelihood
|
||||
as this equally affects all classes.
|
||||
|
||||
\item[Numeric values] \marginnote{Gaussian assumption}
|
||||
For continuous numeric values, the frequency count method cannot be used.
|
||||
Therefore, an additional assumption is made: numeric values follow a Gaussian distribution.
|
||||
|
||||
During training, the mean $\mu_{i,c}$ and standard deviation $\sigma_{i,c}$ of a numeric feature $E_i$ are computed with respect to each class $c$.
|
||||
Its probability is then obtained as:
|
||||
\[ \prob{E_i = x \mid c} = \mathcal{N}(\mu_{i,c}, \sigma_{i,c})(x) \]
|
||||
|
||||
\end{description}
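
A compact Python sketch (mine, not the reference implementation) tying together training with frequency counts, Laplace smoothing and inference for categorical features:
\begin{lstlisting}
from collections import Counter, defaultdict

def train_nb(X, y, alpha=1.0):
    class_counts = Counter(y)
    feat_counts = defaultdict(Counter)  # (feature index, class) -> value counts
    domains = defaultdict(set)          # feature index -> distinct values
    for xs, c in zip(X, y):
        for i, v in enumerate(xs):
            feat_counts[(i, c)][v] += 1
            domains[i].add(v)
    return class_counts, feat_counts, domains, alpha

def predict_nb(model, xs):
    class_counts, feat_counts, domains, alpha = model
    n = sum(class_counts.values())
    best, best_score = None, 0.0
    for c, cc in class_counts.items():
        score = cc / n  # P(c)
        for i, v in enumerate(xs):
            # Laplace-smoothed P(e_ij | c)
            score *= (feat_counts[(i, c)][v] + alpha) / (cc + alpha * len(domains[i]))
        if best is None or score > best_score:
            best, best_score = c, score
    return best

X = [["sunny", "hot"], ["sunny", "mild"], ["rainy", "mild"], ["rainy", "hot"]]
y = ["no", "yes", "yes", "no"]
model = train_nb(X, y)
print(predict_nb(model, ["sunny", "mild"]))  # "yes"
\end{lstlisting}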
|
||||
|
||||
|
||||
|
||||
\section{Perceptron}
|
||||
|
||||
\begin{description}
|
||||
\item[Perceptron] \marginnote{Perceptron}
|
||||
A single artificial neuron that takes $n$ inputs $x_1, \dots, x_n$ and a bias $b$,
|
||||
and computes a linear combination of them with weights $w_1, \dots, w_n, w_b$.
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.25\textwidth]{img/_perceptron.pdf}
|
||||
\caption{Example of perceptron}
|
||||
\end{figure}
|
||||
|
||||
The learnt weights $w_b, w_1, \dots, w_n$ define a hyperplane for binary classification such that:
|
||||
\[
|
||||
w_1 x_1 + \text{\dots} + w_n x_n + w_b b = \begin{cases}
|
||||
\texttt{positive} & \text{if $> 0$} \\
|
||||
\texttt{negative} & \text{if $< 0$} \\
|
||||
\end{cases}
|
||||
\]
|
||||
It can be shown that there are either none or infinitely many hyperplanes with this property.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Training}
|
||||
\begin{algorithm}
|
||||
\caption{Perceptron training}
|
||||
\begin{lstlisting}[mathescape=true]
|
||||
def trainPerceptron(dataset):
    perceptron = Perceptron(weights=[0 $\dots$ 0])

    while accuracy(perceptron, dataset) != 1.0:
        for x, y in dataset:
            if perceptron.predict(x) != y:
                if y is positive_class:
                    perceptron.weights += x
                else:
                    perceptron.weights -= x
|
||||
\end{lstlisting}
|
||||
\end{algorithm}
|
||||
|
||||
Note that the algorithm converges only if the dataset is linearly separable.
|
||||
In practice, a maximum number of iterations is set.
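
To complement the pseudocode, a runnable numpy sketch (mine) of the same update rule with a maximum number of epochs; labels are assumed to be in $\{-1, +1\}$ and the bias is folded into the weights:
\begin{lstlisting}
import numpy as np

def train_perceptron(X, y, max_epochs=100):
    # X: (N, n) inputs, y: labels in {-1, +1}; a constant 1 is appended for the bias
    Xb = np.hstack([X, np.ones((X.shape[0], 1))])
    w = np.zeros(Xb.shape[1])
    for _ in range(max_epochs):
        errors = 0
        for xi, yi in zip(Xb, y):
            if np.sign(xi @ w) != yi:  # misclassified: move the hyperplane
                w += yi * xi
                errors += 1
        if errors == 0:                # converged (only if linearly separable)
            break
    return w

X = np.array([[2.0, 1.0], [1.0, 3.0], [-1.0, -2.0], [-2.0, -1.0]])
y = np.array([1, 1, -1, -1])
print(train_perceptron(X, y))
\end{lstlisting}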
|
||||
|
||||
|
||||
|
||||
\section{Support vector machine}
|
||||
|
||||
\begin{description}
|
||||
\item[Convex hull]
|
||||
The convex hull of a set of points is the tightest enclosing convex polygon that contains those points.
|
||||
|
||||
Note: the convex hulls of a linearly separable dataset do not intersect.
|
||||
|
||||
\item[Maximum margin hyperplane] \marginnote{Maximum margin hyperplane}
|
||||
Hyperplane with the maximum margin between two convex hulls.
|
||||
|
||||
In general, a subset of points (support vectors) \marginnote{Support vectors}
|
||||
in the training set is sufficient to define the hulls.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.4\textwidth]{img/svm.png}
|
||||
\caption{Maximum margin hyperplane of linearly separable data}
|
||||
\end{figure}
|
||||
|
||||
\item[Support vector machine] \marginnote{Support vector machine}
|
||||
SVM\footnote{\scriptsize\url{https://www.cs.princeton.edu/courses/archive/spring16/cos495/slides/AndrewNg_SVM_note.pdf}}
|
||||
finds the maximum margin hyperplane and the support vectors as a constrained quadratic optimization problem.
|
||||
Given a dataset of $D$ elements and $n$ features, the problem is defined as:
|
||||
\[ \max_{w_0, w_1, \dots, w_n} M \]
|
||||
\[
|
||||
\begin{split}
|
||||
\text{subject to } & \sum_{i=1}^{n} w_i^2 = 1 \\
|
||||
& c_i(w_0 + w_1 x_{i1} + \dots + w_n x_{in}) \geq M \,\, \forall i = 1, \dots, D
|
||||
\end{split}
|
||||
\]
|
||||
where $M$ is the margin, $w_i$ are the weights of the hyperplane and $c_i \in \{-1, 1\}$ is the class.
|
||||
The second constraint requires each point to lie on the correct side of the hyperplane, at distance at least $M$ from it.
For positive labels ($c_i=1$), this holds when $w_0 + w_1 x_{i1} + \dots + w_n x_{in} \geq M$.
For negative labels ($c_i=-1$), this holds when $w_0 + w_1 x_{i1} + \dots + w_n x_{in} \leq -M$.
|
||||
|
||||
\begin{description}
|
||||
\item[Soft margin] \marginnote{Soft margin}
|
||||
As real-world data is not always linearly separable,
|
||||
soft margin relaxes the margin constraint by adding a penalty $C$.
|
||||
The margin constraint becomes:
|
||||
\[ c_i(w_0 + w_1 x_{i1} + \dots + w_n x_{in}) \geq M - \xi_i \,\, \forall i = 1, \dots, D \]
|
||||
\[ \text{where } \xi_i \geq 0 \text{ and } \sum_{i=1}^{D} \xi_i \leq C \]
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Kernel trick}\marginnote{Kernel trick}
|
||||
For non-linearly separable data, the boundary can be found using a non-linear mapping
|
||||
to map the data into a new space (feature space) where a linear separation is possible.
|
||||
Then, the data and the boundary are mapped back into the original space.
|
||||
|
||||
\begin{figure}[h]
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/svm_kernel_example1.png}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.49\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/svm_kernel_example2.png}
|
||||
\end{subfigure}
|
||||
\caption{Example of mapping from $\mathbb{R}^2$ to $\mathbb{R}^3$}
|
||||
\end{figure}
|
||||
|
||||
The kernel trick allows to avoid explicitly mapping the dataset into the new space by using kernel functions.
|
||||
Known kernel functions are:
|
||||
\begin{descriptionlist}
|
||||
\item[Linear] $K(x, y) = \langle x, y \rangle$.
|
||||
\item[Polynomial] $K(x, y) = (\gamma \langle x, y \rangle + r)^d$, where $\gamma$, $r$ and $d$ are parameters.
|
||||
\item[Radial basis function] $K(x, y) = \exp(-\gamma \Vert x - y \Vert^2)$, where $\gamma$ is a parameter.
|
||||
\item[Sigmoid] $K(x, y) = \tanh(\langle x, y \rangle + r)$, where $r$ is a parameter.
|
||||
\end{descriptionlist}
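
As an example (mine), the kernel functions above can be written with numpy as follows:
\begin{lstlisting}
import numpy as np

def linear_kernel(x, y):
    return np.dot(x, y)

def polynomial_kernel(x, y, gamma=1.0, r=0.0, d=3):
    return (gamma * np.dot(x, y) + r) ** d

def rbf_kernel(x, y, gamma=0.5):
    return np.exp(-gamma * np.linalg.norm(x - y) ** 2)

def sigmoid_kernel(x, y, r=0.0):
    return np.tanh(np.dot(x, y) + r)

x = np.array([1.0, 2.0])
y = np.array([0.5, -1.0])
print(linear_kernel(x, y), rbf_kernel(x, y))
\end{lstlisting}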
|
||||
|
||||
|
||||
\subsection{Complexity}
|
||||
Given a dataset with $D$ entries of $n$ features, the complexity of SVM scales from $O(nD^2)$ to $O(nD^3)$
|
||||
depending on the effectiveness of data caching.
|
||||
|
||||
|
||||
\subsection{Characteristics}
|
||||
\begin{itemize}
|
||||
\item Training an SVM model is generally slower.
|
||||
\item SVM is not affected by local minima.
|
||||
\item SVM does not suffer the curse of dimensionality.
|
||||
\item SVM does not directly provide probability estimates.
|
||||
If needed, these can be computed using a computationally expensive method.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
|
||||
\section{Neural networks}
|
||||
|
||||
\begin{description}
|
||||
\item[Multilayer perceptron] \marginnote{Multilayer perceptron}
|
||||
Hierarchical structure of perceptrons, each with an activation function.
|
||||
|
||||
\item[Activation function] \marginnote{Activation function}
|
||||
Activation functions are useful to add non-linearity.
|
||||
|
||||
\begin{remark}
|
||||
In a linear system, if there is noise in the input, it is transferred to the output
|
||||
(i.e. linearity implies that $f(x + \text{noise}) = f(x) + f(\text{noise})$).
|
||||
On the other hand, a non-linear system is generally more robust
|
||||
(i.e. non-linearity generally implies that $f(x + \text{noise}) \neq f(x) + f(\text{noise})$)
|
||||
\end{remark}
|
||||
|
||||
\item[Feedforward neural network] \marginnote{Feedforward neural network}
|
||||
Network with the following flow:
|
||||
\[ \text{Input layer} \rightarrow \text{Hidden layer} \rightarrow \text{Output layer} \]
|
||||
Neurons at each layer are connected to all neurons of the next layer.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Training}
|
||||
Inputs are fed to the network and backpropagation is used to update the weights.
|
||||
|
||||
\begin{description}
|
||||
\item[Learning rate] \marginnote{Learning rate}
|
||||
Size of the step for gradient descent.
|
||||
|
||||
\item[Epoch] \marginnote{Epoch}
|
||||
A round of training where the entire dataset is processed.
|
||||
|
||||
\item[Stopping criteria] \marginnote{Stopping criteria}
|
||||
Possible conditions to stop the training are:
|
||||
\begin{itemize}
|
||||
\item The weight updates become small.
|
||||
\item The classification error goes below a predefined target.
|
||||
\item Timeout or maximum number of epochs.
|
||||
\end{itemize}
|
||||
|
||||
\item[Regularization] \marginnote{Regularization}
|
||||
Smoothing of the loss function (e.g. by penalizing large weights) to limit overfitting.
|
||||
\end{description}
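
A minimal NumPy sketch of the training loop for a one-hidden-layer feedforward network (the layer size, learning rate, number of epochs and stopping threshold are illustrative choices, not values from the course):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def train_mlp(X, y, hidden=8, lr=0.1, max_epochs=1000, tol=1e-5):
    # X: (N, n_features); y: column vector of shape (N, 1) with 0/1 targets.
    # One hidden layer, sigmoid activations, mean squared error loss.
    rng = np.random.default_rng(0)
    W1 = rng.normal(scale=0.1, size=(X.shape[1], hidden)); b1 = np.zeros(hidden)
    W2 = rng.normal(scale=0.1, size=(hidden, 1));          b2 = np.zeros(1)
    prev_loss = np.inf
    for epoch in range(max_epochs):
        # Forward pass over the whole dataset (one epoch).
        A1 = sigmoid(X @ W1 + b1)
        A2 = sigmoid(A1 @ W2 + b2)
        loss = np.mean((A2 - y) ** 2)
        # Backpropagation of the error.
        dZ2 = 2 * (A2 - y) / len(X) * A2 * (1 - A2)
        dW2 = A1.T @ dZ2;  db2 = dZ2.sum(axis=0)
        dZ1 = (dZ2 @ W2.T) * A1 * (1 - A1)
        dW1 = X.T @ dZ1;   db1 = dZ1.sum(axis=0)
        # Gradient descent step (the learning rate controls the step size).
        W1 -= lr * dW1; b1 -= lr * db1
        W2 -= lr * dW2; b2 -= lr * db2
        # Stopping criterion: the loss improvement becomes small.
        if abs(prev_loss - loss) < tol:
            break
        prev_loss = loss
    return W1, b1, W2, b2
\end{verbatim}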
|
||||
|
||||
|
||||
|
||||
\section{K-nearest neighbors}
|
||||
|
||||
\begin{description}
|
||||
\item[K-nearest neighbors] \marginnote{K-nearest neighbors}
|
||||
Given a similarity metric and a training set,
|
||||
to predict a new observation, the $k$ most similar entries in the training set are selected
|
||||
and the class of the new data is determined as the most frequent class among the $k$ entries.
|
||||
\end{description}
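
A minimal NumPy sketch of the procedure, using the Euclidean distance as (dis)similarity metric:
\begin{verbatim}
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=3):
    # Distance of the new observation to every training entry.
    distances = np.linalg.norm(X_train - x_new, axis=1)
    # Select the k most similar (closest) training entries.
    neighbors = np.argsort(distances)[:k]
    # The predicted class is the most frequent one among the k neighbors.
    return Counter(y_train[neighbors]).most_common(1)[0][0]
\end{verbatim}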
|
||||
|
||||
|
||||
|
||||
\section{Binary to multi-class classification}
|
||||
|
||||
\begin{description}
|
||||
\item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
|
||||
Train a classifier for all the possible pairs of classes (this will result in $\frac{C \cdot (C-1)}{2}$ pairs).
|
||||
The class assigned to a new observation is determined through a majority vote.
|
||||
|
||||
\item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
|
||||
Train $C$ classifiers where each is specialized to classify a specific class as positive and the others as negative.
|
||||
The class assigned to a new observation is determined by the confidence score of each classifier.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Ensemble methods}
|
||||
\marginnote{Ensemble methods}
|
||||
Train a set of base classifiers and make predictions by majority vote.
|
||||
If all the base classifiers have the same error rate and make independent errors,
|
||||
the overall error of the ensemble model is lower (it follows a binomial distribution), provided that the base error rate is below $0.5$.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
|
||||
\caption{Relationship between the error of base classifiers and ensemble models}
|
||||
\end{figure}
|
||||
|
||||
Different strategies to train an ensemble classifier can be used:
|
||||
\begin{descriptionlist}
|
||||
\item[Dataset manipulation] Resampling the dataset for each base classifier:
|
||||
\begin{description}
|
||||
\item[Bagging]
|
||||
Sample with replacement with a uniform distribution (see the sketch after this list).
|
||||
\item[Boosting]
|
||||
Iteratively change the distribution of the training data
|
||||
prioritizing examples difficult to classify.
|
||||
\begin{description}
|
||||
\item[Adaboost] \marginnote{Adaboost}
|
||||
Iteratively train base classifiers on a dataset where samples
|
||||
misclassified at the previous iteration have a higher weight.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
\item[Feature manipulation]
|
||||
Train a base classifier using only a subset of the features.
|
||||
|
||||
\item[Class labels manipulation]
|
||||
Train a base classifier to classify a partition of the class labels.
|
||||
For instance, class labels can be partitioned into two groups $A_1$ and $A_2$, and
|
||||
the base classifier is trained to assign as label one of the two groups.
|
||||
During inference, when a group is predicted, all labels within that group receive a vote.
|
||||
\end{descriptionlist}
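
A minimal sketch of bagging with majority vote (assuming NumPy and scikit-learn decision trees as base classifiers; integer class labels are assumed):
\begin{verbatim}
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def bagging_fit(X, y, n_estimators=25, seed=0):
    rng = np.random.default_rng(seed)
    models = []
    for _ in range(n_estimators):
        # Bagging: sample with replacement with a uniform distribution.
        idx = rng.integers(0, len(X), size=len(X))
        models.append(DecisionTreeClassifier().fit(X[idx], y[idx]))
    return models

def bagging_predict(models, X):
    # Majority vote among the base classifiers
    # (assumes non-negative integer class labels).
    votes = np.array([m.predict(X) for m in models])
    return np.array([np.bincount(col).argmax() for col in votes.T])
\end{verbatim}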
|
||||
|
||||
|
||||
\subsection{Random forests}
|
||||
\marginnote{Random forests}
|
||||
|
||||
Multiple decision trees trained on a different random sampling of the training set and different subsets of features.
|
||||
A prediction is made by averaging the output of each tree.
|
||||
|
||||
\begin{description}
|
||||
\item[Bias] \marginnote{Bias}
|
||||
Degree of simplicity of the target function that a model can learn (a high bias leads to underfitting).
|
||||
\item[Variance] \marginnote{Variance}
|
||||
Amount of change of the target function when using different training data (i.e. how much the model overfits).
|
||||
\end{description}
|
||||
|
||||
Random forests aim to reduce the high variance of decision trees.
|
||||
@ -0,0 +1,554 @@
|
||||
\chapter{Clustering}
|
||||
|
||||
|
||||
\section{Similarity and dissimilarity}
|
||||
|
||||
\begin{description}
|
||||
\item[Similarity] \marginnote{Similarity}
|
||||
Measures how alike two objects are.
|
||||
Often defined in the range $[0, 1]$.
|
||||
|
||||
\item[Dissimilarity] \marginnote{Dissimilarity}
|
||||
Measures how two objects differ.
|
||||
0 indicates no difference while the upper bound varies.
|
||||
\end{description}
|
||||
|
||||
\begin{table}[ht]
|
||||
\centering
|
||||
\renewcommand{\arraystretch}{2}
|
||||
\begin{tabular}{c | c | c}
|
||||
\textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
|
||||
\hline
|
||||
Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
|
||||
\hline
|
||||
Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
|
||||
\hline
|
||||
Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
|
||||
\end{tabular}
|
||||
\caption{Similarity and dissimilarity by attribute type}
|
||||
\end{table}
|
||||
|
||||
\begin{description}
|
||||
\item[Similarity properties] \phantom{}
|
||||
\begin{enumerate}
|
||||
\item $\texttt{sim}(p, q) = 1$ iff $p = q$.
|
||||
\item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
|
||||
\end{enumerate}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Distance}
|
||||
|
||||
Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are:
|
||||
\begin{descriptionlist}
|
||||
\item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
|
||||
\[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
|
||||
where $r$ is a parameter.
|
||||
|
||||
Common values for $r$ are:
|
||||
\begin{descriptionlist}
|
||||
\item[$r = 1$]
|
||||
Corresponds to the $L_1$ norm.
|
||||
It is useful for discriminating 0 distance and near-0 distance as
|
||||
an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
|
||||
\item[$r = 2$]
|
||||
Corresponds to the Euclidean distance or $L_2$ norm.
|
||||
\item[$r = \infty$]
|
||||
Corresponds to the $L_\infty$ norm.
|
||||
Considers only the dimension with the maximum difference.
|
||||
\end{descriptionlist}
|
||||
|
||||
\item[Mahalanobis distance] \marginnote{Mahalanobis distance}
|
||||
\[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
|
||||
where $\matr{\Sigma}$ is the covariance matrix of the dataset.
|
||||
The Mahalanobis distance of $p$ and $q$ increases when the segment connecting them
|
||||
points towards a direction of greater variation of the data.
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
|
||||
\caption{The Mahalanobis distance between $(A, B)$ is greater than $(A, C)$, while the Euclidean distance is the same.}
|
||||
\end{figure}
|
||||
\end{descriptionlist}
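
A minimal NumPy sketch of these distance metrics:
\begin{verbatim}
import numpy as np

def minkowski(p, q, r=2):
    # L_r distance; r=1 gives the L_1 norm, r=2 the Euclidean distance.
    return np.sum(np.abs(p - q) ** r) ** (1.0 / r)

def chebyshev(p, q):
    # Limit case r = infinity: only the maximum difference matters.
    return np.max(np.abs(p - q))

def mahalanobis(p, q, data):
    # Sigma is the covariance matrix of the dataset (rows = observations).
    sigma_inv = np.linalg.inv(np.cov(data, rowvar=False))
    diff = p - q
    return np.sqrt(diff @ sigma_inv @ diff)
\end{verbatim}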
|
||||
|
||||
\subsubsection{Distance properties}
|
||||
\begin{descriptionlist}
|
||||
\item[Positive definiteness]
|
||||
$\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
|
||||
\item[Symmetry]
|
||||
$\texttt{dist}(p, q) = \texttt{dist}(q, p)$
|
||||
\item[Triangle inequality]
|
||||
$\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
\subsection{Vector similarity}
|
||||
|
||||
\begin{description}
|
||||
\item[Binary vectors]
|
||||
Given two examples $p$ and $q$ with binary features, we can compute the following values:
|
||||
\[
|
||||
\begin{split}
|
||||
M_{00} &= \text{ number of features equal to 0 for both $p$ and $q$} \\
|
||||
M_{01} &= \text{ number of features equal to 0 for $p$ and 1 for $q$} \\
|
||||
M_{10} &= \text{ number of features equal to 1 for $p$ and 0 for $q$} \\
|
||||
M_{11} &= \text{ number of features equal to 1 for both $p$ and $q$}
|
||||
\end{split}
|
||||
\]
|
||||
Possible distance metrics are:
|
||||
\begin{descriptionlist}
|
||||
\item[Simple matching coefficient] \marginnote{Simple matching coefficient}
|
||||
$\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
|
||||
\item[Jaccard coefficient] \marginnote{Jaccard coefficient}
|
||||
$\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
|
||||
\end{descriptionlist}
|
||||
|
||||
\item[Cosine similarity] \marginnote{Cosine similarity}
|
||||
Cosine of the angle between two vectors:
|
||||
\[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]
|
||||
|
||||
\item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
|
||||
Variation of the Jaccard coefficient for continuous values:
|
||||
\[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
|
||||
\end{description}
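
A minimal NumPy sketch of these measures (binary vectors are assumed to contain 0/1 entries):
\begin{verbatim}
import numpy as np

def smc(p, q):
    # Simple matching coefficient: fraction of matching features.
    return np.mean(p == q)

def jaccard(p, q):
    # Ignores the features that are 0 in both vectors.
    m11 = np.sum((p == 1) & (q == 1))
    mismatches = np.sum(p != q)       # M01 + M10
    return m11 / (m11 + mismatches)

def cosine(p, q):
    return p @ q / (np.linalg.norm(p) * np.linalg.norm(q))

def tanimoto(p, q):
    pq = p @ q
    return pq / (p @ p + q @ q - pq)
\end{verbatim}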
|
||||
|
||||
|
||||
\subsection{Correlation}
|
||||
|
||||
\begin{description}
|
||||
\item[Pearson's correlation] \marginnote{Pearson's correlation}
|
||||
Measure of linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
|
||||
To compute Pearson's correlation, the values of $e_1$ and $e_2$ are first standardized and arranged into the vectors $\vec{e}_1$ and $\vec{e}_2$.
|
||||
The correlation is then computed as the dot product between $\vec{e}_1$ and $\vec{e}_2$:
|
||||
\[ \texttt{corr}(e_1, e_2) = \langle \vec{e}_1, \vec{e}_2 \rangle \]
|
||||
|
||||
Pearson's correlation has the following properties:
|
||||
\begin{itemize}
|
||||
\item If the variables are independent, then the correlation is 0 (but not vice versa).
|
||||
\item If the correlation is 0, then there is no linear relationship between the variables.
|
||||
\item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
|
||||
\end{itemize}
|
||||
|
||||
\item[Symmetric uncertainty] \marginnote{Symmetric uncertainty}
|
||||
Measure of correlation for nominal attributes:
|
||||
\[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
|
||||
where $H$ is the entropy.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
|
||||
\section{Clustering definitions}
|
||||
|
||||
\begin{description}
|
||||
\item[Clustering] \marginnote{Clustering}
|
||||
Given a set of $D$-dimensional objects $\vec{x}_i$,
|
||||
we want to partition them into $K$ clusters (and potentially recognize outliers).
|
||||
In other words, we are looking for a mapping:
|
||||
\[ \texttt{cluster}(\vec{x}_i) \in \{ 1, \dots, K \} \]
|
||||
such that objects in the same cluster are similar.
|
||||
|
||||
\item[Centroid] \marginnote{Centroid}
|
||||
Average of the coordinates of the points in a cluster.
|
||||
For a cluster $K_i$, the $d$-th coordinate of its centroid is given by:
|
||||
\[
|
||||
\texttt{centroid}(K_i)\texttt{[$d$]}
|
||||
= \frac{1}{\vert K_i \vert}
|
||||
\sum_{\vec{x} \in K_i} \vec{x}\texttt{[$d$]}
|
||||
\]
|
||||
|
||||
\item[Medoid] \marginnote{Medoid}
|
||||
Element of the cluster with minimum average dissimilarity to all other points.
|
||||
Unlike the centroid, the medoid must be an existing point of the dataset.
|
||||
|
||||
\item[Proximity functions] \marginnote{Proximity function}
|
||||
Measures to determine the similarity of two data points:
|
||||
\begin{descriptionlist}
|
||||
\item[Euclidean distance] Or, more generally, any of the distance and similarity measures defined in the previous sections.
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Metrics}
|
||||
|
||||
\begin{description}
|
||||
\item[Cohesion] \marginnote{Cohesion}
|
||||
Measures the similarity (proximity) of the objects in the same cluster.
|
||||
Given a cluster $K_i$, cohesion is computed as:
|
||||
\[ \texttt{cohesion}(K_i) = \sum_{\vec{x} \in K_i} \texttt{dist}(\vec{x}, \vec{c}_i) \]
|
||||
where $\vec{c}_i$ can be the centroid or medoid
|
||||
and \texttt{dist} is a proximity function.
|
||||
|
||||
\item[Separation] \marginnote{Separation}
|
||||
Measures the distance of two clusters.
|
||||
Given two clusters $K_i$ and $K_j$, their separation is:
|
||||
\[ \texttt{separation}(K_i, K_j) = \texttt{dist}(\vec{c}_i, \vec{c}_j) \]
|
||||
where $\vec{c}_i$ and $\vec{c}_j$ are respectively the centroids of $K_i$ and $K_j$, and \texttt{dist} is a proximity function.
|
||||
|
||||
\item[Sum of squared errors] \marginnote{Sum of squared errors}
|
||||
Measures, for each cluster, the distance between its points and its centroid.
|
||||
Can be seen as the application of distortion (\Cref{desc:distortion}) to clustering:
|
||||
\[ \texttt{SSE}_j = \sum_{\vec{x}_i \in K_j} \texttt{dist}(\vec{x}_i, \vec{c}_j)^2 \]
|
||||
where $K_j$ is the $j$-th cluster and $\vec{c}_j$ is its centroid.
|
||||
|
||||
If $\texttt{SSE}_j$ is high, the cluster has low quality.
|
||||
If $\texttt{SSE}_j = 0$, all points in the cluster coincide with the centroid.
|
||||
|
||||
The sum of squared errors of $K$ clusters is:
|
||||
\[ \texttt{SSE} = \sum_{j=1}^{K} \texttt{SSE}_j \]
|
||||
|
||||
\item[Sum of squares between clusters] \marginnote{Sum of squares between clusters}
|
||||
Given the global centroid of the dataset $\vec{c}$ and
|
||||
$K$ clusters each with $N_i$ objects,
|
||||
the sum of squares between clusters is given by:
|
||||
\[ \texttt{SSB} = \sum_{i=1}^{K} N_i \cdot \texttt{dist}(\vec{c}_i, \vec{c})^2 \]
|
||||
|
||||
\item[Total sum of squares] \marginnote{Total sum of squares}
|
||||
Sum of the squared distances between the points of the dataset and the global centroid.
|
||||
It can be shown that the total sum of squares can be computed as:
|
||||
\[ \texttt{TSS} = \texttt{SSE} + \texttt{SSB} \]
|
||||
|
||||
\begin{theorem}
|
||||
Minimizing \texttt{SSE} is equivalent to maximizing \texttt{SSB}, since \texttt{TSS} is fixed for a given dataset.
|
||||
\end{theorem}
|
||||
|
||||
\item[Silhouette score] \marginnote{Silhouette score}
|
||||
The Silhouette score of a data point $\vec{x}_i$ belonging to a cluster $K_i$ is given by two components:
|
||||
\begin{description}
|
||||
\item[Sparsity contribution]
|
||||
The average distance of $\vec{x}_i$ to the other points in $K_i$:
|
||||
\[ a(\vec{x}_i) = \frac{1}{\vert K_i \vert - 1} \sum_{\vec{x}_j \in K_i, \vec{x}_j \neq \vec{x}_i} \texttt{dist}(\vec{x}_i, \vec{x}_j) \]
|
||||
|
||||
\item[Separation contribution]
|
||||
The average distance of $\vec{x}_i$ to the points in the nearest cluster:
|
||||
\[ b(\vec{x}_i) = \min_{K_j, K_j \neq K_i} \left( \frac{1}{\vert K_j \vert} \sum_{\vec{w} \in K_j} \texttt{dist}(\vec{x}_i, \vec{w}) \right) \]
|
||||
\end{description}
|
||||
The Silhouette score of $\vec{x}_i$ is then computed as:
|
||||
\[ s(\vec{x}_i) = \frac{b(\vec{x}_i) - a(\vec{x}_i)}{\max\{ a(\vec{x}_i), b(\vec{x}_i) \}} \in [-1, 1] \]
|
||||
|
||||
The Silhouette score $\mathcal{S}$ of $K$ clusters is given by the average Silhouette scores of each data point.
|
||||
$\mathcal{S} \rightarrow 1$ indicates correct clusters, $\mathcal{S} \rightarrow -1$ indicates incorrect clusters.
|
||||
|
||||
\item[Golden standard] \marginnote{Golden standard}
|
||||
Evaluation using a labeled dataset.
|
||||
Consider the elements of the same cluster as labeled with the same class.
|
||||
|
||||
\begin{description}
|
||||
\item[Classification-oriented]
|
||||
Traditional classification metrics such as accuracy, recall, precision, \dots
|
||||
|
||||
\item[Similarity-oriented]
|
||||
Given a learnt clustering scheme $y_K(\cdot)$ and the golden standard scheme $y_G(\cdot)$ where
|
||||
$y_i(\vec{x})$ indicates the label/cluster of $\vec{x}$, each pair of data $(\vec{x}_1, \vec{x}_2)$ can be labeled with:
|
||||
\begin{descriptionlist}
|
||||
\item[\texttt{SGSK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
|
||||
\item[\texttt{SGDK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
|
||||
\item[\texttt{DGSK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
|
||||
\item[\texttt{DGDK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
|
||||
\end{descriptionlist}
|
||||
Then, the following metrics can be computed:
|
||||
\begin{descriptionlist}
|
||||
\item[Rand score] $\frac{\texttt{SGSK} + \texttt{DGDK}}{\texttt{SGSK} + \texttt{SGDK} + \texttt{DGSK} + \texttt{DGDK}}$
|
||||
\item[Adjusted Rand score] Modification of the Rand score to take into account that some agreements may happen by chance.
|
||||
\item[Jaccard coefficient] For each class $c$, the Jaccard coefficient is given by:
|
||||
\[ \frac{\texttt{SG$_c$SK$_c$}}{\texttt{SG$_c$SK$_c$} + \texttt{SG$_c$DK$_c$} + \texttt{DG$_c$SK$_c$}} \]
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
\end{description}
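
A minimal NumPy sketch computing \texttt{SSE}, \texttt{SSB} and \texttt{TSS} for a given clustering (the Silhouette score is also available off the shelf as \texttt{sklearn.metrics.silhouette\_score}):
\begin{verbatim}
import numpy as np

def sse_ssb_tss(X, labels):
    # X: (N, D) dataset; labels: cluster index of each point.
    global_centroid = X.mean(axis=0)
    sse, ssb = 0.0, 0.0
    for k in np.unique(labels):
        cluster = X[labels == k]
        centroid = cluster.mean(axis=0)
        sse += np.sum(np.linalg.norm(cluster - centroid, axis=1) ** 2)
        ssb += len(cluster) * np.linalg.norm(centroid - global_centroid) ** 2
    tss = np.sum(np.linalg.norm(X - global_centroid, axis=1) ** 2)
    return sse, ssb, tss   # tss == sse + ssb (up to rounding)
\end{verbatim}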
|
||||
|
||||
|
||||
|
||||
\section{K-means}
|
||||
|
||||
\begin{description}
|
||||
\item[Algorithm] \marginnote{K-means}
|
||||
Clustering algorithm that iteratively improves the centroids.
|
||||
Given the desired number of clusters $K$, the algorithm works as follows:
|
||||
\begin{enumerate}
|
||||
\item Randomly choose $K$ initial centroids.
|
||||
\item Each data point belongs to the cluster represented by the nearest centroid.
|
||||
\item Recompute each centroid as the centroid of its newly formed cluster. Go to 2. until the assignments no longer change.
|
||||
\end{enumerate}
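
A minimal NumPy sketch of the algorithm (it assumes no cluster becomes empty during the iterations):
\begin{verbatim}
import numpy as np

def kmeans(X, k, max_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # 1. Randomly choose k initial centroids among the data points.
    centroids = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(max_iter):
        # 2. Assign each point to the cluster of the nearest centroid.
        distances = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = distances.argmin(axis=1)
        # 3. Recompute each centroid as the mean of its cluster.
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels
\end{verbatim}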
|
||||
|
||||
\item[Distortion] \label{desc:distortion} \marginnote{Distortion}
|
||||
Given:
|
||||
\begin{itemize}
|
||||
\item a $D$-dimensional dataset of $N$ points $\vec{x}_i$;
|
||||
\item an encoding function $\texttt{encode}: \mathbb{R}^D \rightarrow [1, K]$;
|
||||
\item a decoding function $\texttt{decode}: [1, K] \rightarrow \mathbb{R}^D$.
|
||||
\end{itemize}
|
||||
Distortion (or inertia) is defined as:
|
||||
\[ \texttt{distortion} = \sum_{i=1}^{N} \big\Vert \vec{x}_i - \texttt{decode}(\texttt{encode}(\vec{x}_i)) \big\Vert^2 \]
|
||||
|
||||
\begin{theorem}
|
||||
To minimize the distortion, it is required that:
|
||||
\begin{enumerate}
|
||||
\item $\vec{x}_i$ is encoded with its nearest center.
|
||||
\item Each center is the centroid of the points encoded with it (i.e. of its cluster).
|
||||
\end{enumerate}
|
||||
|
||||
Note that k-means alternates points 1 and 2.
|
||||
|
||||
\begin{proof}
|
||||
The second point is derived by setting the derivative of \texttt{distortion} to zero.
|
||||
\end{proof}
|
||||
\end{theorem}
|
||||
|
||||
\item[Elbow method]
|
||||
Inertia decreases monotonically as $K$ increases and can be used to determine a suitable number of clusters.
|
||||
By computing the inertia for varying $K$, a plausible value is the one around which the slope flattens (the elbow).
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.4\textwidth]{img/elbow_method.png}
|
||||
\caption{Plot of inertia. Possibly good values for $K$ are around 3}
|
||||
\end{figure}
|
||||
|
||||
The Silhouette score can also be used by selecting the $K$ corresponding to its maximum.
|
||||
Note that, compared to inertia, Silhouette is computationally more expensive.
|
||||
|
||||
\item[Properties] \phantom{}
|
||||
\begin{description}
|
||||
\item[Termination]
|
||||
There are a finite number of ways to cluster $N$ objects into $K$ clusters.
|
||||
By construction, at each iteration, the \texttt{distortion} is reduced.
|
||||
Therefore, k-means is guaranteed to terminate.
|
||||
|
||||
\item[Non-optimality]
|
||||
The solution found by k-means is not guaranteed to be a global best.
|
||||
The choice of starting points heavily influences the final result.
|
||||
The starting configuration is usually composed of points as far apart as possible.
|
||||
|
||||
\item[Noise]
|
||||
Outliers heavily influence the clustering result. Sometimes, it is useful to remove them.
|
||||
|
||||
\item[Complexity]
|
||||
Given a $D$-dimensional dataset of $N$ points,
|
||||
running k-means for $T$ iterations to find $K$ clusters has complexity $O(TKND)$.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Hierarchical clustering}
|
||||
|
||||
\begin{description}
|
||||
\item[Dendrogram] \marginnote{Dendrogram}
|
||||
Tree-like structure where the root is a cluster of all the data points and
|
||||
the leaves are clusters with a single data point.
|
||||
|
||||
\item[Agglomerative] \marginnote{Agglomerative}
|
||||
Starts with a cluster per data point and iteratively merges them (leaves to root).
|
||||
Uses cluster separation metrics.
|
||||
|
||||
\item[Divisive] \marginnote{Divisive}
|
||||
Starts with a cluster containing all the data points and iteratively splits them (root to leaves).
|
||||
Uses cluster cohesion metrics.
|
||||
|
||||
\item[Cluster separation measures]
|
||||
Measure the distance between two clusters $K_i$ and $K_j$.
|
||||
\begin{descriptionlist}
|
||||
\item[Single link] \marginnote{Single link}
|
||||
Minimum distance of the points in the two clusters:
|
||||
\[ \texttt{sep}(K_i, K_j) = \min_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
|
||||
Tends to create larger clusters.
|
||||
|
||||
\item[Complete link] \marginnote{Complete link}
|
||||
Maximum distance of the points in the two clusters:
|
||||
\[ \texttt{sep}(K_i, K_j) = \max_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
|
||||
Tends to create more compact clusters.
|
||||
|
||||
\item[Average link] \marginnote{Average link}
|
||||
Average distance of the points in the two clusters:
|
||||
\[ \texttt{sep}(K_i, K_j) = \frac{1}{\vert K_i \vert \cdot \vert K_j \vert} \sum_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
|
||||
|
||||
\item[Centroid-based] \marginnote{Centroid-based}
|
||||
Distance between the centroids of the two clusters.
|
||||
|
||||
\item[Ward's method] \marginnote{Ward's method}
|
||||
Let $K_m$ be the cluster obtained by merging $K_i$ and $K_j$.
|
||||
The distance between $K_i$ and $K_j$ is determined as:
|
||||
\[ \texttt{sep}(K_i, K_j) = \texttt{SSE}(K_m) - \big( \texttt{SSE}(K_i) + \texttt{SSE}(K_j) \big) \]
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Agglomerative clustering}
|
||||
|
||||
\begin{description}
|
||||
\item[Algorithm] \marginnote{Agglomerative clustering} \phantom{}
|
||||
\begin{enumerate}
|
||||
\item Initialize a cluster for each data point.
|
||||
\item Compute the distance matrix between each pair of clusters.
|
||||
\item Merge the two clusters with the lowest separation,
|
||||
drop their values from the distance matrix and add a row/column for the newly created cluster.
|
||||
\item Go to point 2. if the number of clusters is greater than one.
|
||||
\end{enumerate}
|
||||
|
||||
After the construction of the dendrogram, a cut \marginnote{Cut} can be performed at a user-defined level.
|
||||
A cut near the root will result in few bigger clusters.
|
||||
A cut near the leaves will result in numerous smaller clusters.
|
||||
|
||||
|
||||
\item[Properties] \phantom{}
|
||||
\begin{description}
|
||||
\item[Complexity]
|
||||
Space complexity of $O(N^2)$ to store the distance matrix.
|
||||
|
||||
Time complexity of $O(N^3)$ ($O(N)$ iterations with a $O(N^2)$ search for the pair to merge and $O(N)$ to recompute the distance matrix)
|
||||
that can be reduced to $O(N^2\log(N))$ when using indexing.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Density-based clustering}
|
||||
|
||||
Consider as clusters the high-density areas of the data space.
|
||||
|
||||
\begin{description}
|
||||
\item[Grid-based]
|
||||
Split the data space into a grid and count the number of points in each tile.
|
||||
|
||||
\item[Object-centered]
|
||||
Count, for each point, the number of neighbors within a radius.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{DBSCAN}
|
||||
|
||||
\begin{description}
|
||||
\item[Neighborhood] \marginnote{Neighborhood}
|
||||
Given a radius $\varepsilon$, the neighborhood of a point $\vec{x}$ are the points within an $\varepsilon$-sphere centered on $\vec{x}$.
|
||||
|
||||
\item[Core point] \marginnote{Core point}
|
||||
Given a minimum number of neighbors $m$,
|
||||
a point $\vec{x}$ is a core point if it has at least $m$ neighbors.
|
||||
|
||||
\item[Border point] \marginnote{Border point}
|
||||
A point $\vec{x}$ is a border point if it is not a core point.
|
||||
|
||||
\item[Directly density reachable] \marginnote{Directly density reachable}
|
||||
A point $\vec{p}$ is directly density reachable from $\vec{q}$ iff:
|
||||
\begin{itemize}
|
||||
\item $\vec{q}$ is a core point.
|
||||
\item $\vec{q}$ is a neighbor of $\vec{p}$.
|
||||
\end{itemize}
|
||||
|
||||
\item[Density reachable] \marginnote{Density reachable}
|
||||
A point $\vec{p}$ is density reachable from $\vec{q}$ iff:
|
||||
\begin{itemize}
|
||||
\item $\vec{q}$ is a core point.
|
||||
\item There exists a sequence of points $\vec{s}_1, \dots, \vec{s}_z$ such that:
|
||||
\begin{itemize}
|
||||
\item $\vec{s}_1$ is directly density reachable from $\vec{q}$.
|
||||
\item $\vec{s}_{i+1}$ is directly density reachable from $\vec{s}_i$.
|
||||
\item $\vec{p}$ is directly density reachable from $\vec{s}_z$.
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
\item[Density connected] \marginnote{Density connected}
|
||||
A point $\vec{p}$ is density connected to $\vec{q}$ iff there exists a point $\vec{s}$
|
||||
such that both $\vec{p}$ and $\vec{q}$ are density reachable from $\vec{s}$.
|
||||
|
||||
\item[Algorithm] \marginnote{DBSCAN}
|
||||
Determine clusters as maximal sets of density connected points.
|
||||
Border points not density connected to any core point are labeled as noise.
|
||||
|
||||
In other words, what happens is the following:
|
||||
\begin{itemize}
|
||||
\item Neighboring core points are part of the same cluster.
|
||||
\item Border points are part of the cluster of their nearest core point neighbor.
|
||||
\item Border points without a core point neighbor are noise.
|
||||
\end{itemize}
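
A minimal sketch using scikit-learn's implementation (the values of \texttt{eps} and \texttt{min\_samples}, and the data, are illustrative):
\begin{verbatim}
import numpy as np
from sklearn.cluster import DBSCAN

X = np.random.default_rng(0).normal(size=(200, 2))
# eps is the neighborhood radius, min_samples the minimum number of neighbors m.
model = DBSCAN(eps=0.3, min_samples=5).fit(X)
labels = model.labels_   # cluster index per point; -1 marks noise points
\end{verbatim}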
|
||||
|
||||
\item[Properties] \phantom{}
|
||||
\begin{description}
|
||||
\item[Robustness]
|
||||
Able to find clusters of any shape and detect noise.
|
||||
|
||||
\item[Hyperparameters]
|
||||
Sensitive to the choice of the radius $\varepsilon$ and minimum number of neighbors $m$.
|
||||
|
||||
\begin{description}
|
||||
\item[K-distance method] \phantom{}
|
||||
\begin{enumerate}
|
||||
\item Determine for each point its $k$-distance, i.e. the distance to its $k$-th nearest neighbor.
|
||||
\item Sort the points by decreasing $k$-distance and plot them.
|
||||
\item Use as possible $\varepsilon$ the values around the area where the slope decreases (similarly to the elbow method).
|
||||
\end{enumerate}
|
||||
\end{description}
|
||||
|
||||
\item[Complexity]
|
||||
Complexity of $O(N^2)$, reduced to $O(N \log N)$ if using spatial indexing.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{DENCLUE}
|
||||
|
||||
\begin{description}
|
||||
\item[Kernel density estimation] \marginnote{Kernel density estimation}
|
||||
Statistical method to estimate the distribution of a dataset through a function.
|
||||
|
||||
\begin{description}
|
||||
\item[Kernel function] \marginnote{Kernel function}
|
||||
Symmetric and monotonically decreasing function to describe the influence of a data point on its neighbors.
|
||||
|
||||
A typical kernel function is the Gaussian.
|
||||
|
||||
\item[Overall density function]
|
||||
The overall density of the dataset is obtained as the sum of the kernel function evaluated at each data point.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.35\textwidth]{img/kernel_density_estimation.png}
|
||||
\caption{Example of density function from a set of points (top right) using a Gaussian kernel}
|
||||
\label{img:denclue}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
\item[Algorithm] \marginnote{DENCLUE}
|
||||
Given a threshold $\xi$, DENCLUE works as follows:
|
||||
\begin{enumerate}
|
||||
\item Derive a density function of the dataset.
|
||||
\item Identify local maxima and consider them as density attractors.
|
||||
\item Associate each data point with the density attractor reached by moving in the direction of maximum density increase.
|
||||
\item Points associated with the same density attractor are part of the same cluster.
|
||||
\item Remove clusters whose density attractor has a density lower than $\xi$.
|
||||
\item Merge clusters connected through a path of points whose density is greater or equal to $\xi$
|
||||
(e.g. in \Cref{img:denclue} the center area will result in many small clusters that can be merged with an appropriate $\xi$).
|
||||
\end{enumerate}
|
||||
|
||||
\item[Properties] \phantom{}
|
||||
\begin{description}
|
||||
\item[Robustness]
|
||||
Able to recognize clusters of different shapes and handle noise.
|
||||
|
||||
\item[High dimension weakness]
|
||||
Does not perform well with high-dimensional data with different densities.
|
||||
|
||||
\item[Complexity]
|
||||
Computational complexity of $O(N^2)$.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Model-based clustering}
|
||||
|
||||
Assuming that the attributes are independent random variables,
|
||||
model-based clustering finds a set of distributions (one per cluster) that describe the data.
|
||||
|
||||
|
||||
\subsection*{Gaussian mixture (expectation maximization)}
|
||||
|
||||
\begin{description}
|
||||
\item[Algorithm] \phantom{} \marginnote{Gaussian mixture}
|
||||
\begin{enumerate}
|
||||
\item Select an initial set of parameters for the distributions.
|
||||
\item Expectation step: for each data point, compute the probability that it belongs to each distribution.
|
||||
\item Maximization step: update the parameters to maximize the likelihood (i.e. move each Gaussian towards the center of its cluster).
|
||||
\item Go to point 2. until convergence.
|
||||
\end{enumerate}
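
A minimal sketch using scikit-learn's implementation (the data and the number of components are illustrative):
\begin{verbatim}
import numpy as np
from sklearn.mixture import GaussianMixture

X = np.random.default_rng(0).normal(size=(300, 2))
# One Gaussian per cluster, fitted with expectation maximization.
gmm = GaussianMixture(n_components=3, random_state=0).fit(X)
hard_labels = gmm.predict(X)        # most likely cluster per point
soft_labels = gmm.predict_proba(X)  # probability of belonging to each cluster
\end{verbatim}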
|
||||
\end{description}
|
||||
@ -0,0 +1,57 @@
|
||||
\chapter{CRISP-DM}
|
||||
|
||||
\begin{description}
|
||||
\item[\Acl{crisp}] \marginnote{\acs{crisp}}
|
||||
Standardized process for data mining.
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.45\textwidth]{img/crisp.png}
|
||||
\caption{\ac{crisp} workflow}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Business understanding}
|
||||
\begin{itemize}
|
||||
\item Determine the objective and the success criteria.
|
||||
\marginnote{Business understanding}
|
||||
\item Feasibility study.
|
||||
\item Produce a plan.
|
||||
\end{itemize}
|
||||
|
||||
\section{Data understanding}
|
||||
\begin{itemize}
|
||||
\item Determine the available (raw) data.
|
||||
\marginnote{Data understanding}
|
||||
\item Determine the cost of the data.
|
||||
\item Collect, describe, explore and verify data.
|
||||
\end{itemize}
|
||||
|
||||
\section{Data preparation}
|
||||
\begin{itemize}
|
||||
\item Data cleaning.
|
||||
\marginnote{Data preparation}
|
||||
\item Data transformations.
|
||||
\end{itemize}
|
||||
|
||||
\section{Modeling}
|
||||
\begin{itemize}
|
||||
\item Select modeling technique.
|
||||
\marginnote{Modeling}
|
||||
\item Build/train the model.
|
||||
\end{itemize}
|
||||
|
||||
\section{Evaluation}
|
||||
\begin{itemize}
|
||||
\item Evaluate results.
|
||||
\marginnote{Evaluation}
|
||||
\item Review process.
|
||||
\end{itemize}
|
||||
|
||||
\section{Deployment}
|
||||
\begin{itemize}
|
||||
\item Plan deployment.
|
||||
\marginnote{Deployment}
|
||||
\item Plan monitoring and maintenance.
|
||||
\item Final report and review.
|
||||
\end{itemize}
|
||||
@ -0,0 +1,206 @@
|
||||
\chapter{Data lake}
|
||||
|
||||
\begin{description}
|
||||
\item[Dark data] \marginnote{Dark data}
|
||||
Acquired and stored data that are never used for decision-making processes.
|
||||
|
||||
\item[Data lake] \marginnote{Data lake}
|
||||
Repository to store raw (unstructured) data.
|
||||
It has the following features:
|
||||
\begin{itemize}
|
||||
\item Does not enforce a schema on write.
|
||||
\item Allows flexible access and applies schemas on read.
|
||||
\item Single source of truth.
|
||||
\item Low cost and scalable.
|
||||
\end{itemize}
|
||||
|
||||
\item[Storage]
|
||||
Stored data can be classified as:
|
||||
\begin{descriptionlist}
|
||||
\item[Hot] \marginnote{Hot storage}
|
||||
A low volume of highly requested data that requires low latency.
|
||||
More expensive HW/SW.
|
||||
\item[Cold] \marginnote{Cold storage}
|
||||
A large amount of data that does not have latency requirements.
|
||||
Less expensive.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.5\textwidth]{img/_storage.pdf}
|
||||
\caption{Data storage technologies}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Traditional vs insight-driven data systems}
|
||||
\begin{tabular}{c | p{0.4\textwidth} | p{0.4\textwidth}}
|
||||
& \textbf{\makecell[c]{Traditional (data warehouse)}} & \textbf{\makecell[c]{Insight-driven (data lake)}} \\
|
||||
\hline
|
||||
\textbf{Sources} & Structured data & Structured, semi-structured and unstructured data \\
|
||||
\hline
|
||||
\textbf{Storage} & Limited ingestion and storage capability & Virtually unlimited ingestion and storage capability \\
|
||||
\hline
|
||||
\textbf{Schema} & Schema designed upfront & Schema not fixed \\
|
||||
\hline
|
||||
\textbf{Transformations} & \ac{etl} upfront & Transformations on query \\
|
||||
\hline
|
||||
\textbf{Analytics} & SQL, \ac{bi} tools, full-text search & Traditional methods, self-service \ac{bi}, big data, machine learning, \dots \\
|
||||
\hline
|
||||
\textbf{Price} & High storage cost & Low storage cost \\
|
||||
\hline
\textbf{Performance} & Fast queries & Scalability/speed/cost tradeoffs \\
|
||||
\hline
|
||||
\textbf{Quality} & High data quality & Depends on the use case \\
|
||||
\end{tabular}
|
||||
|
||||
|
||||
\section{Data architecture evolution}
|
||||
\begin{description}
|
||||
\item[Traditional data warehouse] \marginnote{Traditional data warehouse}
|
||||
(i.e. in-house data warehouse)
|
||||
\begin{itemize}
|
||||
\item Structured data with predefined schemas.
|
||||
\item High setup and maintenance cost. Not scalable.
|
||||
\item Relational high-quality data.
|
||||
\item Slow data ingestion.
|
||||
\end{itemize}
|
||||
|
||||
\item[Modern cloud data warehouse] \marginnote{Modern cloud data warehouse}
|
||||
\phantom{}
|
||||
\begin{itemize}
|
||||
\item Structured and semi-structured data.
|
||||
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
|
||||
\item Relational high-quality data and mixed data.
|
||||
\item Fast data ingestion if supported.
|
||||
\end{itemize}
|
||||
|
||||
\item[On-premise big data] \marginnote{On-premise big data}
|
||||
(i.e. in-house data lake)
|
||||
\begin{itemize}
|
||||
\item Any type of data with schemas on read.
|
||||
\item High setup and maintenance cost.
|
||||
\item Fast data ingestion.
|
||||
\end{itemize}
|
||||
|
||||
\item[Cloud data lake] \marginnote{Cloud data lake}
|
||||
\phantom{}
|
||||
\begin{itemize}
|
||||
\item Any type of data with schemas on read.
|
||||
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
|
||||
\item Fast data ingestion.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Components}
|
||||
|
||||
\subsection{Data ingestion}
|
||||
\begin{descriptionlist}
|
||||
\item[Workload migration] \marginnote{Data ingestion}
|
||||
Inserting all the data from an existing source.
|
||||
\item[Incremental ingestion]
|
||||
Inserting changes since the last ingestion.
|
||||
\item[Streaming ingestion]
|
||||
Continuously inserting data.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{description}
|
||||
\item[\Acl{cdc} (\Acs{cdc})] \marginnote{\Acl{cdc} (\Acs{cdc})}
|
||||
Mechanism to detect changes and insert the new data into the data lake (possibly in real-time).
|
||||
\end{description}
|
||||
|
||||
\subsection{Storage}
|
||||
\begin{descriptionlist}
|
||||
\item[Raw] \marginnote{Raw storage}
|
||||
Immutable data useful for disaster recovery.
|
||||
\item[Optimized] \marginnote{Optimized storage}
|
||||
Optimized raw data for faster query.
|
||||
\item[Analytics] \marginnote{Analytics storage}
|
||||
Ready to use data.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{description}
|
||||
\item[Columnar storage] \phantom{}
|
||||
\begin{itemize}
|
||||
\item Homogeneous data are stored contiguously.
|
||||
\item Speeds up methods that process entire columns (i.e. all the values of a feature).
|
||||
\item Insertion becomes slower.
|
||||
\end{itemize}
|
||||
|
||||
\item[Data catalog]
|
||||
Methods to add descriptive metadata to a data lake.
|
||||
This is useful to prevent an unorganized data lake (data swamp).
|
||||
\end{description}
|
||||
|
||||
\subsection{Processing and analytics}
|
||||
\begin{descriptionlist}
|
||||
\item[Interactive analytics] \marginnote{Processing and analytics}
|
||||
Interactive queries to large volumes of data.
|
||||
The results are stored back in the data lake.
|
||||
\item[Big data analytics]
|
||||
Data aggregations and transformations.
|
||||
\item[Real-time analytics]
|
||||
Streaming analysis.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
\section{Architectures}
|
||||
|
||||
\subsection{Lambda lake}
|
||||
\begin{description}
|
||||
\item[Batch layer] \marginnote{Lambda lake}
|
||||
Receives and stores the data. Prepares the batch views for the serving layer.
|
||||
\item[Serving layer]
|
||||
Indexes batch views for faster queries.
|
||||
\item[Speed layer]
|
||||
Receives the data and prepares real-time views. The views are also stored in the serving layer.
|
||||
\end{description}
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.5\textwidth]{img/lambda_lake.png}
|
||||
\caption{Lambda lake architecture}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Kappa lake}
|
||||
\marginnote{Kappa lake}
|
||||
The data are stored in a long-term store.
|
||||
Computations only happen in the speed layer (avoids lambda lake redundancy between batch layer and speed layer).
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.5\textwidth]{img/kappa_lake.png}
|
||||
\caption{Kappa lake architecture}
|
||||
\end{figure}
|
||||
|
||||
\subsection{Delta lake}
|
||||
\marginnote{Delta lake}
|
||||
Framework that adds features on top of an existing data lake.
|
||||
\begin{itemize}
|
||||
\item ACID transactions
|
||||
\item Scalable metadata handling
|
||||
\item Data versioning
|
||||
\item Unified batch and streaming
|
||||
\item Schema enforcement
|
||||
\end{itemize}
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.7\textwidth]{img/delta_lake.png}
|
||||
\caption{Delta lake architecture}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\section{Metadata}
|
||||
\marginnote{Metadata}
|
||||
Metadata is used to organize a data lake.
|
||||
Useful metadata are:
|
||||
\begin{descriptionlist}
|
||||
\item[Source] Origin of the data.
|
||||
\item[Schema] Structure of the data.
|
||||
\item[Format] File format or encoding.
|
||||
\item[Quality metrics] (e.g. percentage of missing values).
|
||||
\item[Lifecycle] Retention policies and archiving rules.
|
||||
\item[Ownership]
|
||||
\item[Lineage] History of applied transformations or dependencies.
|
||||
\item[Access control]
|
||||
\item[Classification] Sensitivity level of the data.
|
||||
\item[Usage information] Record of who accessed the data and how it is used.
|
||||
\end{descriptionlist}
|
||||
@ -0,0 +1,163 @@
|
||||
\chapter{Data preprocessing}
|
||||
|
||||
\section{Aggregation}
|
||||
\marginnote{Aggregation}
|
||||
|
||||
Combining multiple attributes into a single one.
|
||||
Useful for:
|
||||
\begin{descriptionlist}
|
||||
\item[Data reduction]
|
||||
Reduce the number of attributes.
|
||||
|
||||
\item[Change of scale]
|
||||
View the data in a more general level of detail (e.g. from cities and regions to countries).
|
||||
|
||||
\item[Data stability]
|
||||
Aggregated data tend to have less variability.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
\section{Sampling}
|
||||
\marginnote{Sampling}
|
||||
Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
|
||||
Obviously, a sample has to be representative.
|
||||
|
||||
The types of sampling techniques are:
|
||||
\begin{descriptionlist}
|
||||
\item[Simple random] \marginnote{Simple random}
|
||||
Extraction of a single element following a given probability distribution.
|
||||
|
||||
\item[With replacement] \marginnote{With replacement}
|
||||
Multiple extractions with repetitions following a given probability distribution
|
||||
(i.e. multiple simple random extractions).
|
||||
|
||||
If the population is small, the same element may be extracted multiple times and the sample may not represent the population well.
|
||||
|
||||
\item[Without replacement] \marginnote{Without replacement}
|
||||
Multiple extractions without repetitions following a given probability distribution.
|
||||
|
||||
\item[Stratified] \marginnote{Stratified}
|
||||
Split the data and sample from each partition.
|
||||
Useful when each partition is internally homogeneous.
|
||||
\end{descriptionlist}
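
A minimal NumPy sketch of the sampling schemes above (population and sample sizes are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
data = np.arange(1000)

with_replacement = rng.choice(data, size=100, replace=True)
without_replacement = rng.choice(data, size=100, replace=False)

# Stratified: split the data into partitions (strata) and sample from each one.
strata = np.array_split(data, 4)
stratified = np.concatenate([rng.choice(s, size=25, replace=False) for s in strata])
\end{verbatim}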
|
||||
|
||||
\begin{description}
|
||||
\item[Sample size]
|
||||
The sampling size represents a tradeoff between data reduction and precision.
|
||||
In a labeled dataset, it is important to consider the probability of sampling data from all the possible classes.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Dimensionality reduction}
|
||||
|
||||
\begin{description}
|
||||
\item[Curse of dimensionality] \marginnote{Curse of dimensionality}
|
||||
Data with a high number of dimensions result in a sparse feature space
|
||||
where distance metrics are ineffective.
|
||||
|
||||
\item[Dimensionality reduction] \marginnote{Dimensionality reduction}
|
||||
Useful to:
|
||||
\begin{itemize}
|
||||
\item Avoid the curse of dimensionality.
|
||||
\item Reduce noise.
|
||||
\item Reduce the time and space complexity of mining and learning algorithms.
|
||||
\item Visualize multi-dimensional data.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
\subsection{Principal component analysis} \marginnote{PCA}
|
||||
Projection of the data into a lower-dimensional space that maximizes the variance of the data.
|
||||
It can be proven that this problem can be solved by finding the eigenvectors of the covariance matrix of the data.
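A minimal NumPy sketch of this procedure (eigendecomposition of the covariance matrix of the centered data):
\begin{verbatim}
import numpy as np

def pca(X, n_components):
    # Center the data and compute the covariance matrix.
    X_centered = X - X.mean(axis=0)
    cov = np.cov(X_centered, rowvar=False)
    # Eigenvectors of the covariance matrix = directions of maximum variance.
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    order = np.argsort(eigenvalues)[::-1][:n_components]
    components = eigenvectors[:, order]
    # Project the data onto the lower-dimensional space.
    return X_centered @ components
\end{verbatim}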
|
||||
|
||||
\subsection{Feature subset selection} \marginnote{Feature subset selection}
|
||||
Local technique to reduce dimensionality by:
|
||||
\begin{itemize}
|
||||
\item Removing redundant attributes.
|
||||
\item Removing irrelevant attributes.
|
||||
\end{itemize}
|
||||
|
||||
This can be achieved by:
|
||||
\begin{descriptionlist}
|
||||
\item[Brute force]
|
||||
Try all the possible subsets of the dataset.
|
||||
|
||||
\item[Embedded approach]
|
||||
Feature selection is naturally done by the learning algorithm (e.g. decision trees).
|
||||
|
||||
\item[Filter approach]
|
||||
Features are filtered using domain-specific knowledge.
|
||||
|
||||
\item[Wrapper approaches]
|
||||
A mining algorithm is used to select the best features.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
|
||||
\section{Feature creation}
|
||||
\marginnote{Feature creation}
|
||||
Useful to help a learning algorithm capture data characteristics.
|
||||
Possible approaches are:
|
||||
\begin{descriptionlist}
|
||||
\item[Feature extraction]
|
||||
Features extracted from the existing ones (e.g. from a picture of a face, the eye distance can be a new feature).
|
||||
|
||||
\item[Mapping]
|
||||
Projecting the data into a new feature space.
|
||||
|
||||
\item[New features]
|
||||
Add new, possibly redundant, features.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
\section{Data type conversion}
|
||||
|
||||
\subsection{One-hot encoding} \marginnote{One-hot encoding}
|
||||
A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
|
||||
$n$ new binary features $H_{e_1}, \dots, H_{e_n}$ each corresponding to a value of $E$.
|
||||
For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the others are \texttt{false}.
|
||||
|
||||
\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
|
||||
A feature whose values have an ordering can be converted into a consecutive sequence of integers
|
||||
(e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).
|
||||
|
||||
\subsection{Discretization} \marginnote{Discretization}
|
||||
Convert a continuous feature to a discrete one.
|
||||
\begin{description}
|
||||
\item[Binarization] \marginnote{Binarization}
|
||||
Given a continuous feature and a threshold,
|
||||
it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.
|
||||
|
||||
\item[Thresholding] \marginnote{Thresholding}
|
||||
Same as binarization but using multiple thresholds.
|
||||
|
||||
\item[K-bins] \marginnote{K-bins}
|
||||
A continuous feature is discretized using $k$ bins each representing an integer from $0$ to $k-1$.
|
||||
\end{description}
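
A minimal sketch of these conversions using scikit-learn (the example values are made up):
\begin{verbatim}
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, KBinsDiscretizer

# One-hot encoding: one binary column per unique value.
colors = np.array([['red'], ['green'], ['red'], ['blue']])
onehot = OneHotEncoder().fit_transform(colors).toarray()

# Ordinal encoding: map ordered values to consecutive integers.
ratings = np.array([['bad'], ['neutral'], ['good']])
ordinal = OrdinalEncoder(categories=[['bad', 'neutral', 'good']]).fit_transform(ratings)

# K-bins discretization: continuous values mapped to bin indices 0..k-1.
ages = np.array([[3.0], [17.0], [25.0], [64.0]])
bins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform').fit_transform(ages)
\end{verbatim}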
|
||||
|
||||
|
||||
|
||||
\section{Attribute transformation}
|
||||
Useful for normalizing features with different scales and outliers.
|
||||
|
||||
\begin{description}
|
||||
\item[Mapping] \marginnote{Mapping}
|
||||
Map the domain of a feature into a new set of values (i.e. apply a function).
|
||||
|
||||
\item[Standardization] \marginnote{Standardization}
|
||||
Transform a feature with a Gaussian distribution into a standard Gaussian distribution (zero mean, unit variance).
|
||||
\[ x' = \frac{x - \mu}{\sigma} \]
|
||||
|
||||
\item[Rescaling] \marginnote{Rescaling}
|
||||
Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).
|
||||
|
||||
\item[Affine transformation] \marginnote{Affine transformation}
|
||||
Apply a linear transformation on a feature before rescaling it.
|
||||
This method is more robust to outliers.
|
||||
|
||||
\item[Normalization] \marginnote{Normalization}
|
||||
Normalize each data row to unit norm.
|
||||
\end{description}
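
A minimal sketch of these transformations using scikit-learn (the example data is made up):
\begin{verbatim}
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

X = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 900.0]])

standardized = StandardScaler().fit_transform(X)                 # zero mean, unit variance
rescaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)   # map each feature into [0, 1]
normalized = Normalizer(norm='l2').fit_transform(X)              # each row scaled to unit norm
\end{verbatim}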
|
||||
@ -0,0 +1,356 @@
|
||||
\chapter{Data warehouse}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[\Acl{bi}] \marginnote{\Acl{bi}}
|
||||
Transform raw data into information.
|
||||
Deliver the right information to the right people at the right time through the right channel.
|
||||
|
||||
\item[\Ac{dwh}] \marginnote{\Acl{dwh}}
|
||||
Optimized repository that stores information for decision-making processes.
|
||||
\Acp{dwh} are a specific type of \ac{dss}.
|
||||
|
||||
Features:
|
||||
\begin{itemize}
|
||||
\item Subject-oriented: focused on enterprise-specific concepts.
|
||||
\item Integrates data from different sources and provides a unified view.
|
||||
\item Non-volatile storage with change tracking.
|
||||
\end{itemize}
|
||||
|
||||
\item[\Ac{dm}] \marginnote{\Acl{dm}}
|
||||
Subset of the primary \ac{dwh} with information relevant to a specific business area.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{\Acl{olap} (\Acs{olap})}
|
||||
|
||||
\begin{description}
|
||||
\item[\ac{olap} analyses] \marginnote{\Acl{olap} (\Acs{olap})}
|
||||
Analyses that interactively navigate the information in a data warehouse.
|
||||
They allow visualizing the data at different levels of aggregation.
|
||||
|
||||
\item[\ac{olap} session]
|
||||
Navigation path created by the operations that a user applied.
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.35\textwidth]{img/_olap_cube.pdf}
|
||||
\caption{\ac{olap} data cube}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\subsection{Operators}
|
||||
|
||||
\begin{description}
|
||||
\item[Roll-up] \marginnote{Roll-up}
|
||||
\begin{minipage}{0.7\textwidth}
|
||||
Increases the level of aggregation (i.e. \texttt{GROUP BY} in SQL).
|
||||
Some details are collapsed together.
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/olap_rollup.png}
|
||||
\end{minipage}
|
||||
|
||||
\item[Drill-down] \marginnote{Drill-down}
|
||||
\begin{minipage}{0.7\textwidth}
|
||||
Reduces the level of aggregation.
|
||||
Some details are reintroduced.
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/olap_drilldown.png}
|
||||
\end{minipage}
|
||||
|
||||
\item[Slice-and-dice] \marginnote{Slice-and-dice}
|
||||
\begin{minipage}{0.65\textwidth}
|
||||
The slice operator reduces the number of dimensions (i.e. drops columns).
|
||||
|
||||
The dice operator reduces the amount of data being analyzed by selecting a portion of it (i.e. a \texttt{WHERE} in SQL).
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/olap_slicedice.png}
|
||||
\end{minipage}
|
||||
|
||||
\item[Pivot] \marginnote{Pivot}
|
||||
\begin{minipage}{0.7\textwidth}
|
||||
Changes the layout of the data, to analyze it from a different viewpoint.
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/olap_pivot.png}
|
||||
\end{minipage}
|
||||
|
||||
\item[Drill-across] \marginnote{Drill-across}
|
||||
\begin{minipage}{0.7\textwidth}
|
||||
Links concepts from different data sources (i.e. \texttt{JOIN} in SQL).
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.15\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/olap_drillacross.png}
|
||||
\end{minipage}
|
||||
|
||||
\item[Drill-through] \marginnote{Drill-through}
|
||||
Switches from multidimensional aggregated data to operational data (e.g. a spreadsheet).
|
||||
\begin{center}
|
||||
\includegraphics[width=0.5\textwidth]{img/olap_drillthrough.png}
|
||||
\end{center}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{\Acl{etl} (\Acs{etl})}
|
||||
\marginnote{\Acl{etl} (\Acs{etl})}
|
||||
The \Ac{etl} process extracts, integrates and cleans operational data that will be loaded into a data warehouse.
|
||||
|
||||
|
||||
\subsection{Extraction}
|
||||
|
||||
Extracted operational data can be:
|
||||
\begin{descriptionlist}
|
||||
\item[Structured] \marginnote{Structured data}
|
||||
with a predefined data model (e.g. relational DB, CSV)
|
||||
|
||||
\item[Unstructured] \marginnote{Unstructured data}
|
||||
without a predefined data model (e.g. social media content)
|
||||
\end{descriptionlist}
|
||||
|
||||
Extraction can be of two types:
|
||||
\begin{descriptionlist}
|
||||
\item[Static] \marginnote{Static extraction}
|
||||
The entirety of the operational data are extracted to populate the
|
||||
data warehouse for the first time.
|
||||
|
||||
\item[Incremental] \marginnote{Incremental extraction}
|
||||
Only changes applied since the last extraction are considered.
|
||||
Can be based on a timestamp or a trigger.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
\subsection{Cleaning}
|
||||
|
||||
Operational data may contain:
|
||||
\begin{descriptionlist}
|
||||
\item[Duplicate data]
|
||||
\item[Missing data]
|
||||
\item[Improper use of fields] (e.g. saving the phone number in the \texttt{notes} field)
|
||||
\item[Wrong values] (e.g. 30th of February)
|
||||
\item[Inconsistencies] (e.g. use of different abbreviations)
|
||||
\item[Typos]
|
||||
\end{descriptionlist}
|
||||
|
||||
Methods to clean and increase the quality of the data are:
|
||||
\begin{descriptionlist}
|
||||
\item[Dictionary-based techniques] \marginnote{Dictionary-based cleaning}
|
||||
Lookup tables to substitute abbreviations, synonyms or typos.
|
||||
Applicable if the domain is known and limited.
|
||||
|
||||
\item[Approximate merging] \marginnote{Approximate merging}
|
||||
Methods to merge data that do not have a common key.
|
||||
\begin{description}
|
||||
\item[Approximate join]
|
||||
Use non-key attributes to join two tables (e.g. using the name and surname instead of a unique identifier).
|
||||
|
||||
\item[Similarity approach]
|
||||
Use similarity functions (e.g. edit distance) to merge multiple instances of the same information
|
||||
(e.g. typo in customer surname).
|
||||
\end{description}
|
||||
|
||||
\item[Ad-hoc algorithms] \marginnote{Ad-hoc algorithms}
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
\subsection{Transformation}
|
||||
Data are transformed to respect the format of the data warehouse:
|
||||
\begin{descriptionlist}
|
||||
\item[Conversion] \marginnote{Conversion}
|
||||
Modifications of types and formats (e.g. date format)
|
||||
|
||||
\item[Enrichment] \marginnote{Enrichment}
|
||||
Creating new information by using existing attributes (e.g. compute profit from receipts and expenses)
|
||||
|
||||
\item[Separation and concatenation] \marginnote{Separation and concatenation}
|
||||
Denormalization of the data: introduces redundancies (i.e. breaks normal form\footnote{\url{https://en.wikipedia.org/wiki/Database_normalization}})
|
||||
to speed up operations.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
\subsection{Loading}
|
||||
Adding data into a data warehouse:
|
||||
\begin{descriptionlist}
|
||||
\item[Refresh] \marginnote{Refresh loading}
|
||||
The entire \ac{dwh} is rewritten.
|
||||
|
||||
\item[Update] \marginnote{Update loading}
|
||||
Only the changes are added to the \ac{dwh}. Old data are not modified.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
|
||||
\section{Data warehouse architectures}
|
||||
|
||||
The architecture of a data warehouse should meet the following requirements:
|
||||
\begin{descriptionlist}
|
||||
\item[Separation] Separate the analytical and transactional workflows.
|
||||
\item[Scalability] Hardware and software should be easily upgradable.
|
||||
\item[Extensibility] Capability to host new applications and technologies without the need to redesign the system.
|
||||
\item[Security] Access control.
|
||||
\item[Administrability] Easily manageable.
|
||||
\end{descriptionlist}
|
||||
|
||||
\subsection{Single-layer architecture}
|
||||
\marginnote{Single-layer architecture}
|
||||
\begin{minipage}{0.55\textwidth}
|
||||
\begin{itemize}
|
||||
\item Minimizes the amount of data stored (i.e. no redundancies).
|
||||
\item The source layer is the only physical layer (i.e. no separation).
|
||||
\item A middleware provides the \ac{dwh} features.
|
||||
\end{itemize}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.4\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_1layer_dwh.pdf}
|
||||
\end{minipage}
|
||||
|
||||
|
||||
\subsection{Two-layer architecture}
|
||||
\marginnote{Two-layer architecture}
|
||||
\begin{minipage}{0.55\textwidth}
|
||||
\begin{itemize}
|
||||
\item Source data (source layer) are physically separated from the \ac{dwh} (data warehouse layer).
|
||||
\item A staging layer applies \ac{etl} procedures before populating the \ac{dwh}.
|
||||
\item The \ac{dwh} is a centralized repository from which data marts can be created.
|
||||
Metadata repositories store information on the sources, the staging area and the data mart schemas.
|
||||
\end{itemize}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.4\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_2layer_dwh.pdf}
|
||||
\end{minipage}
|
||||
|
||||
|
||||
\subsection{Three-layer architecture}
|
||||
\marginnote{Three-layer architecture}
|
||||
\begin{minipage}{0.45\textwidth}
|
||||
\begin{itemize}
|
||||
\item A reconciled layer enhances the cleaned data coming from the staging step by
|
||||
adding enterprise-level details (i.e. adds more redundancy before populating the \ac{dwh}).
|
||||
\end{itemize}
|
||||
\end{minipage}
|
||||
\hfill
|
||||
\begin{minipage}{0.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{img/_3layer_dwh.pdf}
|
||||
\end{minipage}



\section{Conceptual modeling}

\begin{description}
\item[\Acl{dfm} (\acs{dfm})] \marginnote{\Acl{dfm} (\acs{dfm})}
Conceptual model to support the design of data marts.
The main concepts are:
\begin{descriptionlist}
\item[Fact]
Concept relevant to decision-making processes (e.g. sales).
\item[Measure]
Numerical property that describes a fact (e.g. profit).
\item[Dimension]
Property of a fact with a finite domain (e.g. date).
\item[Dimensional attribute]
Property of a dimension (e.g. month).
\item[Hierarchy]
A tree whose root is a dimension and whose other nodes are dimensional attributes (e.g. date $\rightarrow$ month).
\item[Primary event]
Occurrence of a fact. It is described by a tuple with a value for each dimension and each measure.
\item[Secondary event]
Aggregation of primary events.
The measures of primary events are aggregated when they share the same values of the (preselected) dimensional attributes.
A small sketch of this aggregation is given after the figures below.
\end{descriptionlist}
\end{description}

\begin{figure}[ht]
\centering
\includegraphics[width=0.8\textwidth]{img/dfm.png}
\caption{Example of \ac{dfm}}
\end{figure}

\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/dfm_events.png}
\caption{Example of primary and secondary events}
\end{figure}
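
The following \texttt{pandas} sketch (not from the original notes; the fact, dimensions, and values are made up) shows how secondary events are obtained from primary events by aggregating the measures over a preselected dimensional attribute:
\begin{verbatim}
import pandas as pd

# Primary events of a hypothetical "sales" fact: one tuple per
# combination of dimension values, with a value for each measure.
primary = pd.DataFrame({
    "date":     ["2024-01-01", "2024-01-01", "2024-01-02"],
    "product":  ["apple", "banana", "apple"],
    "store":    ["S1", "S1", "S2"],
    "quantity": [10, 5, 7],        # measure
    "profit":   [4.0, 1.5, 2.8],   # measure
})

# Secondary events: measures aggregated over the preselected
# dimensional attribute "product".
secondary = (
    primary.groupby("product")[["quantity", "profit"]]
           .sum()
           .reset_index()
)
print(secondary)
\end{verbatim}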


\subsection{Aggregation operators}

Measures can be classified as:
\begin{descriptionlist}
\item[Flow measures] \marginnote{Flow measures}
Evaluated cumulatively with respect to a time interval (e.g. quantity sold).
\item[Level measures] \marginnote{Level measures}
Evaluated at a particular time (e.g. number of products in inventory).
\item[Unit measures] \marginnote{Unit measures}
Evaluated at a particular time but expressed in relative terms (e.g. unit price).
\end{descriptionlist}

Aggregation operators can be classified as follows (a small sketch contrasting the three classes is given after this list):
\begin{descriptionlist}
\item[Distributive] \marginnote{Distributive operators}
The global aggregate can be computed from partial aggregates (e.g. \texttt{SUM}, \texttt{MIN}, \texttt{MAX}).
\item[Algebraic] \marginnote{Algebraic operators}
Require a finite number of support measures to compute the result (e.g. \texttt{AVG}, which can be derived from \texttt{SUM} and \texttt{COUNT}).
\item[Holistic] \marginnote{Holistic operators}
Cannot be computed from a bounded number of support measures (e.g. \texttt{RANK}, \texttt{MEDIAN}).
\end{descriptionlist}
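
A plain Python sketch of the difference between the three classes (partitions and values are made up for illustration):
\begin{verbatim}
from statistics import median

# Two partitions of the same measure (e.g. sales split by month).
part1 = [3, 5, 7]
part2 = [2, 8]

# Distributive: the global aggregate is computed from partial aggregates.
total = sum([sum(part1), sum(part2)])   # SUM of partial SUMs

# Algebraic: a finite set of support measures (SUM and COUNT) suffices.
s = sum(part1) + sum(part2)
n = len(part1) + len(part2)
avg = s / n                             # AVG from (SUM, COUNT)

# Holistic: no fixed-size summary of the partitions is enough;
# the aggregate must be recomputed on the full data.
med = median(part1 + part2)

print(total, avg, med)
\end{verbatim}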

\begin{description}
\item[Additivity] \marginnote{Additive measure}
A measure is additive along a dimension if its values can be aggregated along that dimension using the \texttt{SUM} operator.
\begin{table}[ht]
\centering
\begin{tabular}{l | c | c}
 & \textbf{Temporal hierarchies} & \textbf{Non-temporal hierarchies} \\
\hline
\textbf{Flow measures} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\textbf{Level measures} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\textbf{Unit measures} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\end{tabular}
\caption{Allowed operators for each measure type}
\end{table}
\end{description}



\section{Logical design}
\marginnote{Logical design}
Definition of the data structures (e.g. tables and relationships) that implement the conceptual model.
There are two main strategies:
\begin{descriptionlist}
\item[Star schema] \marginnote{Star schema}
A fact table containing all the measures is linked to the (denormalized) dimension tables.
A sketch of a typical query on a star schema is given after this list.
\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{img/logical_star_schema.png}
\caption{Example of star schema}
\end{figure}

\item[Snowflake schema] \marginnote{Snowflake schema}
A variant of the star schema with partially normalized dimension tables.
\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{img/logical_snowflake_schema.png}
\caption{Example of snowflake schema}
\end{figure}
\end{descriptionlist}
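
The following sketch shows a typical analytical query on a hypothetical star schema using \texttt{pandas}: the fact table is joined with its dimension tables and the measures are aggregated along some dimensional attributes. Table and column names are illustrative assumptions.
\begin{verbatim}
import pandas as pd

# Denormalized dimension tables (made-up data).
dim_product = pd.DataFrame({"product_id": [1, 2],
                            "product": ["apple", "banana"],
                            "category": ["fruit", "fruit"]})
dim_store = pd.DataFrame({"store_id": [10, 20],
                          "city": ["Bologna", "Milano"]})

# Fact table: foreign keys towards the dimensions plus the measures.
fact_sales = pd.DataFrame({"product_id": [1, 1, 2],
                           "store_id": [10, 20, 10],
                           "quantity": [3, 5, 2],
                           "profit": [1.2, 2.0, 0.4]})

# Join the fact with its dimensions and aggregate the measures.
report = (
    fact_sales.merge(dim_product, on="product_id")
              .merge(dim_store, on="store_id")
              .groupby(["category", "city"])[["quantity", "profit"]]
              .sum()
              .reset_index()
)
print(report)
\end{verbatim}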

@ -0,0 +1,98 @@

\chapter{Introduction}


\section{Data}

\begin{description}
\item[Data] \marginnote{Data}
Collection of raw values.

\item[Information] \marginnote{Information}
Data organized so that they carry meaning (e.g. through relationships, context, \dots).

\item[Knowledge] \marginnote{Knowledge}
Understanding of information.
\end{description}


\subsection{Data sources}
\begin{description}
\item[Transaction] \marginnote{Transaction}
Business event that generates or modifies data in an information system (e.g. a database).

\item[Signal] \marginnote{Signal}
Measurement produced by a sensor.

\item[External subjects]
Data provided by subjects outside the organization (e.g. third-party providers).
\end{description}


\subsection{Software}
\begin{description}
\item[\Ac{oltp}] \marginnote{\Acl{oltp}}
Class of programs to support transaction-oriented applications and data storage.
Suitable for real-time applications.

\item[\Ac{erp}] \marginnote{\Acl{erp}}
Integrated system to manage all the processes of a business.
Uses a shared database for all applications.
Suitable for real-time applications.
\end{description}


\subsection{Insight}
Decisions can be classified as:
\begin{descriptionlist}
\item[Structured] \marginnote{Structured decision}
Established and well-understood situations.
The information needed for the decision is known.
\item[Unstructured] \marginnote{Unstructured decision}
Unplanned and unclear situations.
The information needed for the decision is unknown.
\end{descriptionlist}

Different levels of insight can be obtained through:
\begin{descriptionlist}
\item[\Ac{mis}] \marginnote{\Acl{mis}}
Standardized reporting system built on top of an existing \ac{oltp} system.
Used for structured decisions.

\item[\Ac{dss}] \marginnote{\Acl{dss}}
Analytical system that provides support for unstructured decisions.

\item[\Ac{eis}] \marginnote{\Acl{eis}}
Supports the formulation of high-level decisions that impact the whole organization.

\item[\Ac{olap}] \marginnote{\Acl{olap}}
Aggregated analysis of multidimensional data.
Involves large amounts of data.

\item[\Ac{bi}] \marginnote{\Acl{bi}}
Applications, infrastructure, tools, and best practices to analyze information.
\end{descriptionlist}



\begin{description}
\item[Big data] \marginnote{Big data}
Large, complex, and/or fast-changing collections of data that traditional DBMSs are unable to process.
\begin{description}
\item[Structured] e.g. relational tables.
\item[Unstructured] e.g. videos.
\item[Semi-structured] e.g. JSON.
\end{description}

\item[Analytics] \marginnote{Analytics}
Data-driven support for structured decisions.

\item[Data mining] \marginnote{Data mining}
Discovery process to support unstructured decisions.
\begin{figure}[ht]
\centering
\includegraphics[width=0.8\textwidth]{img/data_mining_process.png}
\caption{Data mining process}
\end{figure}

\item[Machine learning] \marginnote{Machine learning}
Models and algorithms that allow patterns to be extracted from data.
\end{description}

@ -0,0 +1,172 @@

\chapter{Machine learning}

\begin{description}
\item[Machine learning] \marginnote{Machine learning}
Application of methods and algorithms to extract patterns from data.
\end{description}

\section{Tasks}
\begin{description}
\item[Classification] Prediction of the class of an individual among a finite set of classes.
\item[Regression] Estimation of a numeric value.
\item[Similarity matching] Identification of similar individuals.
\item[Clustering] Grouping of individuals based on their similarity.
\item[Co-occurrence grouping] Identification of associations between entities based on the transactions in which they appear together.
\item[Profiling] Description of the typical behavior of individuals.
\item[Link analysis] Analysis of connections (e.g. in a graph).
\item[Data reduction] Reduction of the dimensionality of the data with minimal information loss.
\item[Causal modeling] Understanding of the connections between events and actions.
\end{description}


\section{Categories}
\begin{description}
\item[Supervised learning] \marginnote{Supervised learning}
Problems in which the target(s) to predict is defined.
\item[Unsupervised learning] \marginnote{Unsupervised learning}
Problems in which no specific target is defined.
\item[Reinforcement learning] \marginnote{Reinforcement learning}
Learning of a policy to generate a sequence of actions.
\end{description}



\section{Data}

\begin{description}
\item[Dataset] \marginnote{Dataset}
Set of $N$ individuals, each described by $D$ features.
\end{description}


\subsection{Data types}

\begin{description}
\item[Categorical] Values with a discrete domain.
\begin{description}
\item[Nominal] \marginnote{Categorical nominal data}
The values are a set of non-ordered labels.

\textbf{Operators.} $=$, $\neq$
\begin{example}
Name, surname, zip code.
\end{example}

\item[Ordinal] \marginnote{Categorical ordinal data}
The values are a set of totally ordered labels.

\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$
\begin{example}
Non-numerical quality evaluations (excellent, good, fair, poor, bad).
\end{example}
\end{description}

\item[Numerical] Values with a continuous domain.
\begin{description}
\item[Interval] \marginnote{Numerical interval data}
Numerical values without a meaningful zero point (i.e. 0 is not an absolute reference).
Comparing the magnitude (i.e. taking ratios) of this type of data is not meaningful.

\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$
\begin{example}
Celsius and Fahrenheit temperature scales, CGPA, time, \dots

For instance, there is a $6.25\%$ increase from $16\text{°C}$ to $17\text{°C}$, but,
converted to Fahrenheit, the increase is $2.96\%$ (from $60.8\text{°F}$ to $62.6\text{°F}$).
This example is checked numerically in the sketch below.
\end{example}

\item[Ratio] \marginnote{Numerical ratio data}
Values with an absolute zero point, so ratios between values are meaningful.

\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$, $\times$, $\div$
\begin{example}
Kelvin temperature scale, age, income, length.

For instance, there is a $10\%$ increase from 100\$ to 110\$.
Converted to euros (1\geneuro = 1.06\$), the increase is still $10\%$ (from $94.34\geneuro$ to $103.77\geneuro$).
\end{example}
\end{description}
\end{description}
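
The temperature and currency examples above can be checked numerically. The sketch below (plain Python, using the same exchange rate as the example) shows that relative increases are not preserved by the Celsius-to-Fahrenheit conversion (interval scale) but are preserved by a change of currency (ratio scale):
\begin{verbatim}
def pct_increase(old, new):
    """Relative increase between two values, in percent."""
    return (new - old) / old * 100

def celsius_to_fahrenheit(c):
    return c * 9 / 5 + 32

# Interval scale: the relative increase depends on the arbitrary zero point.
print(pct_increase(16, 17))                                  # 6.25
print(pct_increase(celsius_to_fahrenheit(16),
                   celsius_to_fahrenheit(17)))               # ~2.96

# Ratio scale: the relative increase is invariant under a change of unit.
rate = 1.06  # assumed exchange rate: 1 euro = 1.06 dollars
print(pct_increase(100, 110))                                # 10.0
print(pct_increase(100 / rate, 110 / rate))                  # 10.0
\end{verbatim}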


\subsection{Transformations}
\begin{center}
\begin{tabular}{c|c|>{\raggedright\arraybackslash}m{8cm}}
\hline
\multicolumn{2}{c|}{\textbf{Data type}} & \textbf{Transformation} \\
\hline
\multirow{2}{*}{Categorical} & Nominal & One-to-one transformations \\
\cline{2-3}
 & Ordinal & Order preserving transformations (i.e. monotonic functions) \\
\hline
\multirow{2}{*}{Numerical} & Interval & Linear transformations \\
\cline{2-3}
 & Ratio & Any mathematical function, standardization, variation in percentage \\
\hline
\end{tabular}
\end{center}


% \subsection{Dataset characteristics}
% \begin{description}
%     \item[Dimensionality]
%     \item[Sparsity]
%     \item[Missing data]
%     \item[Resolution]
% \end{description}
\subsection{Dataset format}
\begin{description}
\item[Relational table] \marginnote{Relational table}
All the records have the same set of attributes.

\item[Data matrix] \marginnote{Data matrix}
Matrix with $N$ rows (entries) and $D$ columns (attributes).

\item[Sparse matrix] \marginnote{Sparse matrix}
Data matrix in which most of the entries are zero (a small sketch is given after this list).
\begin{example}[Bag-of-words]
Each row represents a document, each column represents a term.
The $i,j$-th cell contains the frequency of the $j$-th term in the $i$-th document.
\end{example}

\item[Transactional data] \marginnote{Transactional data}
Each record contains a set of objects (not necessarily representable as a relational table).

\item[Graph data] \marginnote{Graph data}
Set of nodes and edges.

\item[Ordered data] \marginnote{Ordered data}
Data with an ordering among records or values (e.g. temporal data).
\end{description}
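
A minimal plain-Python sketch (with made-up documents) of a bag-of-words data matrix and of a sparse representation that stores only the non-zero entries:
\begin{verbatim}
from collections import Counter

docs = [
    "data mining extracts patterns from data",
    "machine learning learns patterns",
]

# Vocabulary: one column per distinct term.
vocab = sorted({term for doc in docs for term in doc.split()})

# Dense bag-of-words matrix: most of its entries are zero.
dense = [[Counter(doc.split())[term] for term in vocab] for doc in docs]

# Sparse representation: keep only the non-zero (row, column) -> count entries.
sparse = {}
for i, doc in enumerate(docs):
    for term, count in Counter(doc.split()).items():
        sparse[(i, vocab.index(term))] = count

print(vocab)
print(dense)
print(sparse)
\end{verbatim}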


\subsection{Data quality}
\begin{description}
\item[Noise] \marginnote{Noise}
Alteration of the original values.

\item[Outliers] \marginnote{Outliers}
Data that considerably differ from the majority of the dataset.
May be caused by noise or by rare events.

Box plots can be used to visually detect outliers.

\item[Missing values] \marginnote{Missing values}
Data that have not been collected.
Sometimes they are not easily recognizable
(e.g. when special values are used to mark missing data instead of \texttt{null}).

They can be handled in different ways (the first two strategies are sketched after this list):
\begin{itemize}
\item Ignore the records with missing values.
\item Estimate the missing values or replace them with a default.
\item Ignore the fact that some values are missing (not always applicable).
\item Insert all the possible values and weigh them by their probability.
\end{itemize}

\item[Duplicated data] \marginnote{Duplicated data}
Records that refer to the same entity and may need to be merged.
\end{description}
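
A small \texttt{pandas} sketch (with made-up values) of the first two strategies for handling missing values:
\begin{verbatim}
import numpy as np
import pandas as pd

# Hypothetical dataset where -1 is (improperly) used to mark a missing age.
df = pd.DataFrame({"age": [25, -1, 40, 31],
                   "income": [30000, 42000, np.nan, 51000]})

# Make the missing values recognizable first.
df["age"] = df["age"].replace(-1, np.nan)

dropped = df.dropna()                            # ignore records with missing values
defaults = df.fillna({"age": 0, "income": 0})    # replace with a default value
imputed = df.fillna(df.mean(numeric_only=True))  # estimate with the column mean

print(dropped, defaults, imputed, sep="\n\n")
\end{verbatim}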

@ -0,0 +1,56 @@

\chapter{Regression}

\begin{description}
\item[Linear regression] \marginnote{Linear regression}
Given:
\begin{itemize}
\item A dataset $\matr{X}$ of $N$ rows and $D$ features.
\item A response vector $\vec{y}$ of $N$ continuous values.
\end{itemize}
We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
\[ \vec{y} \approx \matr{X}\vec{w} \]

\item[Mean squared error] \marginnote{Mean squared error}
To find the parameters of linear regression,
we minimize as loss function the mean squared error
(the constant factor $\frac{1}{N}$ is omitted since it does not change the minimizer):
\[
\mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w} - \vec{y} \Vert^2
\]
Its gradient is:
\[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w} - \vec{y}) \]
Setting it to zero, we obtain the normal equations:
\[ \matr{X}^T\matr{X}\vec{w} = \matr{X}^T\vec{y} \]
If $\matr{X}^T\matr{X}$ is invertible, this system can be solved analytically, but doing so could lead to overfitting.
Numerical methods are therefore often preferred (a small numerical sketch is given at the end of this chapter).

Note that:
\begin{itemize}
\item MSE is influenced by the magnitude of the data.
\item It measures the fit of a model in absolute terms.
% \item It is suited to compare different models.
\end{itemize}

\item[Coefficient of determination] \marginnote{Coefficient of determination}
Given:
\begin{itemize}
\item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
\item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{w}^T\vec{x}_i)^2$.
\item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
\end{itemize}
The coefficient of determination is given by:
\[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]

Intuitively, $\text{R}^2$ compares the model with a horizontal straight line ($y_\text{avg}$).
Since $SS_\text{res} \geq 0$, it always holds that $\text{R}^2 \leq 1$.
When $\text{R}^2 = 1$, the model has a perfect fit.
When $\text{R}^2 < 0$, the model fits the data worse than the horizontal line $y_\text{avg}$.

Note that:
\begin{itemize}
\item $\text{R}^2$ is a standardized index.
\item $\text{R}^2$ indicates how well the predictor variables explain the variation of the target.
\item $\text{R}^2$ is not suited for non-linear models.
\end{itemize}

\item[Polynomial regression] \marginnote{Polynomial regression}
A polynomial, instead of a hyperplane, is fitted to the data (e.g. by extending the features with polynomial terms and applying linear regression).
\end{description}
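
The following NumPy sketch (on synthetic data, not from the original notes) fits a linear model by solving the least-squares problem above and computes both the MSE and the coefficient of determination:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Synthetic data: N individuals, D features, known true parameters.
N, D = 200, 3
X = rng.normal(size=(N, D))
w_true = np.array([1.5, -2.0, 0.5])
y = X @ w_true + rng.normal(scale=0.1, size=N)

# Solve X^T X w = X^T y (lstsq is numerically more stable than
# explicitly inverting X^T X).
w, *_ = np.linalg.lstsq(X, y, rcond=None)

y_hat = X @ w
mse = np.mean((y_hat - y) ** 2)

ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot

print(w, mse, r2)
\end{verbatim}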