Moved ML in year1

2023-12-22 20:00:40 +01:00
parent 3dc77a448a
commit fa4f50df48
57 changed files with 1 addition and 1 deletion

View File

@ -0,0 +1 @@
../../ainotes.cls

View File

@ -0,0 +1,41 @@
\documentclass[11pt]{ainotes}
\title{Machine Learning and Data Mining}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
\DeclareAcronym{oltp}{short=OLTP, long=Online Transaction Processing}
\DeclareAcronym{erp}{short=ERP, long=Enterprise Resource Planning}
\DeclareAcronym{mis}{short=MIS, long=Management Information System}
\DeclareAcronym{dss}{short=DSS, long=Decision Support System}
\DeclareAcronym{eis}{short=EIS, long=Executive Information System}
\DeclareAcronym{olap}{short=OLAP, long=Online Analytical Processing}
\DeclareAcronym{bi}{short=BI, long=Business Intelligence}
\DeclareAcronym{dwh}{short=DWH, long=Data Warehouse}
\DeclareAcronym{dm}{short=DM, long=Data Mart}
\DeclareAcronym{etl}{short=ETL, long=Extraction{,} Transformation{,} Loading}
\DeclareAcronym{dfm}{short=DFM, long=Dimensional Fact Model}
\DeclareAcronym{cdc}{short=CDC, long=Change Data Capture}
\DeclareAcronym{crisp}{short=CRISP-DM, long=Cross Industry Standard Process for Data Mining}
\begin{document}
\makenotesfront
\printacronyms
\newpage
\input{sections/_intro.tex}
\input{sections/_data_warehouse.tex}
\input{sections/_data_lake.tex}
\input{sections/_crisp.tex}
\input{sections/_machine_learning.tex}
\input{sections/_data_prepro.tex}
\input{sections/_classification.tex}
\input{sections/_regression.tex}
\input{sections/_clustering.tex}
\input{sections/_association_rules.tex}
\eoc
\end{document}

Binary files not shown (28 image files added).

View File

@ -0,0 +1,31 @@
<mxfile host="app.diagrams.net" modified="2023-10-13T17:44:38.951Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0" etag="0k6DN-mG6fDlB8POdY3R" version="22.0.4" type="device">
<diagram name="Pagina-1" id="Obl2eNAEIfPRNowj_f7H">
<mxGraphModel dx="1195" dy="622" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="j0uoPLtJFFh1yWsyPPyp-1" value="" style="endArrow=classic;html=1;rounded=0;strokeWidth=2;startArrow=classic;startFill=1;fontSize=20;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="180" y="400" as="sourcePoint" />
<mxPoint x="680" y="400" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-2" value="Data warehouse" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="180" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-3" value="&lt;div align=&quot;right&quot; style=&quot;font-size: 20px;&quot;&gt;Data lake&lt;br style=&quot;font-size: 20px;&quot;&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="530" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-4" value="Data hub" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="360" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-5" value="Hot" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="180" y="410" width="60" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-6" value="Cold" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="620" y="410" width="60" height="30" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

Binary files not shown (3 image files added).

View File

@ -0,0 +1,11 @@
{
    "name": "Machine Learning and Data Mining",
    "year": 1,
    "semester": 1,
    "pdfs": [
        {
            "name": null,
            "path": "dm-ml.pdf"
        }
    ]
}

View File

@ -0,0 +1,353 @@
\chapter{Association rules}
\section{Frequent itemset}
\begin{description}
\item[Itemset] \marginnote{Itemset}
Collection of one or more items (e.g. $\{ \text{milk}, \text{bread}, \text{diapers} \}$).
\item[K-itemset] \marginnote{K-itemset}
Itemset with $k$ items.
\item[Support count] \marginnote{Support count}
Number of occurrences of an itemset in a dataset.
\begin{example}
\phantom{}\\
\begin{minipage}{0.4\textwidth}
Given the following transactions:
\begin{center}
\begin{tabular}{|c|l|}
\hline
1 & bread, milk \\
2 & beer, bread, diaper, eggs \\
3 & beer, coke, diaper, milk \\
\textbf{4} & \textbf{beer, bread, diaper, milk} \\
\textbf{5} & \textbf{bread, coke, diaper, milk} \\
\hline
\end{tabular}
\end{center}
\end{minipage}
\begin{minipage}{0.5\textwidth}
The support count of the itemset containing bread, diapers and milk is:
\[ \sigma(\{ \text{bread}, \text{diapers}, \text{milk} \}) = 2 \]
\end{minipage}
\end{example}
\item[Association rule] \marginnote{Association rule}
Given two itemsets $A$ and $C$, an association rule has form:
\[ A \rightarrow C \]
It means that there are transactions in the dataset where $A$ and $C$ co-occur.
Note that it is not strictly a logical implication.
\item[Metrics] \phantom{}
\begin{description}
\item[Support] \marginnote{Support}
Given $N$ transactions, the support of an itemset $A$ is:
\[ \texttt{sup}(A) = \frac{\sigma(A)}{N} \]
The support of an association rule $A \rightarrow C$ is:
\[ \texttt{sup}(A \rightarrow C) = \texttt{sup}(A \cup C) = \frac{\sigma(A \cup C)}{N} \]
A low support suggests that the association may simply be due to chance.
\begin{description}
\item[Frequent itemset] \marginnote{Frequent itemset}
Itemset whose support is at least a given threshold.
\end{description}
\item[Confidence] \marginnote{Confidence}
Given an association rule $A \rightarrow C$, its confidence is given by:
\[ \texttt{conf}(A \rightarrow C) = \frac{\sigma(A \cup C)}{\sigma(A)} \in [0, 1] \]
Low confidence implies low reliability.
\begin{theorem}
The confidence of $A \rightarrow C$ can be computed given the supports of $A \rightarrow C$ and $A$:
\[ \texttt{conf}(A \rightarrow C) = \frac{\texttt{sup}(A \rightarrow C)}{\texttt{sup}(A)} \]
\end{theorem}
\end{description}
\item[Association rule mining] \marginnote{Association rule mining}
Given $N$ transactions and two thresholds \texttt{min\_sup} and \texttt{min\_conf},
association rule mining finds all the rules $A \rightarrow C$ such that:
\[ \begin{split}
\texttt{sup}(A \rightarrow C) &\geq \texttt{min\_sup} \\
\texttt{conf}(A \rightarrow C) &\geq \texttt{min\_conf}
\end{split} \]
This can be done in two steps:
\begin{enumerate}
\item \marginnote{Frequent itemset generation}
Determine the itemsets with $\text{support} \geq \texttt{min\_sup}$ (frequent itemsets).
\item \marginnote{Rule generation}
Determine the association rules with $\text{confidence} \geq \texttt{min\_conf}$.
\end{enumerate}
\end{description}
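As an illustrative aside (not part of the original notes), the small Python sketch below computes support and confidence directly from the definitions above, reusing the toy transactions of the earlier example:
\begin{lstlisting}
# Sketch: support and confidence from their definitions
# (toy transactions from the earlier example).
transactions = [
    {"bread", "milk"},
    {"beer", "bread", "diaper", "eggs"},
    {"beer", "coke", "diaper", "milk"},
    {"beer", "bread", "diaper", "milk"},
    {"bread", "coke", "diaper", "milk"},
]

def support_count(itemset, transactions):
    return sum(1 for t in transactions if itemset <= t)

def support(itemset, transactions):
    return support_count(itemset, transactions) / len(transactions)

def confidence(antecedent, consequent, transactions):
    # conf(A -> C) = sup(A u C) / sup(A)
    return support(antecedent | consequent, transactions) / support(antecedent, transactions)

print(support({"bread", "diaper", "milk"}, transactions))  # 0.4 (support count 2)
print(confidence({"bread"}, {"milk"}, transactions))       # 0.75
\end{lstlisting}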
\section{Frequent itemset generation}
\subsection{Brute force}
Given $D$ items, there are $2^D$ possible itemsets.
To compute the support of a single itemset, the complexity is $O(NW)$ where
$N$ is the number of transactions and $W$ is the width of the largest transaction.
Listing all the itemsets and computing their supports therefore has an overall exponential complexity of $O(NW2^D)$.
\subsection{Apriori principle}
\begin{theorem} \marginnote{Apriori principle}
If an itemset is frequent, then all of its subsets are frequent.
\begin{proof}
By the definition of support, it holds that:
\[ \forall X, Y: (X \subseteq Y) \Rightarrow (\texttt{sup}(X) \geq \texttt{sup}(Y)) \]
In other words, the support metric is anti-monotone.
Hence, any subset of a frequent itemset has at least the same support and is therefore frequent as well.
\end{proof}
\end{theorem}
\begin{corollary}
If an itemset is infrequent, then all of its supersets are infrequent.
\end{corollary}
\begin{example} \phantom{}
\begin{center}
\includegraphics[width=0.6\textwidth]{img/itemset_apriori.png}
\end{center}
\end{example}
\begin{algorithm}[H]
\caption{Apriori principle}
\begin{lstlisting}[mathescape=true]
def candidatesGeneration(freq_itemsets$_k$):
    candidate_itemsets$_{k+1}$ = selfJoin(freq_itemsets$_k$)
    for itemset in candidate_itemsets$_{k+1}$:
        for sub in subsetsOfSize($k$, itemset):
            if sub not in freq_itemsets$_k$:
                candidate_itemsets$_{k+1}$.remove(itemset)
    return candidate_itemsets$_{k+1}$

def aprioriItemsetGeneration(transactions, min_sup):
    freq_itemsets$_1$ = itemsetsOfSize(1, transactions)
    k = 1
    while freq_itemsets$_k$ is not empty:
        candidate_itemsets$_{k+1}$ = candidatesGeneration(freq_itemsets$_k$)
        freq_itemsets$_{k+1}$ = $\{ c \in \texttt{candidate\_itemsets}_{k+1} \mid \texttt{sup(}c\texttt{)} \geq \texttt{min\_sup} \}$
        k += 1
    return freq_itemsets$_1 \cup \dots \cup$ freq_itemsets$_k$
\end{lstlisting}
\end{algorithm}
\begin{description}
\item[Complexity]
The complexity of the apriori principle depends on:
\begin{itemize}
\item The choice of the support threshold.
\item The number of unique items.
\item The number and the width of the transactions.
\end{itemize}
\end{description}
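For reference, the pseudocode above can be turned into a small self-contained Python sketch; this is an illustration of the same idea rather than the course's implementation, and the names and structure are my own:
\begin{lstlisting}
from itertools import combinations

def apriori_frequent_itemsets(transactions, min_sup):
    # Sketch of Apriori frequent-itemset generation (illustrative, not optimized).
    n = len(transactions)

    def support(itemset):
        return sum(1 for t in transactions if itemset <= t) / n

    items = {frozenset([i]) for t in transactions for i in t}
    frequent = {s for s in items if support(s) >= min_sup}
    all_frequent, k = set(frequent), 1
    while frequent:
        # Self-join: merge frequent k-itemsets into candidate (k+1)-itemsets.
        candidates = {a | b for a in frequent for b in frequent if len(a | b) == k + 1}
        # Apriori pruning: every k-subset of a candidate must itself be frequent.
        candidates = {c for c in candidates
                      if all(frozenset(s) in frequent for s in combinations(c, k))}
        frequent = {c for c in candidates if support(c) >= min_sup}
        all_frequent |= frequent
        k += 1
    return all_frequent
\end{lstlisting}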
\section{Rule generation}
\subsection{Brute force}
Given a frequent $k$-itemset $L$, there are $2^k-2$ possible association rules ($-2$ as $L \rightarrow \varnothing$ and $\varnothing \rightarrow L$ can be ignored).
For each possible rule, it is necessary to compute the confidence. The overall complexity is exponential.
\subsection{Apriori principle}
\begin{theorem} \marginnote{Apriori principle}
Without loss of generality, consider an itemset $\{ A, B, C, D \}$.
It holds that:
\[ \texttt{conf}(ABC \rightarrow D) \geq \texttt{conf}(AB \rightarrow CD) \geq \texttt{conf}(A \rightarrow BCD) \]
\end{theorem}
\begin{example} \phantom{}
\begin{center}
\includegraphics[width=0.5\textwidth]{img/rules_apriori.png}
\end{center}
\end{example}
\section{Interestingness measures}
\begin{description}
\item[Contingency table] \marginnote{Contingency table}
Given an association rule $A \rightarrow C$, its contingency table is defined as:
\begin{center}
\def\arraystretch{1.1}
\begin{tabular}{c|c|c|c}
& $C$ & $\overline{C}$ & \\
\hline
$A$ & $\prob{A \land C}$ & $\prob{A \land \overline{C}}$ & $\prob{A}$ \\
\hline
$\overline{A}$ & $\prob{\overline{A} \land C}$ & $\prob{\overline{A} \land \overline{C}}$ & $\prob{\overline{A}}$ \\
\hline
& $\prob{C}$ & $\prob{\overline{C}}$ & $1$ \\
\end{tabular}
\end{center}
\end{description}
\begin{remark}
Confidence can be misleading.
\begin{example} \phantom{}\\
\begin{minipage}[t]{0.36\textwidth}
Given the following contingency table:
\begin{center}
\begin{tabular}{c|c|c|c}
& coffee & $\overline{\text{coffee}}$ & \\
\hline
tea & 15 & 5 & 20 \\
\hline
$\overline{\text{tea}}$ & 75 & 5 & 80 \\
\hline
& 90 & 10 & 100 \\
\end{tabular}
\end{center}
\end{minipage}
\hspace{0.5cm}
\begin{minipage}[t]{0.6\textwidth}
We have that:
\[ \texttt{conf}(\text{tea} \rightarrow \text{coffee}) = \frac{\texttt{sup}(\text{tea}, \text{coffee})}{\texttt{sup}(\text{tea})} = \frac{15}{20} = 0.75 \]
But, we also have that:
\[ \prob{\text{coffee}} = 0.9 \hspace*{1cm} \prob{\text{coffee} \mid \overline{\text{tea}}} = \frac{75}{80} = 0.9375 \]
So, despite the high confidence of $(\text{tea} \rightarrow \text{coffee})$,
the probability of coffee increases in absence of tea.
\end{minipage}
\end{example}
\end{remark}
\subsection{Statistical-based measures}
Measures that take into account whether the antecedent and the consequent are statistically independent.
\begin{description}
\item[Lift] \marginnote{Lift}
\[ \texttt{lift}(A \rightarrow C) = \frac{\texttt{conf}(A \rightarrow C)}{\texttt{sup}(C)} = \frac{\prob{A \land C}}{\prob{A}\prob{C}} \]
If $\texttt{lift}(A \rightarrow C) = 1$, then $A$ and $C$ are independent.
\item[Leverage] \marginnote{Leverage}
\[ \texttt{leve}(A \rightarrow C) = \texttt{sup}(A \cup C) - \texttt{sup}(A)\texttt{sup}(C) = \prob{A \land C} - \prob{A}\prob{C} \]
If $\texttt{leve}(A \rightarrow C) = 0$, then $A$ and $C$ are independent.
\item[Conviction] \marginnote{Conviction}
\[ \texttt{conv}(A \rightarrow C) = \frac{1 - \texttt{sup}(C)}{1 - \texttt{conf}(A \rightarrow C)} = \frac{\prob{A}(1-\prob{C})}{\prob{A}-\prob{A \land C}} \]
\end{description}
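As a quick check of these measures (a sketch added for illustration), the numbers of the tea/coffee example above give:
\begin{lstlisting}
# Probabilities taken from the tea/coffee contingency table (100 transactions).
p_a, p_c, p_ac = 0.20, 0.90, 0.15   # P(tea), P(coffee), P(tea and coffee)

conf = p_ac / p_a                    # 0.75
lift = p_ac / (p_a * p_c)            # ~0.83 < 1: negative association
leverage = p_ac - p_a * p_c          # -0.03
conviction = (1 - p_c) / (1 - conf)  # 0.4 < 1
print(conf, lift, leverage, conviction)
\end{lstlisting}
Despite the high confidence, a lift below 1 confirms the negative association pointed out in the remark above.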
\begin{table}[H]
\centering
\begin{tabular}{c|p{10cm}}
\hline
\textbf{Metric} & \textbf{Interpretation} \\
\hline
High support & The rule applies to many transactions. \\
\hline
High confidence & Transactions that contain the antecedent are very likely to also contain the consequent. \\
\hline
High lift & Low chance that the rule is just a coincidence. \\
\hline
High conviction & The rule is violated less often compared to the case when the antecedent and consequent are independent. \\
\hline
\end{tabular}
\caption{Intuitive interpretation of the measures}
\end{table}
\section{Multi-dimensional association rules}
\begin{description}
\item[Mono-dimensional events] \marginnote{Mono-dimensional events}
Represented as transactions. Each event contains items that appear together.
\item[Multi-dimensional events] \marginnote{Multi-dimensional events}
Represented as tuples. Each event contains the values of its attributes.
\item[Mono/Multi-dimensional equivalence] \marginnote{Equivalence}
Mono-dimensional events can be converted into multi-dimensional events and vice versa.
To transform quantitative attributes, it is usually useful to discretize them.
\begin{example}[Multi to mono] \phantom{}\\
\begin{minipage}{0.35\textwidth}
\begin{center}
\begin{tabular}{c|c|c}
\textbf{Id} & \textbf{co2} & \textbf{tin\_oxide} \\
\hline
1 & high & medium \\
2 & medium & low \\
\end{tabular}
\end{center}
\end{minipage}
$\rightarrow$
\begin{minipage}{0.48\textwidth}
\begin{center}
\begin{tabular}{c|c}
\textbf{Id} & \textbf{Transaction} \\
\hline
1 & $\{ \text{co2/high}, \text{tin\_oxide/medium} \}$ \\
2 & $\{ \text{co2/medium}, \text{tin\_oxide/low} \}$ \\
\end{tabular}
\end{center}
\end{minipage}
\end{example}
\begin{example}[Mono to multi] \phantom{}\\
\begin{minipage}{0.35\textwidth}
\begin{center}
\begin{tabular}{c|c|c|c|c}
\textbf{Id} & \textbf{a} & \textbf{b} & \textbf{c} & \textbf{d} \\
\hline
1 & yes & yes & no & no \\
2 & yes & no & yes & no \\
\end{tabular}
\end{center}
\end{minipage}
$\leftarrow$
\begin{minipage}{0.30\textwidth}
\begin{center}
\begin{tabular}{c|c}
\textbf{Id} & \textbf{Transaction} \\
\hline
1 & $\{ \text{a}, \text{b} \}$ \\
2 & $\{ \text{a}, \text{c} \}$ \\
\end{tabular}
\end{center}
\end{minipage}
\end{example}
\end{description}
\section{Multi-level association rules}
Organize items into a hierarchy.
\begin{description}
\item[Specialized to general] \marginnote{Specialized to general}
Generally, the support of the rule increases.
\begin{example}
From $(\text{apple} \rightarrow \text{milk})$ to $(\text{fruit} \rightarrow \text{dairy})$
\end{example}
\item[General to specialized] \marginnote{General to specialized}
Generally, the support of the rule decreases.
\begin{example}
From $(\text{fruit} \rightarrow \text{dairy})$ to $(\text{apple} \rightarrow \text{milk})$
\end{example}
\item[Redundant level] \marginnote{Redundant level}
A more specialized rule in the hierarchy is redundant if its confidence is similar to that of a more general rule.
\item[Multi-level association rule mining] \marginnote{Multi-level association rule mining}
Run association rule mining on different levels of abstraction (general to specialized).
At each level, the support threshold is decreased.
\end{description}

View File

@ -0,0 +1,889 @@
\chapter{Classification}
\begin{description}
\item[(Supervised) classification] \marginnote{Classification}
Given a finite set of classes $C$ and a dataset $\matr{X}$ of $N$ individuals,
each associated to a class $y(\vec{x}) \in C$,
we want to learn a model $\mathcal{M}$ able to
guess the value of $y(\bar{\vec{x}})$ for unseen individuals.
Classification can be:
\begin{descriptionlist}
\item[Crisp] \marginnote{Crisp classification}
Each individual has one and only one label.
\item[Probabilistic] \marginnote{Probabilistic classification}
Each individual is assigned to a label with a certain probability.
\end{descriptionlist}
\item[Classification model] \marginnote{Classification model}
A classification model (classifier) makes a prediction by taking as input
a data element $\vec{x}$ and a decision function $y_\vec{\uptheta}$ parametrized on $\vec{\uptheta}$:
\[ \mathcal{M}(\vec{x}, \vec{\uptheta}) = y_\vec{\uptheta}(\vec{x}) \]
\item[Vapnik-Chervonenkis dimension] \marginnote{Vapnik-Chervonenkis dimension}
A dataset with $N$ elements defines $2^N$ binary learning problems (one per labeling).
A model $\mathcal{M}$ has Vapnik-Chervonenkis (VC) dimension $N$ if
$N$ is the largest number of elements for which it is able to solve all the possible learning problems (i.e. it can shatter some set of $N$ points).
\begin{example}
A straight line has VC dimension 3.
\end{example}
\item[Data exploration] \marginnote{Data exploration}
\begin{figure}[ht]
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_iris_boxplot_general.pdf}
\caption{Iris dataset general boxplot}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_iris_boxplot_inside.pdf}
\caption{Iris dataset class boxplot}
\end{subfigure}
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_iris_histogram.pdf}
\caption{Iris dataset histograms}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_iris_pairplot.pdf}
\caption{Iris dataset pairplots}
\end{subfigure}
\end{figure}
\item[Hyperparameters]
Parameters of the model that have to be manually chosen.
\end{description}
\section{Evaluation}
\begin{description}
\item[Dataset split]
A supervised dataset can be randomly split into:
\begin{descriptionlist}
\item[Train set] \marginnote{Train set}
Used to learn the model. Usually the largest split. Can be seen as an upper bound of the model performance.
\item[Test set] \marginnote{Test set}
Used to evaluate the trained model. Can be seen as a lower bound of the model performance.
\item[Validation set] \marginnote{Validation set}
Used to evaluate the model during training and/or for tuning parameters.
\end{descriptionlist}
It is assumed that the splits have similar characteristics.
\item[Overfitting] \marginnote{Overfitting}
Given a dataset $\matr{X}$, a model $\mathcal{M}$ is overfitting if
there exists another model $\mathcal{M}'$ such that:
\[
\begin{split}
\texttt{error}_\text{train}(\mathcal{M}) &< \texttt{error}_\text{train}(\mathcal{M}') \\
\texttt{error}_\matr{X}(\mathcal{M}) &> \texttt{error}_\matr{X}(\mathcal{M}') \\
\end{split}
\]
Possible causes of overfitting are:
\begin{itemize}
\item Noisy data.
\item Lack of representative instances.
\end{itemize}
\end{description}
\subsection{Test set error}
\textbf{\underline{Disclaimer: I'm very unsure about this part}}\\
The error on the test set can be seen as a lower bound error of the model.
If the test set error ratio is $x$, we can expect an error of $(x \pm \text{confidence interval})$.
Predicting the elements of the test set can be seen as a binomial process (i.e. a series of $N$ independent Bernoulli trials).
We can therefore compute the empirical frequency of success as $f = (\text{correct predictions}/N)$.
We want to estimate the probability of success $p$.
We assume that the deviation between the empirical frequency and the true frequency is due to a
normal noise around the true probability (i.e. the true probability $p$ is the mean).
Having fixed a significance level $\alpha$ (i.e. the probability of a wrong estimate),
we want that:
\[ \prob{ z_{\frac{\alpha}{2}} \leq \frac{f-p}{\sqrt{\frac{1}{N}p(1-p)}} \leq z_{(1-\frac{\alpha}{2})} } = 1 - \alpha \]
In other words, we want the middle term to lie, with probability $1-\alpha$,
between the $\frac{\alpha}{2}$ and $(1-\frac{\alpha}{2})$ quantiles of the Gaussian.
\begin{center}
\includegraphics[width=0.4\textwidth]{img/normal_quantile_test_error.png}
\end{center}
We can estimate $p$ using the Wilson score interval\footnote{\url{https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval}}:
\[ p = \frac{1}{1+\frac{1}{N}z^2} \left( f + \frac{1}{2N}z^2 \pm z\sqrt{\frac{1}{N}f(1-f) + \frac{z^2}{4N^2}} \right) \]
where $z$ depends on the value of $\alpha$.
For a pessimistic estimate, $\pm$ becomes a $+$. Vice versa, for an optimistic estimate, $\pm$ becomes a $-$.
Since $N$ appears in the denominators, the uncertainty becomes smaller for large values of $N$.
\begin{center}
\includegraphics[width=0.45\textwidth]{img/confidence_interval.png}
\end{center}
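A minimal sketch of this computation (the helper below is my own, assuming $f$ is the measured success frequency and $z$ the quantile corresponding to the chosen $\alpha$):
\begin{lstlisting}
from math import sqrt

def wilson_interval(f, n, z=1.96):
    # Wilson score interval for a success frequency f over n trials.
    # z = 1.96 corresponds to alpha = 0.05.
    center = (f + z**2 / (2 * n)) / (1 + z**2 / n)
    half = (z / (1 + z**2 / n)) * sqrt(f * (1 - f) / n + z**2 / (4 * n**2))
    return center - half, center + half

# 75% accuracy measured on test sets of different sizes:
print(wilson_interval(0.75, 100))    # wider interval
print(wilson_interval(0.75, 1000))   # narrower interval
\end{lstlisting}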
\subsection{Dataset splits}
\begin{description}
\item[Holdout] \marginnote{Holdout}
The dataset is split into train, test and, if needed, validation.
\item[Cross-validation] \marginnote{Cross-validation}
The training data is partitioned into $k$ chunks.
For $k$ iterations, one of the chunks is used to test and the others to train a new model.
The overall error is obtained as the average of the errors of the $k$ iterations.
In the end, the final model is still trained on the entire training data,
while cross-validation results are used as an evaluation and comparison metric.
Note that cross-validation is done on the training set, so a final test set can still be used to
evaluate the resulting model.
\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{img/cross_validation.png}
\caption{Cross-validation example}
\end{figure}
\item[Leave-one-out] \marginnote{Leave-one-out}
Extreme case of cross-validation with $k=N$, the size of the training set.
In this case, the whole dataset but one element is used for training and the remaining entry for testing.
\item[Bootstrap] \marginnote{Bootstrap}
Statistical sampling of the dataset with replacement (i.e. an entry can be selected multiple times).
The selected entries form the training set while the elements that have never been selected are used for testing.
\end{description}
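To make the cross-validation procedure concrete, here is a small index-splitting sketch (illustrative only; in practice a library implementation would be used):
\begin{lstlisting}
import random

def k_fold_indices(n, k, seed=0):
    # Sketch of k-fold cross-validation: yields (train, test) index lists.
    indices = list(range(n))
    random.Random(seed).shuffle(indices)
    folds = [indices[i::k] for i in range(k)]
    for i in range(k):
        test = folds[i]
        train = [idx for j, fold in enumerate(folds) if j != i for idx in fold]
        yield train, test

for train, test in k_fold_indices(n=10, k=5):
    pass  # train a model on `train`, evaluate it on `test`, then average the errors
\end{lstlisting}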
\subsection{Binary classification performance measures}
In binary classification, the two classes can be distinguished as the positive and negative labels.
The prediction of a classifier can be a:
\begin{center}
True positive ($TP$) $\cdot$ False positive ($FP$) $\cdot$ True negative ($TN$) $\cdot$ False negative ($FN$)
\end{center}
\begin{center}
\begin{tabular}{|c|c|c|c|}
\cline{3-4}
\multicolumn{2}{c|}{} & \multicolumn{2}{c|}{Predicted} \\
\cline{3-4}
\multicolumn{2}{c|}{} & Pos & Neg \\
\hline
\multirow{2}{*}{\rotatebox[origin=c]{90}{True}} & Pos & $TP$ & $FN$ \\
\cline{2-4}
& Neg & $FP$ & $TN$ \\
\hline
\end{tabular}
\end{center}
Given a test set of $N$ elements, possible metrics are:
\begin{descriptionlist}
\item[Accuracy] \marginnote{Accuracy}
Fraction of correct predictions.
\[ \text{accuracy} = \frac{TP + TN}{N} \]
\item[Error rate] \marginnote{Error rate}
Fraction of incorrect predictions.
\[ \text{error rate} = 1 - \text{accuracy} \]
\item[Precision] \marginnote{Precision}
Fraction of true positives among what the model classified as positive
(i.e. how many of the samples the model classified as positive are real positives).
\[ \text{precision} = \frac{TP}{TP + FP} \]
\item[Recall/Sensitivity] \marginnote{Recall}
Fraction of true positives among the real positives
(i.e. how many of the real positives the model correctly predicted).
\[ \text{recall} = \frac{TP}{TP + FN} \]
\item[Specificity] \marginnote{Specificity}
Fraction of true negatives among the real negatives
(i.e. recall for negative labels).
\[ \text{specificity} = \frac{TN}{TN + FP} \]
\item[F1 score] \marginnote{F1 score}
Harmonic mean of precision and recall
(i.e. measure of balance between precision and recall).
\[ \text{F1} = 2 \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}} \]
\end{descriptionlist}
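The following sketch (added for illustration) computes these measures from the four confusion counts:
\begin{lstlisting}
def binary_metrics(tp, fp, tn, fn):
    # Binary classification measures from the confusion counts.
    n = tp + fp + tn + fn
    accuracy = (tp + tn) / n
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)          # sensitivity
    specificity = tn / (tn + fp)
    f1 = 2 * precision * recall / (precision + recall)
    return accuracy, precision, recall, specificity, f1

print(binary_metrics(tp=40, fp=10, tn=45, fn=5))
\end{lstlisting}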
\subsection{Multi-class classification performance measures}
\begin{descriptionlist}
\item[Confusion matrix] \marginnote{Confusion matrix}
Matrix to correlate the predictions of $n$ classes:
\begin{center}
\begin{tabular}{|c|c|c|c|c|c|}
\cline{3-6}
\multicolumn{2}{c|}{} & \multicolumn{4}{c|}{Predicted} \\
\cline{3-6}
\multicolumn{2}{c|}{} & a & b & c & Total \\
\hline
\multirow{4}{*}{\rotatebox[origin=c]{90}{True}}
& a & $TP_a$ & $FP_{a-b}$ & $FP_{a-c}$ & $T_a$ \\
\cline{2-6}
& b & $FP_{b-a}$ & $TP_b$ & $FP_{b-c}$ & $T_b$ \\
\cline{2-6}
& c & $FP_{c-a}$ & $FP_{c-b}$ & $TP_c$ & $T_c$ \\
\cline{2-6}
& Total & $P_a$ & $P_b$ & $P_c$ & $N$ \\
\hline
\end{tabular}
\end{center}
where:
\begin{itemize}
\item $a$, $b$ and $c$ are the classes.
\item $T_x$ is the true number of labels of class $x$ in the dataset.
\item $P_x$ is the predicted number of labels of class $x$ in the dataset.
\item $TP_x$ is the number of times a class $x$ was correctly predicted (true predictions).
\item $FP_{i-j}$ is the number of times a class $i$ was predicted as $j$ (false predictions).
\end{itemize}
\item[Accuracy] \marginnote{Accuracy}
Accuracy is extended from the binary case as:
\[ \text{accuracy} = \frac{\sum_i TP_i}{N} \]
\item[Precision] \marginnote{Precision}
Precision is defined w.r.t. a single class:
\[ \text{precision}_i = \frac{TP_i}{P_i} \]
\item[Recall] \marginnote{Recall}
Recall is defined w.r.t. a single class:
\[ \text{recall}_i = \frac{TP_i}{T_i} \]
\end{descriptionlist}
If a single value of precision or recall is needed, the mean can be used by computing
a macro (unweighted) average or a class-weighted average.
\begin{description}
\item[$\kappa$-statistic] \marginnote{$\kappa$-statistic}
Evaluates the concordance between two classifiers (in our case, the predictor and the ground truth).
It is based on two probabilities:
\begin{descriptionlist}
\item[Probability of concordance] $\prob{c} = \frac{\sum_{i}^{\texttt{classes}} TP_i}{N}$
\item[Probability of random concordance] $\prob{r} = \frac{\sum_{i}^{\texttt{classes}} T_i P_i}{N^2}$
\end{descriptionlist}
$\kappa$-statistic is given by:
\[ \kappa = \frac{\prob{c} - \prob{r}}{1 - \prob{r}} \in [-1, 1] \]
When $\kappa = 1$, there is perfect agreement ($\sum_{i}^{\texttt{classes}} TP_i = N$),
when $\kappa = -1$, there is total disagreement ($\sum_{i}^{\texttt{classes}} TP_i = 0$) and
when $\kappa = 0$, the agreement is the one expected by chance.
\end{description}
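A small sketch of the $\kappa$-statistic computed from the ground truth and the predictions (the toy labels are made up for the example):
\begin{lstlisting}
def kappa_statistic(y_true, y_pred):
    # kappa = (p_c - p_r) / (1 - p_r)
    n = len(y_true)
    classes = set(y_true) | set(y_pred)
    p_c = sum(t == p for t, p in zip(y_true, y_pred)) / n                  # concordance
    p_r = sum(y_true.count(c) * y_pred.count(c) for c in classes) / n**2   # random concordance
    return (p_c - p_r) / (1 - p_r)

print(kappa_statistic(["a", "a", "b", "b", "c"], ["a", "b", "b", "b", "c"]))  # ~0.69
\end{lstlisting}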
\subsection{Probabilistic classifier performance measures}
\begin{description}
\item[Lift chart] \marginnote{Lift chart}
Used in binary classification.
Given the resulting probabilities of the positive class of a classifier,
sort them in decreasing order and plot a 2d-chart with
increasing sample size on the x-axis and the number of positive samples on the y-axis.
Then, plot a straight line to represent a baseline classifier that makes random choices.
As the probabilities are sorted in decreasing order, it is expected a high concentration of
positive labels on the right side.
When the area between the two curves is large and the curve is above the random classifier,
the model can be considered a good classifier.
\begin{figure}[h]
\centering
\includegraphics[width=0.5\textwidth]{img/lift_chart.png}
\caption{Example of lift chart}
\end{figure}
\item[ROC curve] \marginnote{ROC curve}
The ROC curve can be seen as a way to represent multiple confusion matrices of a classifier
that uses different thresholds.
The x-axis of a ROC curve represents the false positive rate while the y-axis represents the true positive rate.
A straight line is used to represent a random classifier.
A threshold can be considered good if it is high on the y-axis and low on the x-axis.
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/roc_curve.png}
\caption{Example of ROC curves}
\end{figure}
\end{description}
\subsection{Data imbalance}
A classifier may not perform well when predicting a minority class of the training data.
Possible solutions are:
\begin{descriptionlist}
\item[Undersampling] \marginnote{Undersampling}
Randomly reduce the number of examples of the majority classes.
\item[Oversampling] \marginnote{Oversampling}
Increase the examples of the minority classes.
\begin{description}
\item[Synthetic minority oversampling technique (SMOTE)] \marginnote{SMOTE}
\begin{enumerate}
\item Randomly select an example $x$ belonging to the minority class.
\item Select a random neighbor $z_i$ among its $k$-nearest neighbors $z_1, \dots, z_k$.
\item Synthesize a new example by selecting a random point of the feature space between $x$ and $z_i$.
\end{enumerate}
\end{description}
\item[Cost sensitive learning] \marginnote{Cost sensitive learning}
Assign a cost to the errors. Higher weights are assigned to minority classes.
This can be done by:
\begin{itemize}
\item Altering the proportions of the dataset by duplicating the samples of the costlier (minority) classes, so that misclassifying them weighs more.
\item Weighting the classes (possible in some algorithms).
\end{itemize}
\end{descriptionlist}
\section{Decision trees}
\subsection{Information theory} \label{sec:information_theory}
\begin{description}
\item[Shannon theorem] \marginnote{Shannon theorem}
Let $\matr{X} = \{ \vec{v}_1, \dots, \vec{v}_V \}$ be a data source where
each of the possible values has probability $p_i = \prob{\vec{v}_i}$.
The best encoding allows to transmit $\matr{X}$ with
an average number of bits given by the \textbf{entropy} of $X$: \marginnote{Entropy}
\[ H(\matr{X}) = - \sum_j p_j \log_2(p_j) \]
$H(\matr{X})$ can be seen as a weighted sum of the surprise factor $-\log_2(p_j)$.
If $p_j \sim 1$, then the surprise of observing $\vec{v}_j$ is low, vice versa,
if $p_j \sim 0$, the surprise of observing $\vec{v}_j$ is high.
Therefore, when $H(\matr{X})$ is high, $\matr{X}$ is close to a uniform distribution.
When $H(\matr{X})$ is low, $\matr{X}$ is close to a constant.
\begin{example}[Binary source] \phantom{}\\
\begin{minipage}{.50\linewidth}
The two values of a binary source $\matr{X}$ have respectively probability $p$ and $(1-p)$.
When $p \sim 0$ or $p \sim 1$, $H(\matr{X}) \sim 0$.\\
When $p \sim 0.5$, $H(\matr{X}) \sim \log_2(2)=1$
\end{minipage}
\begin{minipage}{.45\linewidth}
\centering
\includegraphics[width=\linewidth]{img/binary_entropy.png}
\end{minipage}
\end{example}
\item[Entropy threshold split] \marginnote{Entropy threshold split}
Given a dataset $\matr{D}$,
a real-valued attribute $d \in \matr{D}$,
a threshold $t$ in the domain of $d$ and
the class attribute $c$ of $\matr{D}$,
the entropy of the class $c$ of the dataset $\matr{D}$ split with threshold $t$ on $d$ is a weighted sum:
\[ H(c \,\vert\, d \,:\, t) = \prob{d < t}H(c \,\vert\, d < t) + \prob{d \geq t}H(c \,\vert\, d \geq t) \]
\item[Information gain] \marginnote{Information gain}
Information gain measures the reduction in entropy after applying a split.
It is computed as:
\[ IG(c \,\vert\, d \,:\, t) = H(c) - H(c \,\vert\, d \,:\, t) \]
When $H(c \,\vert\, d \,:\, t)$ is low, $IG(c \,\vert\, d \,:\, t)$ is high
as splitting with threshold $t$ results in purer groups.
Vice versa, when $H(c \,\vert\, d \,:\, t)$ is high, $IG(c \,\vert\, d \,:\, t)$ is low
as splitting with threshold $t$ is not very useful.
The information gain of a class $c$ split on a feature $d$ is given by:
\[ IG(c \,\vert\, d) = \max_t IG(c \,\vert\, d \,:\, t) \]
\end{description}
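To make the definitions concrete, here is a small sketch (not from the course material) that computes the entropy of a set of labels and the information gain of a threshold split:
\begin{lstlisting}
from math import log2
from collections import Counter

def entropy(labels):
    # H = - sum_j p_j log2(p_j) over the class frequencies.
    if not labels:
        return 0.0
    n = len(labels)
    return -sum((c / n) * log2(c / n) for c in Counter(labels).values())

def information_gain(values, labels, t):
    # IG of splitting the class labels on a real-valued attribute at threshold t.
    left = [y for x, y in zip(values, labels) if x < t]
    right = [y for x, y in zip(values, labels) if x >= t]
    n = len(labels)
    h_split = len(left) / n * entropy(left) + len(right) / n * entropy(right)
    return entropy(labels) - h_split

values = [1.0, 2.0, 3.0, 4.0]
labels = ["a", "a", "b", "b"]
print(information_gain(values, labels, t=2.5))  # 1.0: a perfectly pure split
\end{lstlisting}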
\subsection{Tree construction}
\begin{description}
\item[Decision tree (C4.5)] \marginnote{Decision tree}
Tree-shaped classifier where leaves are class predictions and
inner nodes represent conditions that guide to a leaf.
This type of classifier is non-linear (i.e. does not represent a linear separation).
Each node of the tree contains:
\begin{itemize}
\item The applied splitting criteria (i.e. feature and threshold).
Leaves do not have this value.
\item The purity (e.g. entropy) of the current split.
\item Dataset coverage of the current split.
\item Classes distribution.
\end{itemize}
\begin{figure}[h]
\centering
\includegraphics[width=0.5\textwidth]{img/_iris_decision_tree_example.pdf}
\caption{Example of decision tree}
\end{figure}
Note: the weighted sum of the entropies of the children is never larger than the entropy of the parent.
Possible stopping conditions are:
\begin{itemize}
\item When most of the leaves are pure (i.e. nothing useful to split).
\item When some leaves are impure but none of the possible splits have positive $IG$.
Impure leaves are labeled with the majority class.
\end{itemize}
\item[Purity] \marginnote{Purity}
Value to maximize when splitting a node of a decision tree.
Nodes with uniformly distributed classes have a low purity.
Nodes with a single class have the highest purity.
Possible impurity measures are:
\begin{descriptionlist}
\item[Entropy/Information gain] See \Cref{sec:information_theory}.
\item[Gini index] \marginnote{Gini index}
Let $\matr{X}$ be a dataset with classes $C$.
The Gini index measures how often an element of $\matr{X}$ would be misclassified
if the labels were randomly assigned based on the frequencies of the classes in $\matr{X}$.
Given a class $i \in C$, $p_i$ is the probability (i.e. frequency) of classifying an element with $i$ and
$(1 - p_i)$ is the probability of classifying it with a different label.
The Gini index is given by:
\[
\begin{split}
GINI(\matr{X}) = \sum_i^C p_i (1-p_i) &= \sum_i^C p_i - \sum_i^C p_i^2 \\
&= 1 - \sum_i^C p_i^2
\end{split}
\]
When $\matr{X}$ is uniformly distributed, $GINI(\matr{X}) \sim (1-\frac{1}{\vert C \vert})$.
When $\matr{X}$ is constant, $GINI(\matr{X}) \sim 0$.
Given a node $x$ split in $n$ children $x_1, \dots, x_n$,
the Gini gain of the split is given by:
\[ GINI_\text{gain} = GINI(x) - \sum_{i=1}^n \frac{\vert x_i \vert}{\vert x \vert} GINI(x_i) \]
\item[Misclassification error] \marginnote{Misclassification error}
Skipped.
\end{descriptionlist}
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/impurity_comparison.png}
\caption{Comparison of impurity measures}
\end{figure}
Compared to the Gini index, entropy is more robust to noise.
Misclassification error has a bias toward the majority class.
\end{description}
\begin{algorithm}[H]
\caption{Decision tree construction using information gain as impurity measure}
\begin{lstlisting}
def buildTree(split):
    node = Node()
    if len(split.classes) == 1: # Pure split
        node.label = split.classes[0]
        node.isLeaf = True
    else:
        ig, attribute, threshold = getMaxInformationGain(split)
        if ig <= 0: # No useful split: label with the majority class
            node.label = split.majorityClass()
            node.isLeaf = True
        else:
            node.left = buildTree(split[attribute < threshold])
            node.right = buildTree(split[attribute >= threshold])
    return node
\end{lstlisting}
\end{algorithm}
\begin{description}
\item[Pruning] \marginnote{Pruning}
Remove branches to reduce overfitting.
Different pruning techniques can be employed:
\begin{descriptionlist}
\item[Maximum depth]
Maximum depth allowed for the tree.
\item[Minimum samples for split]
Minimum number of samples a node is required to have to apply a split.
\item[Minimum samples for a leaf]
Minimum number of samples a node is required to have to become a leaf.
\item[Minimum impurity decrease]
Minimum decrease in impurity for a split to be made.
\item[Statistical pruning]
Prune the children of a node if the weighted sum of the maximum errors of the children is greater than
the maximum error of the node if it was a leaf.
\end{descriptionlist}
\end{description}
\subsection{Complexity}
Given a dataset $\matr{X}$ of $N$ instances and $D$ attributes,
each level of the tree requires to evaluate the whole dataset and
each node requires to process all the attributes.
Assuming an average height of $O(\log N)$,
the overall complexity for induction (parameters search) is $O(DN \log N)$.
Moreover, the other operations of a binary tree have complexity:
\begin{itemize}
\item Threshold search and binary split: $O(N \log N)$ (scan the dataset for the threshold).
\item Pruning: $O(N \log N)$ (requires to scan the dataset).
\end{itemize}
For inference, to classify a new instance it is sufficient to traverse the tree from the root to a leaf.
This has complexity $O(h)$, with $h$ the height of the tree.
\subsection{Characteristics}
\begin{itemize}
\item Decision trees are non-parametric in the sense that they do not require any assumption on the distribution of the data.
\item Finding the best tree is an NP-complete problem.
\item Decision trees are robust to noise if appropriate overfitting methods are applied.
\item Decision trees are robust to redundant attributes (correlated attributes are very unlikely to be chosen for multiple splits).
\item In practice, the impurity measure has a low impact on the final result, while the pruning strategy is more relevant.
\end{itemize}
\section{Naive Bayes}
\begin{description}
\item[Bayes' theorem]
Given a class $c$ and the evidence $\vec{e}$, we have that:
\[ \prob{c \mid \vec{e}} = \frac{\prob{\vec{e} \mid c} \prob{c}}{\prob{\vec{e}}} \]
\item[Naive Bayes classifier] \marginnote{Naive Bayes classifier}
Classifier that uses the Bayes' theorem assuming that the attributes are independent given the class.
Given a class $c$ and the evidence $\vec{e} = \langle e_1, e_2, \dots, e_n \rangle$, the probability that
the observation $\vec{e}$ is of class $c$ is given by:
\[
\prob{c \mid \vec{e}} = \frac{\prod_{i=1}^{n}\prob{e_i \mid c} \cdot \prob{c}}{\prob{\vec{e}}}
\]
As the denominator is the same for all classes, it can be omitted.
\end{description}
\subsection{Training and inference}
\begin{description}
\item[Training] \marginnote{Training}
Given the classes $C$ and the features $E$,
to train the classifier the following priors need to be estimated:
\begin{itemize}
\item $\forall c \in C:\, \prob{c}$
\item $\forall e_{ij} \in E, \forall c \in C:\, \prob{e_{ij} \mid c}$,
where $e_{ij}$ is the $j$-th value of the domain of the $i$-th feature $E_i$.
\end{itemize}
\item[Inference] \marginnote{Inference}
Given a new observation $\vec{x}_\text{new} = \langle x_1, x_2, \dots, x_n \rangle$,
its class is determined by computing the likelihood:
\[
c_\text{new} = \arg\max_{c \in C} \prob{c} \prod_{i=1}^{n}\prob{x_i \mid c}
\]
\end{description}
\subsection{Problems}
\begin{description}
\item[Smoothing]
If the value $e_{ij}$ of the domain of a feature $E_i$ never appears in the dataset,
its probability $\prob{e_{ij} \mid c}$ will be 0 for all classes.
This nullifies all the probabilities that use this feature when
computing the chain of products during inference.
Smoothing methods can be used to avoid this problem.
\begin{description}
\item[Laplace smoothing] \marginnote{Laplace smoothing}
Given:
\begin{descriptionlist}
\item[$\alpha$] The smoothing factor.
\item[\normalfont$\text{af}_{e_{ij}, c}$] The absolute frequency of the value $e_{ij}$ of the feature $E_i$ over the class $c$.
\item[$\vert \mathbb{D}_{E_i} \vert$] The number of distinct values in the domain of $E_i$.
\item[\normalfont$\text{af}_{c}$] The absolute frequency of the class $c$.
\end{descriptionlist}
The smoothed frequency is computed as:
\[
\prob{e_{ij} \mid c} = \frac{\text{af}_{e_{ij}, c} + \alpha}{\text{af}_{c} + \alpha \vert \mathbb{D}_{E_i} \vert}
\]
A common value of $\alpha$ is 1.
When $\alpha = 0$, there is no smoothing.
For higher values of $\alpha$, the smoothed feature gains more importance when computing the priors.
\end{description}
\item[Missing values] \marginnote{Missing values}
Naive Bayes is robust to missing values.
During training, the record is ignored in the frequency count of the missing feature.
During inference, the missing feature can be simply excluded in the computation of the likelihood
as this equally affects all classes.
\item[Numeric values] \marginnote{Gaussian assumption}
For continuous numeric values, the frequency count method cannot be used.
Therefore, an additional assumption is made: numeric values follow a Gaussian distribution.
During training, the mean $\mu_{i,c}$ and the variance $\sigma_{i,c}$ of a numeric feature $E_i$ are computed with respect to each class $c$.
Its probability is then obtained as:
\[ \prob{E_i = x \mid c} = \mathcal{N}(\mu_{i,c}, \sigma_{i,c})(x) \]
\end{description}
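Putting the pieces together, here is a minimal sketch of a Naive Bayes classifier for categorical features with Laplace smoothing (the dataset and the function names are invented for the example):
\begin{lstlisting}
from collections import Counter

def train_naive_bayes(X, y, alpha=1.0):
    # Naive Bayes for categorical features with Laplace smoothing (alpha).
    # X is a list of feature tuples, y the list of class labels.
    class_counts = Counter(y)
    n_features = len(X[0])
    domains = [{x[i] for x in X} for i in range(n_features)]
    # value_counts[c][i][v]: frequency of value v of feature i within class c.
    value_counts = {c: [Counter() for _ in range(n_features)] for c in class_counts}
    for x, c in zip(X, y):
        for i, v in enumerate(x):
            value_counts[c][i][v] += 1

    def predict(x):
        best_class, best_score = None, float("-inf")
        for c, n_c in class_counts.items():
            score = n_c / len(y)                        # P(c)
            for i, v in enumerate(x):                   # P(e_i | c), smoothed
                score *= (value_counts[c][i][v] + alpha) / (n_c + alpha * len(domains[i]))
            if score > best_score:
                best_class, best_score = c, score
        return best_class

    return predict

# Tiny invented dataset: (weather, temperature) -> play
X = [("sunny", "hot"), ("sunny", "mild"), ("rainy", "mild"), ("rainy", "cold")]
y = ["no", "no", "yes", "yes"]
predict = train_naive_bayes(X, y)
print(predict(("rainy", "hot")))  # "yes"
\end{lstlisting}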
\section{Perceptron}
\begin{description}
\item[Perceptron] \marginnote{Perceptron}
A single artificial neuron that takes $n$ inputs $x_1, \dots, x_n$ and a bias $b$,
and computes a linear combination of them with weights $w_1, \dots, w_n, w_b$.
\begin{figure}[h]
\centering
\includegraphics[width=0.25\textwidth]{img/_perceptron.pdf}
\caption{Example of perceptron}
\end{figure}
The learnt weights $w_b, w_1, \dots, w_n$ define a hyperplane for binary classification such that:
\[
w_1 x_1 + \text{\dots} + w_n x_n + w_b b = \begin{cases}
\texttt{positive} & \text{if $> 0$} \\
\texttt{negative} & \text{if $< 0$} \\
\end{cases}
\]
It can be shown that there are either no hyperplanes with this property or infinitely many.
\end{description}
\subsection{Training}
\begin{algorithm}
\caption{Perceptron training}
\begin{lstlisting}[mathescape=true]
def trainPerceptron(dataset):
    perceptron = Perceptron(weights=[0 $\dots$ 0])
    while accuracy(perceptron, dataset) != 1.0:
        for x, y in dataset:
            if perceptron.predict(x) != y:
                if y is positive_class:
                    perceptron.weights += x
                else:
                    perceptron.weights -= x
    return perceptron
\end{lstlisting}
\end{algorithm}
Note that the algorithm converges only if the dataset is linearly separable.
In practice, a maximum number of iterations is set.
\section{Support vector machine}
\begin{description}
\item[Convex hull]
The convex hull of a set of points is the tightest enclosing convex polygon that contains those points.
Note: the convex hulls of a linearly separable dataset do not intersect.
\item[Maximum margin hyperplane] \marginnote{Maximum margin hyperplane}
Hyperplane with the maximum margin between two convex hulls.
In general, a subset of points (support vectors) \marginnote{Support vectors}
in the training set is sufficient to define the hulls.
\begin{figure}[h]
\centering
\includegraphics[width=0.4\textwidth]{img/svm.png}
\caption{Maximum margin hyperplane of linearly separable data}
\end{figure}
\item[Support vector machine] \marginnote{Support vector machine}
SVM\footnote{\scriptsize\url{https://www.cs.princeton.edu/courses/archive/spring16/cos495/slides/AndrewNg_SVM_note.pdf}}
finds the maximum margin hyperplane and the support vectors as a constrained quadratic optimization problem.
Given a dataset of $D$ elements and $n$ features, the problem is defined as:
\[ \max_{w_0, w_1, \dots, w_n} M \]
\[
\begin{split}
\text{subject to } & \sum_{i=1}^{n} w_i^2 = 1 \\
& c_i(w_0 + w_1 x_{i1} + \dots + w_n x_{in}) \geq M \,\, \forall i = 1, \dots, D
\end{split}
\]
where $M$ is the margin, $w_i$ are the weights of the hyperplane and $c_i = \{-1, 1 \}$ is the class.
The second constraint requires each point to lie on the correct side of the hyperplane, at a distance of at least $M$.
For positive labels ($c_i=1$), this holds when the hyperplane function is positive.
For negative labels ($c_i=-1$), this holds when the hyperplane function is negative.
\begin{description}
\item[Soft margin] \marginnote{Soft margin}
As real-world data is not always linearly separable,
soft margin relaxes the margin constraint by adding a penalty $C$.
The margin constraint becomes:
\[ c_i(w_0 + w_1 x_{i1} + \dots + w_n x_{in}) \geq M - \xi_i \,\, \forall i = 1, \dots, D \]
\[ \text{where } \xi_i \geq 0 \text{ and } \sum_{i=1}^{D} \xi_i \leq C \]
\end{description}
\end{description}
\subsection{Kernel trick}\marginnote{Kernel trick}
For non-linearly separable data, the boundary can be found using a non-linear mapping
to map the data into a new space (feature space) where a linear separation is possible.
Then, the data and the boundary are mapped back into the original space.
\begin{figure}[h]
\begin{subfigure}{0.49\textwidth}
\centering
\includegraphics[width=\linewidth]{img/svm_kernel_example1.png}
\end{subfigure}
\begin{subfigure}{0.49\textwidth}
\centering
\includegraphics[width=\linewidth]{img/svm_kernel_example2.png}
\end{subfigure}
\caption{Example of mapping from $\mathbb{R}^2$ to $\mathbb{R}^3$}
\end{figure}
The kernel trick makes it possible to avoid explicitly mapping the dataset into the new space by using kernel functions.
Known kernel functions are:
\begin{descriptionlist}
\item[Linear] $K(x, y) = \langle x, y \rangle$.
\item[Polynomial] $K(x, y) = (\gamma \langle x, y \rangle + r)^d$, where $\gamma$, $r$ and $d$ are parameters.
\item[Radial basis function] $K(x, y) = \exp(-\gamma \Vert x - y \Vert^2)$, where $\gamma$ is a parameter.
\item[Sigmoid] $K(x, y) = \tanh(\langle x, y \rangle + r)$, where $r$ is a parameter.
\end{descriptionlist}
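The kernel functions above are straightforward to write down; a small sketch for illustration (the default parameter values are arbitrary):
\begin{lstlisting}
from math import exp, tanh

def linear_kernel(x, y):
    return sum(a * b for a, b in zip(x, y))

def polynomial_kernel(x, y, gamma=1.0, r=1.0, d=3):
    return (gamma * linear_kernel(x, y) + r) ** d

def rbf_kernel(x, y, gamma=0.5):
    return exp(-gamma * sum((a - b) ** 2 for a, b in zip(x, y)))

def sigmoid_kernel(x, y, r=0.0):
    return tanh(linear_kernel(x, y) + r)

print(rbf_kernel((0.0, 1.0), (1.0, 1.0)))  # ~0.61
\end{lstlisting}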
\subsection{Complexity}
Given a dataset with $D$ entries of $n$ features, the complexity of SVM scales from $O(nD^2)$ to $O(nD^3)$
depending on the effectiveness of data caching.
\subsection{Characteristics}
\begin{itemize}
\item Training an SVM model is generally slower.
\item SVM is not affected by local minima.
\item SVM does not suffer the curse of dimensionality.
\item SVM does not directly provide probability estimates.
If needed, these can be computed using a computationally expensive method.
\end{itemize}
\section{Neural networks}
\begin{description}
\item[Multilayer perceptron] \marginnote{Multilayer perceptron}
Hierarchical structure of perceptrons, each with an activation function.
\item[Activation function] \marginnote{Activation function}
Activation functions are useful to add non-linearity.
\begin{remark}
In a linear system, if there is noise in the input, it is transferred to the output
(i.e. linearity implies that $f(x + \text{noise}) = f(x) + f(\text{noise})$).
On the other hand, a non-linear system is generally more robust
(i.e. non-linearity generally implies that $f(x + \text{noise}) \neq f(x) + f(\text{noise})$)
\end{remark}
\item[Feedforward neural network] \marginnote{Feedforward neural network}
Network with the following flow:
\[ \text{Input layer} \rightarrow \text{Hidden layer} \rightarrow \text{Output layer} \]
Neurons at each layer are connected to all neurons of the next layer.
\end{description}
\subsection{Training}
Inputs are fed to the network and backpropagation is used to update the weights.
\begin{description}
\item[Learning rate] \marginnote{Learning rate}
Size of the step for gradient descent.
\item[Epoch] \marginnote{Epoch}
A round of training where the entire dataset is processed.
\item[Stopping criteria] \marginnote{Stopping criteria}
Possible conditions to stop the training are:
\begin{itemize}
\item Small weights update.
\item The classification error goes below a predefined target.
\item Timeout or maximum number of epochs.
\end{itemize}
\item[Regularization] \marginnote{Regularization}
Smoothing of the loss function.
\end{description}
\section{K-nearest neighbors}
\begin{description}
\item[K-nearest neighbors] \marginnote{K-nearest neighbors}
Given a similarity metric and a training set,
to predict a new observation, the $k$ most similar entries in the training set are selected
and the class of the new data is determined as the most frequent class among the $k$ entries.
\end{description}
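A minimal sketch of the procedure, assuming Euclidean distance as the similarity metric and a toy training set invented for the example:
\begin{lstlisting}
from collections import Counter
from math import dist   # Euclidean distance (Python >= 3.8)

def knn_predict(train_X, train_y, x_new, k=3):
    # k-nearest-neighbors classification with Euclidean distance.
    neighbors = sorted(zip(train_X, train_y), key=lambda pair: dist(pair[0], x_new))[:k]
    votes = Counter(label for _, label in neighbors)
    return votes.most_common(1)[0][0]   # majority vote among the k neighbors

train_X = [(0, 0), (0, 1), (5, 5), (6, 5)]
train_y = ["a", "a", "b", "b"]
print(knn_predict(train_X, train_y, (1, 1)))  # "a"
\end{lstlisting}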
\section{Binary to multi-class classification}
\begin{description}
\item[One-vs-one strategy (OVO)] \marginnote{One-vs-one strategy (OVO)}
Train a classifier for all the possible pairs of classes (this will result in $\frac{C \cdot (C-1)}{2}$ pairs).
The class assigned to a new observation is determined through a majority vote.
\item[One-vs-rest strategy (OVR)] \marginnote{One-vs-rest strategy (OVR)}
Train $C$ classifiers where each is specialized to classify a specific class as positive and the others as negative.
The class assigned to a new observation is determined by the confidence score of each classifier.
\end{description}
\section{Ensemble methods}
\marginnote{Ensemble methods}
Train a set of base classifiers and make predictions by majority vote.
If all the base classifiers have the same error rate and make independent errors,
the overall error of the ensemble model is lower, provided that the base error rate is below $0.5$ (it can be derived from a binomial distribution).
\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{img/ensemble_error.png}
\caption{Relationship between the error of base classifiers and ensemble models}
\end{figure}
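As a sketch of this binomial argument (assuming, for illustration, 25 base classifiers each with error rate 0.35 and independent errors):
\begin{lstlisting}
from math import comb

def ensemble_error(n, eps):
    # Probability that the majority of n independent base classifiers
    # (each with error rate eps) is wrong, i.e. that the ensemble errs.
    return sum(comb(n, i) * eps**i * (1 - eps)**(n - i)
               for i in range(n // 2 + 1, n + 1))

print(ensemble_error(25, 0.35))  # ~0.06, much lower than the base error 0.35
\end{lstlisting}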
Different strategies to train an ensemble classifier can be used:
\begin{descriptionlist}
\item[Dataset manipulation] Resampling the dataset for each base classifier:
\begin{description}
\item[Bagging]
Sample with replacement with a uniform distribution.
\item[Boosting]
Iteratively change the distribution of the training data
prioritizing examples difficult to classify.
\begin{description}
\item[Adaboost] \marginnote{Adaboost}
Iteratively train base classifiers on a dataset where samples
misclassified at the previous iteration have a higher weight.
\end{description}
\end{description}
\item[Feature manipulation]
Train a base classifier using only a subset of the features.
\item[Class labels manipulation]
Train a base classifier to classify a partition of the class labels.
For instance, class labels can be partitioned into two groups $A_1$ and $A_2$, and
the base classifier is trained to assign as label one of the two groups.
During inference, when a group is predicted, all labels within that group receive a vote.
\end{descriptionlist}
\subsection{Random forests}
\marginnote{Random forests}
Multiple decision trees trained on a different random sampling of the training set and different subsets of features.
A prediction is made by averaging the output of each tree.
\begin{description}
\item[Bias] \marginnote{Bias}
Simplicity of the target function of a model.
\item[Variance] \marginnote{Variance}
Amount of change of the target function when using different training data (i.e. how much the model overfits).
\end{description}
Random forests aim to reduce the high variance of decision trees.

View File

@ -0,0 +1,554 @@
\chapter{Clustering}
\section{Similarity and dissimilarity}
\begin{description}
\item[Similarity] \marginnote{Similarity}
Measures how alike two objects are.
Often defined in the range $[0, 1]$.
\item[Dissimilarity] \marginnote{Dissimilarity}
Measures how two objects differ.
0 indicates no difference while the upper bound varies.
\end{description}
\begin{table}[ht]
\centering
\renewcommand{\arraystretch}{2}
\begin{tabular}{c | c | c}
\textbf{Attribute type} & \textbf{Dissimilarity} & \textbf{Similarity} \\
\hline
Nominal & $d(p, q) = \begin{cases} 0 & \text{if } p=q \\ 1 & \text{if } p \neq q \end{cases}$ & $s(p, q) = 1 - d(p, q)$ \\
\hline
Ordinal & $d(p, q) = \frac{\vert p - q \vert}{V}$ with $p, q \in \{ 0, \dots, V \}$ & $s(p, q) = 1 - d(p, q)$ \\
\hline
Interval or ratio & $d(p, q) = \vert p - q \vert$ & $s(p, q) = \frac{1}{1 + d(p, q)}$
\end{tabular}
\caption{Similarity and dissimilarity by attribute type}
\end{table}
\begin{description}
\item[Similarity properties] \phantom{}
\begin{enumerate}
\item $\texttt{sim}(p, q) = 1$ iff $p = q$.
\item $\texttt{sim}(p, q) = \texttt{sim}(q, p)$.
\end{enumerate}
\end{description}
\subsection{Distance}
Given two $D$-dimensional data entries $p$ and $q$, possible distance metrics are:
\begin{descriptionlist}
\item[Minkowski distance ($L_r$)] \marginnote{Minkowski distance}
\[ \texttt{dist}(p, q) = \left( \sum_{d=1}^{D} \vert p_d - q_d \vert^r \right)^{\frac{1}{r}} \]
where $r$ is a parameter.
Common values for $r$ are:
\begin{descriptionlist}
\item[$r = 1$]
Corresponds to the $L_1$ norm.
It is useful for discriminating between zero and near-zero distances, as
an $\varepsilon$ change in the data corresponds to an $\varepsilon$ change in the distance.
\item[$r = 2$]
Corresponds to the Euclidean distance or $L_2$ norm.
\item[$r = \infty$]
Corresponds to the $L_\infty$ norm.
Considers only the dimensions with the maximum difference.
\end{descriptionlist}
\item[Mahalanobis distance] \marginnote{Mahalanobis distance}
\[ \texttt{dist}(p, q) = \sqrt{ (p-q) \matr{\Sigma}^{-1} (p-q)^T } \]
where $\matr{\Sigma}$ is the covariance matrix of the dataset.
The Mahalanobis distance of $p$ and $q$ increases when the segment connecting them
points towards a direction of greater variation of the data.
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/mahalanobis.png}
\caption{The Mahalanobis distance between $(A, B)$ is greater than $(A, C)$, while the Euclidean distance is the same.}
\end{figure}
\end{descriptionlist}
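A small sketch of the Minkowski distance for the common values of $r$ (illustrative code, not part of the original notes):
\begin{lstlisting}
def minkowski(p, q, r):
    # L_r distance; r=1 is the Manhattan distance, r=2 the Euclidean distance.
    return sum(abs(a - b) ** r for a, b in zip(p, q)) ** (1 / r)

def chebyshev(p, q):
    # Limit case r -> infinity: only the largest coordinate difference matters.
    return max(abs(a - b) for a, b in zip(p, q))

p, q = (0.0, 0.0), (3.0, 4.0)
print(minkowski(p, q, 1), minkowski(p, q, 2), chebyshev(p, q))  # 7.0 5.0 4.0
\end{lstlisting}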
\subsubsection{Distance properties}
\begin{descriptionlist}
\item[Positive definiteness]
$\texttt{dist}(p, q) \geq 0$ and $\texttt{dist}(p, q) = 0$ iff $p = q$.
\item[Symmetry]
$\texttt{dist}(p, q) = \texttt{dist}(q, p)$
\item[Triangle inequality]
$\texttt{dist}(p, q) \leq \texttt{dist}(p, r) + \texttt{dist}(r, q)$
\end{descriptionlist}
\subsection{Vector similarity}
\begin{description}
\item[Binary vectors]
Given two examples $p$ and $q$ with binary features, we can compute the following values:
\[
\begin{split}
M_{00} &= \text{ number of features that equal 0 for both $p$ and $q$} \\
M_{01} &= \text{ number of features that equal 0 for $p$ and 1 for $q$} \\
M_{10} &= \text{ number of features that equal 1 for $p$ and 0 for $q$} \\
M_{11} &= \text{ number of features that equal 1 for both $p$ and $q$}
\end{split}
\]
Possible distance metrics are:
\begin{descriptionlist}
\item[Simple matching coefficient] \marginnote{Simple matching coefficient}
$\texttt{SMC}(p, q) = \frac{M_{00} + M_{11}}{M_{00} + M_{01} + M_{10} + M_{11}}$
\item[Jaccard coefficient] \marginnote{Jaccard coefficient}
$\texttt{JC}(p, q) = \frac{M_{11}}{M_{01} + M_{10} + M_{11}}$
\end{descriptionlist}
\item[Cosine similarity] \marginnote{Cosine similarity}
Cosine of the angle between two vectors:
\[ \texttt{cos}(p, q) = \frac{p \cdot q}{\Vert p \Vert \cdot \Vert q \Vert} \]
\item[Extended Jaccard coefficient (Tanimoto)] \marginnote{Extended Jaccard coefficient (Tanimoto)}
Variation of the Jaccard coefficient for continuous values:
\[ \texttt{T}(p, q) = \frac{p \cdot q}{\Vert p \Vert^2 + \Vert q \Vert^2 - p \cdot q} \]
\end{description}
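A quick sketch of the binary-vector coefficients (the toy vectors are chosen for the example):
\begin{lstlisting}
def binary_similarities(p, q):
    # Simple matching coefficient and Jaccard coefficient for binary vectors.
    m00 = sum(a == 0 and b == 0 for a, b in zip(p, q))
    m01 = sum(a == 0 and b == 1 for a, b in zip(p, q))
    m10 = sum(a == 1 and b == 0 for a, b in zip(p, q))
    m11 = sum(a == 1 and b == 1 for a, b in zip(p, q))
    smc = (m00 + m11) / (m00 + m01 + m10 + m11)
    jaccard = m11 / (m01 + m10 + m11)
    return smc, jaccard

print(binary_similarities([1, 0, 0, 0, 1], [1, 1, 0, 0, 0]))  # (0.6, 0.33...)
\end{lstlisting}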
\subsection{Correlation}
\begin{description}
\item[Pearson's correlation] \marginnote{Pearson's correlation}
Measure of linear relationship between a pair of quantitative attributes $e_1$ and $e_2$.
To compute Pearson's correlation, the values of $e_1$ and $e_2$ are first standardized and collected into the vectors $\vec{e}_1$ and $\vec{e}_2$.
Given $n$ observations, the correlation is then the normalized dot product between $\vec{e}_1$ and $\vec{e}_2$:
\[ \texttt{corr}(e_1, e_2) = \frac{1}{n-1} \langle \vec{e}_1, \vec{e}_2 \rangle \]
Pearson's correlation has the following properties:
\begin{itemize}
\item If the variables are independent, then the correlation is 0 (but not vice versa).
\item If the correlation is 0, then there is no linear relationship between the variables.
\item $+1$ implies positive linear relationship, $-1$ implies negative linear relationship.
\end{itemize}
\item[Symmetric uncertainty] \marginnote{Symmetric uncertainty}
Measure of correlation for nominal attributes:
\[ U(e_1, e_2) = 2 \frac{H(e_1) + H(e_2) - H(e_1, e_2)}{H(e_1) + H(e_2)} \in [0, 1] \]
where $H$ is the entropy.
\end{description}
\section{Clustering definitions}
\begin{description}
\item[Clustering] \marginnote{Clustering}
Given a set of $D$-dimensional objects $\vec{x}_i$,
we want to partition them into $K$ clusters (and potentially recognize outliers).
In other words, we are looking for a mapping:
\[ \texttt{cluster}(\vec{x}_i) \in \{ 1, \dots, K \} \]
such that objects in the same cluster are similar.
\item[Centroid] \marginnote{Centroid}
Average of the coordinates of the points in a cluster.
For a cluster $K_i$, the $d$-th coordinate of its centroid is given by:
\[
\texttt{centroid}(K_i)\texttt{[$d$]}
= \frac{1}{\vert K_i \vert}
\sum_{\vec{x} \in K_i} \vec{x}\texttt{[$d$]}
\]
\item[Medoid] \marginnote{Medoid}
Element of the cluster with minimum average dissimilarity to all other points.
Differently from the centroid, the medoid must be an existing point of the dataset.
	\item[Proximity functions] \marginnote{Proximity function}
		Measures to determine the similarity of two data points,
		e.g. the Euclidean distance or any of the distance and similarity measures defined above.
\end{description}
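The centroid and medoid of a cluster can be computed, for instance, as follows (a \texttt{numpy} sketch with a toy cluster):
\begin{verbatim}
import numpy as np

cluster = np.array([[1.0, 2.0], [2.0, 1.5], [1.5, 1.0], [8.0, 8.0]])

# Centroid: coordinate-wise average of the points in the cluster.
centroid = cluster.mean(axis=0)

# Medoid: the existing point with minimum average distance to the others.
dist_matrix = np.linalg.norm(cluster[:, None, :] - cluster[None, :, :], axis=-1)
medoid = cluster[dist_matrix.mean(axis=1).argmin()]
print(centroid, medoid)
\end{verbatim}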
\section{Metrics}
\begin{description}
\item[Cohesion] \marginnote{Cohesion}
Measures the similarity (proximity) of the objects in the same cluster.
Given a cluster $K_i$, cohesion is computed as:
\[ \texttt{cohesion}(K_i) = \sum_{\vec{x} \in K_i} \texttt{dist}(\vec{x}, \vec{c}_i) \]
where $\vec{c}_i$ can be the centroid or medoid
and \texttt{dist} is a proximity function.
\item[Separation] \marginnote{Separation}
Measures the distance of two clusters.
Given two clusters $K_i$ and $K_j$, their separation is:
\[ \texttt{separation}(K_i, K_j) = \texttt{dist}(\vec{c}_i, \vec{c}_j) \]
where $\vec{c}_i$ and $\vec{c}_j$ are respectively the centroids of $K_i$ and $K_j$, and \texttt{dist} is a proximity function.
\item[Sum of squared errors] \marginnote{Sum of squared errors}
		Measures, for each cluster, the sum of squared distances between its points and its centroid.
Can be seen as the application of distortion (\Cref{desc:distortion}) to clustering:
\[ \texttt{SSE}_j = \sum_{\vec{x}_i \in K_j} \texttt{dist}(\vec{x}_i, \vec{c}_j)^2 \]
where $K_j$ is the $j$-th cluster and $\vec{c}_j$ is its centroid.
If $\texttt{SSE}_j$ is high, the cluster has low quality.
If $\texttt{SSE}_j = 0$, all points in the cluster correspond to the centroid.
The sum of squared errors of $K$ clusters is:
\[ \texttt{SSE} = \sum_{j=1}^{K} \texttt{SSE}_j \]
\item[Sum of squares between clusters] \marginnote{Sum of squares between clusters}
Given the global centroid of the dataset $\vec{c}$ and
$K$ clusters each with $N_i$ objects,
the sum of squares between clusters is given by:
\[ \texttt{SSB} = \sum_{i=1}^{K} N_i \cdot \texttt{dist}(\vec{c}_i, \vec{c})^2 \]
\item[Total sum of squares] \marginnote{Total sum of squares}
Sum of the squared distances between the points of the dataset and the global centroid.
It can be shown that the total sum of squares can be computed as:
\[ \texttt{TSS} = \texttt{SSE} + \texttt{SSB} \]
\begin{theorem}
			Minimize \texttt{SSE} $\iff$ maximize \texttt{SSB} (since \texttt{TSS} is constant for a given dataset).
\end{theorem}
\item[Silhouette score] \marginnote{Silhouette score}
The Silhouette score of a data point $\vec{x}_i$ belonging to a cluster $K_i$ is given by two components:
\begin{description}
\item[Sparsity contribution]
The average distance of $\vec{x}_i$ to the other points in $K_i$:
\[ a(\vec{x}_i) = \frac{1}{\vert K_i \vert - 1} \sum_{\vec{x}_j \in K_i, \vec{x}_j \neq \vec{x}_i} \texttt{dist}(\vec{x}_i, \vec{x}_j) \]
\item[Separation contribution]
The average distance of $\vec{x}_i$ to the points in the nearest cluster:
\[ b(\vec{x}_i) = \min_{K_j, K_j \neq K_i} \left( \frac{1}{\vert K_j \vert} \sum_{\vec{w} \in K_j} \texttt{dist}(\vec{x}_i, \vec{w}) \right) \]
\end{description}
The Silhouette score of $\vec{x}_i$ is then computed as:
\[ s(\vec{x}_i) = \frac{b(\vec{x}_i) - a(\vec{x}_i)}{\max\{ a(\vec{x}_i), b(\vec{x}_i) \}} \in [-1, 1] \]
		The Silhouette score $\mathcal{S}$ of $K$ clusters is given by the average of the Silhouette scores of the data points.
$\mathcal{S} \rightarrow 1$ indicates correct clusters, $\mathcal{S} \rightarrow -1$ indicates incorrect clusters.
	\item[Gold standard] \marginnote{Gold standard}
Evaluation using a labeled dataset.
Consider the elements of the same cluster as labeled with the same class.
\begin{description}
\item[Classification-oriented]
Traditional classification metrics such as accuracy, recall, precision, \dots
\item[Similarity-oriented]
			Given a learnt clustering scheme $y_K(\cdot)$ and the gold standard scheme $y_G(\cdot)$ where
$y_i(\vec{x})$ indicates the label/cluster of $\vec{x}$, each pair of data $(\vec{x}_1, \vec{x}_2)$ can be labeled with:
\begin{descriptionlist}
\item[\texttt{SGSK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
\item[\texttt{SGDK}] if $y_G(\vec{x}_1) = y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
\item[\texttt{DGSK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) = y_K(\vec{x}_2)$.
\item[\texttt{DGDK}] if $y_G(\vec{x}_1) \neq y_G(\vec{x}_2)$ and $y_K(\vec{x}_1) \neq y_K(\vec{x}_2)$.
\end{descriptionlist}
Then, the following metrics can be computed:
\begin{descriptionlist}
\item[Rand score] $\frac{\texttt{SGSK} + \texttt{DGDK}}{\texttt{SGSK} + \texttt{SGDK} + \texttt{DGSK} + \texttt{DGDK}}$
				\item[Adjusted Rand score] Modification of the Rand score to account for agreements that may happen by chance.
\item[Jaccard coefficient] For each class $c$, the Jaccard coefficient is given by:
\[ \frac{\texttt{SG$_c$SK$_c$}}{\texttt{SG$_c$SK$_c$} + \texttt{SG$_c$DK$_c$} + \texttt{DG$_c$SK$_c$}} \]
\end{descriptionlist}
\end{description}
\end{description}
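A sketch of how some of these metrics can be computed (assuming \texttt{numpy} and \texttt{scikit-learn}; the two-blob dataset is synthetic):
\begin{verbatim}
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.vstack([np.random.randn(50, 2) + [0, 0],
               np.random.randn(50, 2) + [6, 6]])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)

global_centroid = X.mean(axis=0)
sse, ssb = 0.0, 0.0
for k in np.unique(labels):
    points = X[labels == k]
    centroid = points.mean(axis=0)
    sse += np.sum(np.linalg.norm(points - centroid, axis=1) ** 2)
    ssb += len(points) * np.linalg.norm(centroid - global_centroid) ** 2

tss = np.sum(np.linalg.norm(X - global_centroid, axis=1) ** 2)
print(sse + ssb, tss)                  # TSS = SSE + SSB
print(silhouette_score(X, labels))     # average Silhouette score
\end{verbatim}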
\section{K-means}
\begin{description}
\item[Algorithm] \marginnote{K-means}
Clustering algorithm that iteratively improves the centroids.
Given the desired number of clusters $K$, the algorithm works as follows:
\begin{enumerate}
\item Randomly choose $K$ initial centroids.
\item Each data point belongs to the cluster represented by the nearest centroid.
\item Update the centroids as the centroids of the newly found clusters. Go to 2.
\end{enumerate}
\item[Distortion] \label{desc:distortion} \marginnote{Distortion}
Given:
\begin{itemize}
\item a $D$-dimensional dataset of $N$ points $\vec{x}_i$;
\item an encoding function $\texttt{encode}: \mathbb{R}^D \rightarrow [1, K]$;
\item a decoding function $\texttt{decode}: [1, K] \rightarrow \mathbb{R}^D$.
\end{itemize}
Distortion (or inertia) is defined as:
		\[ \texttt{distortion} = \sum_{i=1}^{N} \big\Vert \vec{x}_i - \texttt{decode}(\texttt{encode}(\vec{x}_i)) \big\Vert^2 \]
\begin{theorem}
To minimize the distortion, it is required that:
\begin{enumerate}
\item $\vec{x}_i$ is encoded with its nearest center.
\item The center of a point is the centroid of the cluster it belongs to.
\end{enumerate}
Note that k-means alternates points 1 and 2.
\begin{proof}
				The second point follows by setting the derivative of \texttt{distortion} with respect to each center $\vec{c}_j$ to 0:
				\[ \frac{\partial}{\partial \vec{c}_j} \sum_{\vec{x}_i \in K_j} \Vert \vec{x}_i - \vec{c}_j \Vert^2 = -2 \sum_{\vec{x}_i \in K_j} (\vec{x}_i - \vec{c}_j) = 0 \iff \vec{c}_j = \frac{1}{\vert K_j \vert} \sum_{\vec{x}_i \in K_j} \vec{x}_i \]
\end{proof}
\end{theorem}
\item[Elbow method]
		Inertia decreases monotonically as $K$ increases and can be used to determine a suitable number of clusters.
		By computing the inertia for varying $K$, a plausible value is the one around which the curve flattens (the ``elbow'').
\begin{figure}[H]
\centering
\includegraphics[width=0.4\textwidth]{img/elbow_method.png}
\caption{Plot of inertia. Possibly good values for $K$ are around 3}
\end{figure}
The Silhouette score can also be used by selecting the $K$ corresponding to its maximum.
Note that, compared to inertia, Silhouette is computationally more expensive.
\item[Properties] \phantom{}
\begin{description}
\item[Termination]
There are a finite number of ways to cluster $N$ objects into $K$ clusters.
By construction, at each iteration, the \texttt{distortion} is reduced.
Therefore, k-means is guaranteed to terminate.
\item[Non-optimality]
			The solution found by k-means is not guaranteed to be a global optimum.
			The choice of the starting points heavily influences the final result.
			The starting configuration is usually chosen as a set of points as far apart from each other as possible.
\item[Noise]
Outliers heavily influence the clustering result. Sometimes, it is useful to remove them.
\item[Complexity]
Given a $D$-dimensional dataset of $N$ points,
running k-means for $T$ iterations to find $K$ clusters has complexity $O(TKND)$.
\end{description}
\end{description}
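A minimal \texttt{numpy} sketch of the k-means loop above, together with the distortion values used by the elbow method (synthetic data; no attempt at a smart initialization):
\begin{verbatim}
import numpy as np

def kmeans(X, K, iters=100, seed=0):
    rng = np.random.default_rng(seed)
    centroids = X[rng.choice(len(X), size=K, replace=False)]   # 1. random init
    for _ in range(iters):
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=-1)
        labels = dists.argmin(axis=1)                           # 2. nearest centroid
        new_centroids = centroids.copy()
        for k in range(K):                                      # 3. recompute centroids
            members = X[labels == k]
            if len(members) > 0:
                new_centroids[k] = members.mean(axis=0)
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    distortion = np.sum((X - centroids[labels]) ** 2)
    return labels, centroids, distortion

X = np.vstack([np.random.randn(100, 2) + c for c in ([0, 0], [5, 5], [0, 5])])
for K in range(1, 7):     # elbow method: inspect how distortion decreases with K
    print(K, round(kmeans(X, K)[2], 1))
\end{verbatim}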
\section{Hierarchical clustering}
\begin{description}
\item[Dendrogram] \marginnote{Dendrogram}
Tree-like structure where the root is a cluster of all the data points and
the leaves are clusters with a single data point.
\item[Agglomerative] \marginnote{Agglomerative}
Starts with a cluster per data point and iteratively merges them (leaves to root).
Uses cluster separation metrics.
\item[Divisive] \marginnote{Divisive}
Starts with a cluster containing all the data points and iteratively splits them (root to leaves).
Uses cluster cohesion metrics.
\item[Cluster separation measures]
Measure the distance between two clusters $K_i$ and $K_j$.
\begin{descriptionlist}
\item[Single link] \marginnote{Single link}
Minimum distance of the points in the two clusters:
\[ \texttt{sep}(K_i, K_j) = \min_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
Tends to create larger clusters.
\item[Complete link] \marginnote{Complete link}
Maximum distance of the points in the two clusters:
\[ \texttt{sep}(K_i, K_j) = \max_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
Tends to create more compact clusters.
\item[Average link] \marginnote{Average link}
Average distance of the points in the two clusters:
\[ \texttt{sep}(K_i, K_j) = \frac{1}{\vert K_i \vert \cdot \vert K_j \vert} \sum_{\vec{x} \in K_i, \vec{y} \in K_j} \texttt{dist}(\vec{x}, \vec{y}) \]
\item[Centroid-based] \marginnote{Centroid-based}
Distance between the centroids of the two clusters.
\item[Ward's method] \marginnote{Ward's method}
Let $K_m$ be the cluster obtained by merging $K_i$ and $K_j$.
The distance between $K_i$ and $K_j$ is determined as:
\[ \texttt{sep}(K_i, K_j) = \texttt{SSE}(K_m) - \big( \texttt{SSE}(K_i) + \texttt{SSE}(K_j) \big) \]
\end{descriptionlist}
\end{description}
\subsection{Agglomerative clustering}
\begin{description}
\item[Algorithm] \marginnote{Agglomerative clustering} \phantom{}
\begin{enumerate}
\item Initialize a cluster for each data point.
\item Compute the distance matrix between each cluster.
\item Merge the two clusters with the lowest separation,
drop their values from the distance matrix and add a row/column for the newly created cluster.
\item Go to point 2. if the number of clusters is greater than one.
\end{enumerate}
After the construction of the dendrogram, a cut \marginnote{Cut} can be performed at a user-defined level.
A cut near the root will result in few bigger clusters.
A cut near the leaves will result in numerous smaller clusters.
\item[Properties] \phantom{}
\begin{description}
\item[Complexity]
Space complexity of $O(N^2)$ to store the distance matrix.
Time complexity of $O(N^3)$ ($O(N)$ iterations with a $O(N^2)$ search for the pair to merge and $O(N)$ to recompute the distance matrix)
that can be reduced to $O(N^2\log(N))$ when using indexing.
\end{description}
\end{description}
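A short sketch using \texttt{scipy}, whose linkage methods correspond to the separation measures above (synthetic data; the cut level is arbitrary):
\begin{verbatim}
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

X = np.vstack([np.random.randn(30, 2), np.random.randn(30, 2) + 5])

for method in ("single", "complete", "average", "ward"):
    Z = linkage(X, method=method)                      # N-1 merge steps (dendrogram)
    labels = fcluster(Z, t=2, criterion="maxclust")    # cut to obtain 2 clusters
    print(method, np.bincount(labels)[1:])
\end{verbatim}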
\section{Density-based clustering}
Consider as clusters the high-density areas of the data space.
\begin{description}
\item[Grid-based]
Split the data space into a grid and count the number of points in each tile.
\item[Object-centered]
Count, for each point, the number of neighbors within a radius.
\end{description}
\subsection{DBSCAN}
\begin{description}
\item[Neighborhood] \marginnote{Neighborhood}
Given a radius $\varepsilon$, the neighborhood of a point $\vec{x}$ are the points within an $\varepsilon$-sphere centered on $\vec{x}$.
\item[Core point] \marginnote{Core point}
Given a minimum number of neighbors $m$,
a point $\vec{x}$ is a core point if it has at least $m$ neighbors.
\item[Border point] \marginnote{Border point}
A point $\vec{x}$ is a border point if it is not a core point.
\item[Directly density reachable] \marginnote{Directly density reachable}
A point $\vec{p}$ is directly density reachable from $\vec{q}$ iff:
\begin{itemize}
\item $\vec{q}$ is a core point.
\item $\vec{q}$ is a neighbor of $\vec{p}$.
\end{itemize}
\item[Density reachable] \marginnote{Density reachable}
A point $\vec{p}$ is density reachable from $\vec{q}$ iff:
\begin{itemize}
\item $\vec{q}$ is a core point.
\item There exists a sequence of points $\vec{s}_1, \dots, \vec{s}_z$ such that:
\begin{itemize}
\item $\vec{s}_1$ is directly density reachable from $\vec{q}$.
\item $\vec{s}_{i+1}$ is directly density reachable from $\vec{s}_i$.
\item $\vec{p}$ is directly density reachable from $\vec{s}_z$.
\end{itemize}
\end{itemize}
\item[Density connected] \marginnote{Density connected}
A point $\vec{p}$ is density connected to $\vec{q}$ iff there exists a point $\vec{s}$
such that both $\vec{p}$ and $\vec{q}$ are density reachable from $\vec{s}$.
\item[Algorithm] \marginnote{DBSCAN}
Determine clusters as maximal sets of density connected points.
Border points not density connected to any core point are labeled as noise.
In other words, what happens is the following:
\begin{itemize}
\item Neighboring core points are part of the same cluster.
\item Border points are part of the cluster of their nearest core point neighbor.
\item Border points without a core point neighbor are noise.
\end{itemize}
\item[Properties] \phantom{}
\begin{description}
\item[Robustness]
Able to find clusters of any shape and detect noise.
\item[Hyperparameters]
			Sensitive to the choice of the radius $\varepsilon$ and of the minimum number of neighbors $m$.
\begin{description}
\item[K-distance method] \phantom{}
\begin{enumerate}
						\item Determine for each point its $k$-distance, i.e. the distance to its $k$-th nearest neighbor.
\item Sort the points by decreasing $k$-distance and plot them.
\item Use as possible $\varepsilon$ the values around the area where the slope decreases (similarly to the elbow method).
\end{enumerate}
\end{description}
\item[Complexity]
Complexity of $O(N^2)$, reduced to $O(N \log N)$ if using spatial indexing.
\end{description}
\end{description}
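A sketch of DBSCAN and of the k-distance heuristic (assuming \texttt{scikit-learn}; the dataset and the chosen $\varepsilon$ are illustrative):
\begin{verbatim}
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

X = np.vstack([np.random.randn(100, 2),
               np.random.randn(100, 2) + 6,
               np.random.uniform(-10, 16, size=(20, 2))])  # two blobs plus noise

# k-distance method: sort the distance to the k-th nearest neighbor to pick eps.
k = 4
knn_dist, _ = NearestNeighbors(n_neighbors=k + 1).fit(X).kneighbors(X)
k_dist = np.sort(knn_dist[:, -1])[::-1]   # plot this curve and look for the elbow

labels = DBSCAN(eps=1.0, min_samples=k).fit_predict(X)   # label -1 marks noise
print(np.unique(labels, return_counts=True))
\end{verbatim}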
\subsection{DENCLUE}
\begin{description}
\item[Kernel density estimation] \marginnote{Kernel density estimation}
Statistical method to estimate the distribution of a dataset through a function.
\begin{description}
\item[Kernel function] \marginnote{Kernel function}
Symmetric and monotonically decreasing function to describe the influence of a data point on its neighbors.
A typical kernel function is the Gaussian.
\item[Overall density function]
The overall density of the dataset is obtained as the sum of the kernel function evaluated at each data point.
\begin{figure}[H]
\centering
\includegraphics[width=0.35\textwidth]{img/kernel_density_estimation.png}
\caption{Example of density function from a set of points (top right) using a Gaussian kernel}
\label{img:denclue}
\end{figure}
\end{description}
\item[Algorithm] \marginnote{DENCLUE}
Given a threshold $\xi$, DENCLUE works as follows:
\begin{enumerate}
\item Derive a density function of the dataset.
			\item Identify the local maxima of the density function and consider them as density attractors.
			\item Associate each data point with the density attractor reached by moving in the direction of maximum increase (hill climbing).
			\item Points associated with the same density attractor are part of the same cluster.
			\item Remove clusters whose density attractor has a density lower than $\xi$.
\item Merge clusters connected through a path of points whose density is greater or equal to $\xi$
(e.g. in \Cref{img:denclue} the center area will result in many small clusters that can be merged with an appropriate $\xi$).
\end{enumerate}
\item[Properties] \phantom{}
\begin{description}
\item[Robustness]
Able to recognize clusters of different shapes and handle noise.
\item[High dimension weakness]
			Does not perform well on high-dimensional data or on data whose clusters have widely different densities.
\item[Complexity]
Computational complexity of $O(N^2)$.
\end{description}
\end{description}
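The kernel density estimation step can be illustrated with \texttt{scipy} (only the density part is shown, not the full DENCLUE hill-climbing procedure; the data is synthetic):
\begin{verbatim}
import numpy as np
from scipy.stats import gaussian_kde

# 1-D data drawn from two bumps; the KDE sums a Gaussian kernel at each point.
data = np.concatenate([np.random.randn(200), np.random.randn(200) + 5])
kde = gaussian_kde(data)

grid = np.linspace(-4, 9, 200)
density = kde(grid)                  # overall density function
attractor = grid[density.argmax()]   # highest local maximum (a density attractor)
print(attractor)
\end{verbatim}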
\section{Model-based clustering}
Assuming that the attributes are independent random variables,
model-based clustering finds a set of distributions (one per cluster) that describe the data.
\subsection*{Gaussian mixture (expectation maximization)}
\begin{description}
\item[Algorithm] \phantom{} \marginnote{Gaussian mixture}
\begin{enumerate}
\item Select an initial set of parameters for the distributions.
		\item Expectation step: for each data point, compute the probability that it belongs to each distribution.
		\item Maximization step: adjust the parameters to maximize the likelihood (i.e. move each Gaussian towards the center of its cluster).
		\item Repeat from step 2 until convergence.
\end{enumerate}
\end{description}
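A minimal sketch with \texttt{scikit-learn}, whose \texttt{GaussianMixture} estimator runs this expectation-maximization procedure internally (synthetic data):
\begin{verbatim}
import numpy as np
from sklearn.mixture import GaussianMixture

X = np.vstack([np.random.randn(200, 2),
               np.random.randn(200, 2) @ np.diag([2.0, 0.5]) + [6, 3]])

gm = GaussianMixture(n_components=2, random_state=0).fit(X)  # EM under the hood
resp = gm.predict_proba(X)    # expectation step: soft cluster memberships
labels = gm.predict(X)        # hard assignment to the most likely Gaussian
print(gm.means_)
print(resp[:3])
\end{verbatim}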

View File

@ -0,0 +1,57 @@
\chapter{CRISP-DM}
\begin{description}
\item[\Acl{crisp}] \marginnote{\acs{crisp}}
Standardized process for data mining.
\begin{figure}[ht]
\centering
\includegraphics[width=0.45\textwidth]{img/crisp.png}
\caption{\ac{crisp} workflow}
\end{figure}
\end{description}
\section{Business understanding}
\begin{itemize}
\item Determine the objective and the success criteria.
\marginnote{Business understanding}
\item Feasibility study.
\item Produce a plan.
\end{itemize}
\section{Data understanding}
\begin{itemize}
\item Determine the available (raw) data.
\marginnote{Data understanding}
\item Determine the cost of the data.
\item Collect, describe, explore and verify data.
\end{itemize}
\section{Data preparation}
\begin{itemize}
\item Data cleaning.
\marginnote{Data preparation}
\item Data transformations.
\end{itemize}
\section{Modeling}
\begin{itemize}
\item Select modeling technique.
\marginnote{Modeling}
\item Build/train the model.
\end{itemize}
\section{Evaluation}
\begin{itemize}
\item Evaluate results.
\marginnote{Evaluation}
\item Review process.
\end{itemize}
\section{Deployment}
\begin{itemize}
\item Plan deployment.
\marginnote{Deployment}
\item Plan monitoring and maintenance.
\item Final report and review.
\end{itemize}

View File

@ -0,0 +1,206 @@
\chapter{Data lake}
\begin{description}
\item[Dark data] \marginnote{Dark data}
Acquired and stored data that are never used for decision-making processes.
\item[Data lake] \marginnote{Data lake}
Repository to store raw (unstructured) data.
It has the following features:
\begin{itemize}
\item Does not enforce a schema on write.
\item Allows flexible access and applies schemas on read.
\item Single source of truth.
\item Low cost and scalable.
\end{itemize}
\item[Storage]
Stored data can be classified as:
\begin{descriptionlist}
\item[Hot] \marginnote{Hot storage}
A low volume of highly requested data that requires low latency.
More expensive HW/SW.
\item[Cold] \marginnote{Cold storage}
A large amount of data that does not have latency requirements.
Less expensive.
\end{descriptionlist}
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/_storage.pdf}
\caption{Data storage technologies}
\end{figure}
\end{description}
\section{Traditional vs insight-driven data systems}
\begin{tabular}{c | p{0.4\textwidth} | p{0.4\textwidth}}
& \textbf{\makecell[c]{Traditional (data warehouse)}} & \textbf{\makecell[c]{Insight-driven (data lake)}} \\
\hline
\textbf{Sources} & Structured data & Structured, semi-structured and unstructured data \\
\hline
\textbf{Storage} & Limited ingestion and storage capability & Virtually unlimited ingestion and storage capability \\
\hline
\textbf{Schema} & Schema designed upfront & Schema not fixed \\
\hline
\textbf{Transformations} & \ac{etl} upfront & Transformations on query \\
\hline
\textbf{Analytics} & SQL, \ac{bi} tools, full-text search & Traditional methods, self-service \ac{bi}, big data, machine learning, \dots \\
\hline
	\textbf{Price} & High storage cost & Low storage cost \\
	\hline
	\textbf{Performance} & Fast queries & Scalability/speed/cost tradeoffs \\
\hline
\textbf{Quality} & High data quality & Depends on the use case \\
\end{tabular}
\section{Data architecture evolution}
\begin{description}
\item[Traditional data warehouse] \marginnote{Traditional data warehouse}
(i.e. in-house data warehouse)
\begin{itemize}
\item Structured data with predefined schemas.
\item High setup and maintenance cost. Not scalable.
\item Relational high-quality data.
\item Slow data ingestion.
\end{itemize}
\item[Modern cloud data warehouse] \marginnote{Modern cloud data warehouse}
\phantom{}
\begin{itemize}
\item Structured and semi-structured data.
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
\item Relational high-quality data and mixed data.
\item Fast data ingestion if supported.
\end{itemize}
\item[On-premise big data] \marginnote{On-premise big data}
(i.e. in-house data lake)
\begin{itemize}
\item Any type of data with schemas on read.
\item High setup and maintenance cost.
\item Fast data ingestion.
\end{itemize}
\item[Cloud data lake] \marginnote{Cloud data lake}
\phantom{}
\begin{itemize}
\item Any type of data with schemas on read.
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
\item Fast data ingestion.
\end{itemize}
\end{description}
\section{Components}
\subsection{Data ingestion}
\begin{descriptionlist}
\item[Workload migration] \marginnote{Data ingestion}
Inserting all the data from an existing source.
\item[Incremental ingestion]
Inserting changes since the last ingestion.
\item[Streaming ingestion]
Continuously inserting data.
\end{descriptionlist}
\begin{description}
\item[\Acl{cdc} (\Acs{cdc})] \marginnote{\Acl{cdc} (\Acs{cdc})}
Mechanism to detect changes and insert the new data into the data lake (possibly in real-time).
\end{description}
\subsection{Storage}
\begin{descriptionlist}
\item[Raw] \marginnote{Raw storage}
Immutable data useful for disaster recovery.
\item[Optimized] \marginnote{Optimized storage}
Optimized raw data for faster query.
\item[Analytics] \marginnote{Analytics storage}
Ready to use data.
\end{descriptionlist}
\begin{description}
\item[Columnar storage] \phantom{}
\begin{itemize}
			\item Homogeneous data are stored contiguously.
\item Speeds up methods that process entire columns (i.e. all the values of a feature).
\item Insertion becomes slower.
\end{itemize}
\item[Data catalog]
Methods to add descriptive metadata to a data lake.
This is useful to prevent an unorganized data lake (data swamp).
\end{description}
\subsection{Processing and analytics}
\begin{descriptionlist}
\item[Interactive analytics] \marginnote{Processing and analytics}
Interactive queries to large volumes of data.
The results are stored back in the data lake.
\item[Big data analytics]
Data aggregations and transformations.
\item[Real-time analytics]
Streaming analysis.
\end{descriptionlist}
\section{Architectures}
\subsection{Lambda lake}
\begin{description}
\item[Batch layer] \marginnote{Lambda lake}
Receives and stores the data. Prepares the batch views for the serving layer.
\item[Serving layer]
Indexes batch views for faster queries.
\item[Speed layer]
Receives the data and prepares real-time views. The views are also stored in the serving layer.
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/lambda_lake.png}
\caption{Lambda lake architecture}
\end{figure}
\subsection{Kappa lake}
\marginnote{Kappa lake}
The data are stored in a long-term store.
Computations only happen in the speed layer (avoids lambda lake redundancy between batch layer and speed layer).
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/kappa_lake.png}
\caption{Kappa lake architecture}
\end{figure}
\subsection{Delta lake}
\marginnote{Delta lake}
Framework that adds features on top of an existing data lake.
\begin{itemize}
\item ACID transactions
\item Scalable metadata handling
\item Data versioning
\item Unified batch and streaming
\item Schema enforcement
\end{itemize}
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/delta_lake.png}
\caption{Delta lake architecture}
\end{figure}
\section{Metadata}
\marginnote{Metadata}
Metadata is used to organize a data lake.
Useful metadata are:
\begin{descriptionlist}
\item[Source] Origin of the data.
\item[Schema] Structure of the data.
\item[Format] File format or encoding.
\item[Quality metrics] (e.g. percentage of missing values).
\item[Lifecycle] Retention policies and archiving rules.
\item[Ownership]
\item[Lineage] History of applied transformations or dependencies.
\item[Access control]
\item[Classification] Sensitivity level of the data.
\item[Usage information] Record of who accessed the data and how it is used.
\end{descriptionlist}

View File

@ -0,0 +1,163 @@
\chapter{Data preprocessing}
\section{Aggregation}
\marginnote{Aggregation}
Combining multiple attributes into a single one.
Useful for:
\begin{descriptionlist}
\item[Data reduction]
Reduce the number of attributes.
\item[Change of scale]
View the data in a more general level of detail (e.g. from cities and regions to countries).
\item[Data stability]
Aggregated data tend to have less variability.
\end{descriptionlist}
\section{Sampling}
\marginnote{Sampling}
Sampling can be used when the full dataset is too expensive to obtain or too expensive to process.
Obviously, a sample has to be representative.
The types of sampling techniques are:
\begin{descriptionlist}
\item[Simple random] \marginnote{Simple random}
Extraction of a single element following a given probability distribution.
\item[With replacement] \marginnote{With replacement}
Multiple extractions with repetitions following a given probability distribution
(i.e. multiple simple random extractions).
If the population is small, the sample may underestimate the actual population.
\item[Without replacement] \marginnote{Without replacement}
Multiple extractions without repetitions following a given probability distribution.
\item[Stratified] \marginnote{Stratified}
Split the data and sample from each partition.
		Useful when the partitions are internally homogeneous.
\end{descriptionlist}
\begin{description}
\item[Sample size]
The sampling size represents a tradeoff between data reduction and precision.
In a labeled dataset, it is important to consider the probability of sampling data from all the possible classes.
\end{description}
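A small \texttt{numpy} sketch of these sampling schemes (the data and class labels are synthetic):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
data = np.arange(1000)
labels = rng.integers(0, 3, size=1000)      # three classes for stratification

simple = rng.choice(data, size=100, replace=False)   # without replacement
boot = rng.choice(data, size=100, replace=True)      # with replacement

# Stratified: sample the same fraction from each class separately.
frac = 0.1
stratified = np.concatenate([
    rng.choice(data[labels == c],
               size=int(frac * np.sum(labels == c)), replace=False)
    for c in np.unique(labels)
])
print(len(simple), len(boot), len(stratified))
\end{verbatim}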
\section{Dimensionality reduction}
\begin{description}
\item[Curse of dimensionality] \marginnote{Curse of dimensionality}
Data with a high number of dimensions result in a sparse feature space
where distance metrics are ineffective.
\item[Dimensionality reduction] \marginnote{Dimensionality reduction}
Useful to:
\begin{itemize}
\item Avoid the curse of dimensionality.
\item Reduce noise.
\item Reduce the time and space complexity of mining and learning algorithms.
\item Visualize multi-dimensional data.
\end{itemize}
\end{description}
\subsection{Principal component analysis} \marginnote{PCA}
Projection of the data into a lower-dimensional space that maximizes the variance of the data.
It can be proven that this problem can be solved by finding the eigenvectors of the covariance matrix of the data.
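For instance, with \texttt{scikit-learn} (random correlated data used as a stand-in for a real dataset):
\begin{verbatim}
import numpy as np
from sklearn.decomposition import PCA

X = np.random.randn(500, 10) @ np.random.randn(10, 10)   # correlated features

pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)            # projection on the 2 principal components
print(pca.explained_variance_ratio_)   # fraction of variance each component keeps
print(pca.components_.shape)           # eigenvectors of the covariance matrix
\end{verbatim}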
\subsection{Feature subset selection} \marginnote{Feature subset selection}
Local technique to reduce dimensionality by:
\begin{itemize}
\item Removing redundant attributes.
\item Removing irrelevant attributes.
\end{itemize}
This can be achieved by:
\begin{descriptionlist}
\item[Brute force]
Try all the possible subsets of the dataset.
\item[Embedded approach]
Feature selection is naturally done by the learning algorithm (e.g. decision trees).
\item[Filter approach]
Features are filtered using domain-specific knowledge.
\item[Wrapper approaches]
A mining algorithm is used to select the best features.
\end{descriptionlist}
\section{Feature creation}
\marginnote{Feature creation}
Useful to help a learning algorithm capture data characteristics.
Possible approaches are:
\begin{descriptionlist}
\item[Feature extraction]
Features extracted from the existing ones (e.g. from a picture of a face, the eye distance can be a new feature).
\item[Mapping]
Projecting the data into a new feature space.
\item[New features]
Add new, possibly redundant, features.
\end{descriptionlist}
\section{Data type conversion}
\subsection{One-hot encoding} \marginnote{One-hot encoding}
A discrete feature $E \in \{ e_1, \dots, e_n \}$ with $n$ unique values is replaced with
$n$ new binary features $H_{e_1}, \dots, H_{e_n}$ each corresponding to a value of $E$.
For each entry, if its feature $E$ has value $e_i$, then $H_{e_i} = \texttt{true}$ and the rest are \texttt{false}.
\subsection{Ordinal encoding} \marginnote{Ordinal encoding}
A feature whose values have an ordering can be converted into a consecutive sequence of integers
(e.g. ["good", "neutral", "bad"] $\mapsto$ [1, 0, -1]).
\subsection{Discretization} \marginnote{Discretization}
Convert a continuous feature to a discrete one.
\begin{description}
\item[Binarization] \marginnote{Binarization}
Given a continuous feature and a threshold,
it can be replaced with a new binary feature that is \texttt{true} if the value is above the threshold and \texttt{false} otherwise.
\item[Thresholding] \marginnote{Thresholding}
Same as binarization but using multiple thresholds.
\item[K-bins] \marginnote{K-bins}
A continuous feature is discretized using $k$ bins each representing an integer from $0$ to $k-1$.
\end{description}
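These conversions are available, for example, in \texttt{scikit-learn} (toy features; the category order for the ordinal encoder is illustrative):
\begin{verbatim}
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, KBinsDiscretizer

colors = np.array([["red"], ["green"], ["blue"], ["green"]])
onehot = OneHotEncoder().fit_transform(colors).toarray()

quality = np.array([["bad"], ["good"], ["neutral"]])
ordinal = OrdinalEncoder(
    categories=[["bad", "neutral", "good"]]).fit_transform(quality)

values = np.array([[0.1], [0.4], [2.5], [7.0], [9.9]])
bins = KBinsDiscretizer(n_bins=3, encode="ordinal",
                        strategy="uniform").fit_transform(values)
print(onehot)
print(ordinal.ravel(), bins.ravel())
\end{verbatim}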
\section{Attribute transformation}
Useful for normalizing features with different scales and outliers.
\begin{description}
\item[Mapping] \marginnote{Mapping}
Map the domain of a feature into a new set of values (i.e. apply a function).
\item[Standardization] \marginnote{Standardization}
		Transform a feature with a Gaussian distribution into a standard normal distribution (zero mean, unit variance):
\[ x = \frac{x - \mu}{\sigma} \]
\item[Rescaling] \marginnote{Rescaling}
Map a feature into a fixed range (e.g. scale to $[0, 1]$ or $[-1, 1]$).
\item[Affine transformation] \marginnote{Affine transformation}
Apply a linear transformation on a feature before rescaling it.
This method is more robust to outliers.
\item[Normalization] \marginnote{Normalization}
Normalize each data row to unit norm.
\end{description}
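A sketch with \texttt{scikit-learn} (\texttt{RobustScaler} is used here as one example of an outlier-robust affine transformation; the data is a toy matrix):
\begin{verbatim}
import numpy as np
from sklearn.preprocessing import (StandardScaler, MinMaxScaler,
                                   RobustScaler, Normalizer)

X = np.array([[1.0, 200.0], [2.0, 300.0], [3.0, 10000.0]])

standardized = StandardScaler().fit_transform(X)        # zero mean, unit variance
rescaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
robust = RobustScaler().fit_transform(X)                # robust to outliers
normalized = Normalizer().fit_transform(X)              # each row to unit norm
print(standardized, rescaled, robust, normalized, sep="\n")
\end{verbatim}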

View File

@ -0,0 +1,356 @@
\chapter{Data warehouse}
\begin{description}
\item[\Acl{bi}] \marginnote{\Acl{bi}}
Transform raw data into information.
Deliver the right information to the right people at the right time through the right channel.
\item[\Ac{dwh}] \marginnote{\Acl{dwh}}
Optimized repository that stores information for decision-making processes.
\Acp{dwh} are a specific type of \ac{dss}.
Features:
\begin{itemize}
\item Subject-oriented: focused on enterprise-specific concepts.
\item Integrates data from different sources and provides a unified view.
\item Non-volatile storage with change tracking.
\end{itemize}
\item[\Ac{dm}] \marginnote{\Acl{dm}}
Subset of the primary \ac{dwh} with information relevant to a specific business area.
\end{description}
\section{\Acl{olap} (\Acs{olap})}
\begin{description}
\item[\ac{olap} analyses] \marginnote{\Acl{olap} (\Acs{olap})}
Able to interactively navigate the information in a data warehouse.
Allows to visualize different levels of aggregation.
\item[\ac{olap} session]
Navigation path created by the operations that a user applied.
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.35\textwidth]{img/_olap_cube.pdf}
\caption{\ac{olap} data cube}
\end{figure}
\subsection{Operators}
\begin{description}
\item[Roll-up] \marginnote{Roll-up}
\begin{minipage}{0.7\textwidth}
Increases the level of aggregation (i.e. \texttt{GROUP BY} in SQL).
Some details are collapsed together.
\end{minipage}
\hfill
\begin{minipage}{0.15\textwidth}
\centering
\includegraphics[width=\linewidth]{img/olap_rollup.png}
\end{minipage}
\item[Drill-down] \marginnote{Drill-down}
\begin{minipage}{0.7\textwidth}
Reduces the level of aggregation.
Some details are reintroduced.
\end{minipage}
\hfill
\begin{minipage}{0.15\textwidth}
\centering
\includegraphics[width=\linewidth]{img/olap_drilldown.png}
\end{minipage}
	\item[Slice-and-dice] \marginnote{Slice-and-dice}
\begin{minipage}{0.65\textwidth}
The slice operator reduces the number of dimensions (i.e. drops columns).
			The dice operator reduces the amount of data being analyzed by selecting a sub-cube (similar to a \texttt{WHERE} selection in SQL).
\end{minipage}
\hfill
\begin{minipage}{0.15\textwidth}
\centering
\includegraphics[width=\linewidth]{img/olap_slicedice.png}
\end{minipage}
\item[Pivot] \marginnote{Pivot}
\begin{minipage}{0.7\textwidth}
Changes the layout of the data, to analyze it from a different viewpoint.
\end{minipage}
\hfill
\begin{minipage}{0.15\textwidth}
\centering
\includegraphics[width=\linewidth]{img/olap_pivot.png}
\end{minipage}
\item[Drill-across] \marginnote{Drill-across}
\begin{minipage}{0.7\textwidth}
Links concepts from different data sources (i.e. \texttt{JOIN} in SQL).
\end{minipage}
\hfill
\begin{minipage}{0.15\textwidth}
\centering
\includegraphics[width=\linewidth]{img/olap_drillacross.png}
\end{minipage}
\item[Drill-through] \marginnote{Drill-through}
Switches from multidimensional aggregated data to operational data (e.g. a spreadsheet).
\begin{center}
\includegraphics[width=0.5\textwidth]{img/olap_drillthrough.png}
\end{center}
\end{description}
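As a rough analogy only (this is not an OLAP engine), roll-up, slice-and-dice and pivot can be mimicked on a flat table with \texttt{pandas}; the sales table below is made up:
\begin{verbatim}
import pandas as pd

sales = pd.DataFrame({
    "country": ["IT", "IT", "FR", "FR"],
    "city":    ["Rome", "Milan", "Paris", "Lyon"],
    "year":    [2023, 2023, 2023, 2024],
    "amount":  [10, 20, 15, 5],
})

rollup = sales.groupby(["country", "year"])["amount"].sum()  # roll-up over cities
slice_ = sales[sales["year"] == 2023]                        # slice/dice: sub-cube
pivot = sales.pivot_table(values="amount", index="country",
                          columns="year", aggfunc="sum")     # pivot: new layout
print(rollup, slice_, pivot, sep="\n\n")
\end{verbatim}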
\section{\Acl{etl} (\Acs{etl})}
\marginnote{\Acl{etl} (\Acs{etl})}
The \Ac{etl} process extracts, integrates and cleans operational data that will be loaded into a data warehouse.
\subsection{Extraction}
Extracted operational data can be:
\begin{descriptionlist}
	\item[Structured] \marginnote{Structured data}
with a predefined data model (e.g. relational DB, CSV)
	\item[Unstructured] \marginnote{Unstructured data}
without a predefined data model (e.g. social media content)
\end{descriptionlist}
Extraction can be of two types:
\begin{descriptionlist}
\item[Static] \marginnote{Static extraction}
		The entirety of the operational data is extracted to populate the
data warehouse for the first time.
\item[Incremental] \marginnote{Incremental extraction}
Only changes applied since the last extraction are considered.
Can be based on a timestamp or a trigger.
\end{descriptionlist}
\subsection{Cleaning}
Operational data may contain:
\begin{descriptionlist}
\item[Duplicate data]
\item[Missing data]
\item[Improper use of fields] (e.g. saving the phone number in the \texttt{notes} field)
\item[Wrong values] (e.g. 30th of February)
\item[Inconsistencies] (e.g. use of different abbreviations)
\item[Typos]
\end{descriptionlist}
Methods to clean and increase the quality of the data are:
\begin{descriptionlist}
\item[Dictionary-based techniques] \marginnote{Dictionary-based cleaning}
Lookup tables to substitute abbreviations, synonyms or typos.
Applicable if the domain is known and limited.
\item[Approximate merging] \marginnote{Approximate merging}
Methods to merge data that do not have a common key.
\begin{description}
\item[Approximate join]
Use non-key attributes to join two tables (e.g. using the name and surname instead of a unique identifier).
\item[Similarity approach]
Use similarity functions (e.g. edit distance) to merge multiple instances of the same information
(e.g. typo in customer surname).
\end{description}
\item[Ad-hoc algorithms] \marginnote{Ad-hoc algorithms}
\end{descriptionlist}
\subsection{Transformation}
Data are transformed to respect the format of the data warehouse:
\begin{descriptionlist}
\item[Conversion] \marginnote{Conversion}
Modifications of types and formats (e.g. date format)
\item[Enrichment] \marginnote{Enrichment}
Creating new information by using existing attributes (e.g. compute profit from receipts and expenses)
\item[Separation and concatenation] \marginnote{Separation and concatenation}
Denormalization of the data: introduces redundancies (i.e. breaks normal form\footnote{\url{https://en.wikipedia.org/wiki/Database_normalization}})
to speed up operations.
\end{descriptionlist}
\subsection{Loading}
Adding data into a data warehouse:
\begin{descriptionlist}
\item[Refresh] \marginnote{Refresh loading}
The entire \ac{dwh} is rewritten.
\item[Update] \marginnote{Update loading}
Only the changes are added to the \ac{dwh}. Old data are not modified.
\end{descriptionlist}
\section{Data warehouse architectures}
The architecture of a data warehouse should meet the following requirements:
\begin{descriptionlist}
\item[Separation] Separate the analytical and transactional workflows.
\item[Scalability] Hardware and software should be easily upgradable.
\item[Extensibility] Capability to host new applications and technologies without the need to redesign the system.
\item[Security] Access control.
\item[Administrability] Easily manageable.
\end{descriptionlist}
\subsection{Single-layer architecture}
\marginnote{Single-layer architecture}
\begin{minipage}{0.55\textwidth}
\begin{itemize}
	\item Minimizes the amount of data stored (i.e. no redundancies).
\item The source layer is the only physical layer (i.e. no separation).
\item A middleware provides the \ac{dwh} features.
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}{0.4\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_1layer_dwh.pdf}
\end{minipage}
\subsection{Two-layer architecture}
\marginnote{Two-layer architecture}
\begin{minipage}{0.55\textwidth}
\begin{itemize}
\item Source data (source layer) are physically separated from the \ac{dwh} (data warehouse layer).
\item A staging layer applies \ac{etl} procedures before populating the \ac{dwh}.
\item The \ac{dwh} is a centralized repository from which data marts can be created.
			Metadata repositories store information on the sources, the staging area and the data mart schemas.
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}{0.4\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_2layer_dwh.pdf}
\end{minipage}
\subsection{Three-layer architecture}
\marginnote{Three-layer architecture}
\begin{minipage}{0.45\textwidth}
\begin{itemize}
\item A reconciled layer enhances the cleaned data coming from the staging step by
adding enterprise-level details (i.e. adds more redundancy before populating the \ac{dwh}).
\end{itemize}
\end{minipage}
\hfill
\begin{minipage}{0.5\textwidth}
\centering
\includegraphics[width=\linewidth]{img/_3layer_dwh.pdf}
\end{minipage}
\section{Conceptual modeling}
\begin{description}
\item[\Acl{dfm} (\acs{dfm})] \marginnote{\Acl{dfm} (\acs{dfm})}
Conceptual model to support the design of data marts.
The main concepts are:
\begin{descriptionlist}
\item[Fact]
Concept relevant to decision-making processes (e.g. sales).
\item[Measure]
Numerical property to describe a fact (e.g. profit).
\item[Dimension]
Property of a fact with a finite domain (e.g. date).
\item[Dimensional attribute]
Property of a dimension (e.g. month).
\item[Hierarchy]
A tree where the root is a dimension and nodes are dimensional attributes (e.g. date $\rightarrow$ month).
\item[Primary event]
Occurrence of a fact. It is described by a tuple with a value for each dimension and each measure.
\item[Secondary event]
Aggregation of primary events.
Measures of primary events are aggregated if they have the same (preselected) dimensional attributes.
\end{descriptionlist}
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.8\textwidth]{img/dfm.png}
\caption{Example of \ac{dfm}}
\end{figure}
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/dfm_events.png}
\caption{Example of primary and secondary events}
\end{figure}
\subsection{Aggregation operators}
Measures can be classified as:
\begin{descriptionlist}
\item[Flow measures] \marginnote{Flow measures}
Evaluated cumulatively with respect to a time interval (e.g. quantity sold).
\item[Level measures] \marginnote{Level measures}
Evaluated at a particular time (e.g. number of products in inventory).
\item[Unit measures] \marginnote{Unit measures}
Evaluated at a particular time but expressed in relative terms (e.g. unit price).
\end{descriptionlist}
Aggregation operators can be classified as:
\begin{descriptionlist}
\item[Distributive] \marginnote{Distributive operators}
Able to calculate aggregates from partial aggregates (e.g. \texttt{SUM}, \texttt{MIN}, \texttt{MAX}).
\item[Algebraic] \marginnote{Algebraic operators}
Requires a finite number of support measures to compute the result (e.g. \texttt{AVG}).
\item[Holistic] \marginnote{Holistic operators}
Requires an infinite number of support measures to compute the result (e.g. \texttt{RANK}).
\end{descriptionlist}
\begin{description}
\item[Additivity] \marginnote{Additive measure}
A measure is additive along a dimension if an aggregation operator can be applied.
\begin{table}[ht]
\centering
\begin{tabular}{l | c | c}
& \textbf{Temporal hierarchies} & \textbf{Non-temporal hierarchies} \\
\hline
\textbf{Flow measures} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\textbf{Level measures} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{SUM}, \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\textbf{Unit measures} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} & \texttt{AVG}, \texttt{MIN}, \texttt{MAX} \\
\end{tabular}
\caption{Allowed operators for each measure type}
\end{table}
\end{description}
\section{Logical design}
\marginnote{Logical design}
Defining the data structures (e.g. tables and relationships) according to a conceptual model.
There are two main strategies:
\begin{descriptionlist}
\item[Star schema] \marginnote{Star schema}
A fact table that contains all the measures is linked to dimensional tables.
\begin{figure}[ht]
\centering
\includegraphics[width=\textwidth]{img/logical_star_schema.png}
\caption{Example of star schema}
\end{figure}
\item[Snowflake schema] \marginnote{Snowflake schema}
A star schema variant with partially normalized dimensional tables.
\begin{figure}[H]
\centering
\includegraphics[width=\textwidth]{img/logical_snowflake_schema.png}
\caption{Example of snowflake schema}
\end{figure}
\end{descriptionlist}

View File

@ -0,0 +1,98 @@
\chapter{Introduction}
\section{Data}
\begin{description}
\item[Data] \marginnote{Data}
Collection of raw values.
\item[Information] \marginnote{Information}
Organized data (e.g. relationships, context, \dots).
\item[Knowledge] \marginnote{Knowledge}
Understanding information.
\end{description}
\subsection{Data sources}
\begin{description}
\item[Transaction] \marginnote{Transaction}
Business event that generates or modifies data in an information system (e.g. database).
\item[Signal] \marginnote{Signal}
Measure produced by a sensor.
\item[External subjects]
\end{description}
\subsection{Software}
\begin{description}
\item[\Ac{oltp}] \marginnote{\Acl{oltp}}
Class of programs to support transaction-oriented applications and data storage.
Suitable for real-time applications.
\item[\Ac{erp}] \marginnote{\Acl{erp}}
Integrated system to manage all the processes of a business.
Uses a shared database for all applications.
Suitable for real-time applications.
\end{description}
\subsection{Insight}
Decisions can be classified as:
\begin{descriptionlist}
\item[Structured] \marginnote{Structured decision}
Established and well-understood situations.
What is needed is known.
\item[Unstructured] \marginnote{Unstructured decision}
Unplanned and unclear situations.
What is needed for the decision is unknown.
\end{descriptionlist}
Different levels of insight can be extracted by:
\begin{descriptionlist}
\item[\Ac{mis}] \marginnote{\Acl{mis}}
Standardized reporting system built on an existing \ac{oltp}.
Used for structured decisions.
\item[\Ac{dss}] \marginnote{\Acl{dss}}
Analytical system to provide support for unstructured decisions.
\item[\Ac{eis}] \marginnote{\Acl{eis}}
Formulate high-level decisions that impact the organization.
\item[\Ac{olap}] \marginnote{\Acl{olap}}
Grouped analysis of multidimensional data.
Involves a large amount of data.
\item[\Ac{bi}] \marginnote{\Acl{bi}}
Applications, infrastructure, tools and best practices to analyze information.
\end{descriptionlist}
\begin{description}
\item[Big data] \marginnote{Big data}
Large and/or complex and/or fast-changing collection of data that traditional DBMSs are unable to process.
\begin{description}
\item[Structured] e.g. relational tables.
\item[Unstructured] e.g. videos.
\item[Semi-structured] e.g. JSON.
\end{description}
	\item[Analytics] \marginnote{Analytics}
Structured decision driven by data.
\item[Data mining] \marginnote{Data mining}
Discovery process for unstructured decisions.
\begin{figure}[ht]
\centering
\includegraphics[width=0.8\textwidth]{img/data_mining_process.png}
\caption{Data mining process}
\end{figure}
\item[Machine learning] \marginnote{Machine learning}
		Learning models and algorithms that make it possible to extract patterns from data.
\end{description}

View File

@ -0,0 +1,172 @@
\chapter{Machine learning}
\begin{description}
\item[Machine learning] \marginnote{Machine learning}
Application of methods and algorithms to extract patterns from data.
\end{description}
\section{Tasks}
\begin{description}
\item[Classification] Estimation of a finite number of classes.
\item[Regression] Estimation of a numeric value.
\item[Similarity matching] Identify similar individuals.
\item[Clustering] Grouping individuals based on their similarities.
\item[Co-occurrence grouping] Identify associations between entities based on the transactions in which they appear together.
\item[Profiling] Behavior description.
\item[Link analysis] Analysis of connections (e.g. in a graph).
\item[Data reduction] Reduce the dimensionality of data with minimal information loss.
	\item[Causal modeling] Understand the connections between events and actions.
\end{description}
\section{Categories}
\begin{description}
\item[Supervised learning] \marginnote{Supervised learning}
		Problem where the target(s) to predict are defined.
\item[Unsupervised learning] \marginnote{Unsupervised learning}
Problem where no specific target is known.
\item[Reinforcement learning] \marginnote{Reinforcement learning}
Learn a policy to generate a sequence of actions.
\end{description}
\section{Data}
\begin{description}
\item[Dataset] \marginnote{Dataset}
Set of $N$ individuals, each described by $D$ features.
\end{description}
\subsection{Data types}
\begin{description}
\item[Categorical] Values with a discrete domain.
\begin{description}
\item[Nominal] \marginnote{Categorical nominal data}
The values are a set of non-ordered labels.
\textbf{Operators.} $=$, $\neq$
\begin{example}
Name, surname, zip code.
\end{example}
\item[Ordinal] \marginnote{Categorical ordinal data}
The values are a set of totally ordered labels.
\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$
\begin{example}
Non-numerical quality evaluations (excellent, good, fair, poor, bad).
\end{example}
\end{description}
\item[Numerical] Values with a continuous domain.
\begin{description}
\item[Interval] \marginnote{Numerical interval data}
			Numerical values without an absolute zero point (i.e. 0 is not a meaningful reference).
			Ratios between values of this type are not meaningful.
\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$
\begin{example}
Celsius and Fahrenheit temperature scales, CGPA, time, \dots.
For instance, there is a $6.25\%$ increase from $16\text{°C}$ to $17\text{°C}$, but
converted in Fahrenheit, the increase is of $2.96\%$ (from $60.8\text{°F}$ to $62.6\text{°F}$).
\end{example}
\item[Ratio] \marginnote{Numerical ratio data}
Values with an absolute 0 point.
\textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$
\begin{example}
Kelvin temperature scale, age, income, length.
For instance, there is a $10\%$ increase from 100\$ to 110\$.
				Converted to euros (1\geneuro\ = 1.06\$), the increase is still $10\%$ (from $94.34\geneuro$ to $103.77\geneuro$).
\end{example}
\end{description}
\end{description}
\subsection{Transformations}
\begin{center}
\begin{tabular}{c|c|>{\raggedright\arraybackslash}m{8cm}}
\hline
\multicolumn{2}{c|}{\textbf{Data type}} & \textbf{Transformation} \\
\hline
\multirow{2}{*}{Categorical} & Nominal & One-to-one transformations \\
\cline{2-3}
& Ordinal & Order preserving transformations (i.e. monotonic functions) \\
\hline
\multirow{2}{*}{Numerical} & Interval & Linear transformations \\
\cline{2-3}
& Ratio & Any mathematical function, standardization, variation in percentage \\
\hline
\end{tabular}
\end{center}
% \subsection{Dataset characteristics}
% \begin{description}
% \item[Dimensionality]
% \item[Sparsity]
% \item[Missing data]
% \item[Resolution]
% \end{description}
\subsection{Dataset format}
\begin{description}
\item[Relational table] \marginnote{Relational table}
The attributes of each record are the same.
\item[Data matrix] \marginnote{Data matrix}
Matrix with $N$ rows (entries) and $D$ columns (attributes).
\item[Sparse matrix] \marginnote{Sparse matrix}
Data matrix with lots of zeros.
\begin{example}[Bag-of-words]
Each row represents a document, each column represents a term.
The $i,j$-th cell contains the frequency of the $j$-th term in the $i$-th document.
\end{example}
\item[Transactional data] \marginnote{Transactional data}
Each record contains a set of objects (not necessarily a relational table).
\item[Graph data] \marginnote{Graph data}
Set of nodes and edges.
\item[Ordered data] \marginnote{Ordered data}
e.g. temporal data.
\end{description}
\subsection{Data quality}
\begin{description}
\item[Noise] \marginnote{Noise}
Alteration of the original values.
\item[Outliers] \marginnote{Outliers}
Data that considerably differ from the majority of the dataset.
May be caused by noise or rare events.
Box plots can be used to visually detect outliers.
\item[Missing values] \marginnote{Missing values}
Data that have not been collected.
Sometimes they are not easily recognizable
(e.g. when special values are used to mark missing data instead of \texttt{null}).
Can be handled in different ways:
\begin{itemize}
\item Ignore the records with missing values.
\item Estimate or default missing values.
\item Ignore the fact that some values are missing (not always applicable).
\item Insert all the possible values and weigh them by their probability.
\end{itemize}
\item[Duplicated data] \marginnote{Duplicated data}
Data that may be merged.
\end{description}
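For instance, some of these strategies for missing values can be sketched with \texttt{pandas} (the tiny table and the special value $-1$ are made up):
\begin{verbatim}
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": [25, np.nan, 40, -1],
                   "income": [30e3, 45e3, np.nan, 52e3]})
df = df.replace(-1, np.nan)       # special values used to mark missing data

dropped = df.dropna()             # ignore records with missing values
imputed = df.fillna(df.mean())    # estimate missing values (here: column mean)
print(dropped, imputed, sep="\n\n")
\end{verbatim}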

View File

@ -0,0 +1,56 @@
\chapter{Regression}
\begin{description}
\item[Linear regression] \marginnote{Linear regression}
Given:
\begin{itemize}
\item A dataset $\matr{X}$ of $N$ rows and $D$ features.
\item A response vector $\vec{y}$ of $N$ continuous values.
\end{itemize}
We want to learn the parameters $\vec{w} \in \mathbb{R}^D$ such that:
\[ \vec{y} \approx \matr{X}\vec{w}^T \]
\item[Mean squared error] \marginnote{Mean squared error}
To find the parameters for linear regression,
we minimize as loss function the mean squared error:
\[
\mathcal{L}(\vec{w}) = \Vert \matr{X}\vec{w}^T - \vec{y} \Vert^2
\]
Its gradient is:
\[ \nabla\mathcal{L}(\vec{w}) = 2\matr{X}^T(\matr{X}\vec{w}^T - \vec{y}) \]
		Setting it to 0, we obtain the normal equations:
\[ \matr{X}^T\matr{X}\vec{w}^T = \matr{X}^T\vec{y} \]
		If $\matr{X}^T\matr{X}$ is invertible, the system can be solved analytically, but the exact solution may overfit and is expensive to compute for large problems.
		Iterative numerical methods are therefore usually preferred.
Note that:
\begin{itemize}
\item MSE is influenced by the magnitude of the data.
\item It measures the fitness of a model in absolute terms.
% \item It is suited to compare different models.
\end{itemize}
\item[Coefficient of determination] \marginnote{Coefficient of determination}
Given:
\begin{itemize}
\item The mean of the observed data: $y_\text{avg} = \frac{1}{N} \sum_i \vec{y}_i$.
\item The sum of the squared residuals: $SS_\text{res} = \sum_i (\vec{y}_i - \vec{w}^T\vec{x}_i)^2$.
\item The total sum of squares: $SS_\text{tot} = \sum_i (\vec{y}_i - y_\text{avg})^2$.
\end{itemize}
The coefficient of determination is given by:
\[ \text{R}^2 = 1 - \frac{SS_\text{res}}{SS_\text{tot}} \]
Intuitively, $\text{R}^2$ compares the model with a horizontal straight line ($y_\text{avg}$).
When $\text{R}^2 = 1$, the model has a perfect fit.
		When $\text{R}^2 < 0$, the model fits worse than the horizontal line.
Note that:
\begin{itemize}
\item $\text{R}^2$ is a standardized index.
\item $\text{R}^2$ tells how well the variables of the predictor can explain the variation in the target.
\item $\text{R}^2$ is not suited for non-linear models.
\end{itemize}
\item[Polynomial regression] \marginnote{Polynomial regression}
Find a polynomial instead of a hyperplane.
\end{description}
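A minimal \texttt{numpy} sketch that fits the parameters numerically and computes $\text{R}^2$ (synthetic data with known weights):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
w_true = np.array([1.5, -2.0, 0.5])
y = X @ w_true + rng.normal(scale=0.1, size=200)

# Solve the normal equations X^T X w = X^T y via least squares.
w, *_ = np.linalg.lstsq(X, y, rcond=None)

y_hat = X @ w
ss_res = np.sum((y - y_hat) ** 2)
ss_tot = np.sum((y - y.mean()) ** 2)
r2 = 1 - ss_res / ss_tot
print(w, r2)
\end{verbatim}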