diff --git a/src/ainotes.cls b/src/ainotes.cls index f91e37f..d350d25 100644 --- a/src/ainotes.cls +++ b/src/ainotes.cls @@ -15,9 +15,10 @@ \usepackage[bottom]{footmisc} \usepackage{scrlayer-scrpage} \usepackage{scrhack, algorithm, listings} -\usepackage{array, makecell} +\usepackage{array, makecell, multirow} \usepackage{acro} \usepackage{subcaption} +\usepackage{eurosym} \geometry{ margin=3cm, lmargin=1.5cm, rmargin=4.5cm, marginparwidth=3cm } \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black, linktoc=all } diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex index 208abc4..1dd47b7 100644 --- a/src/machine-learning-and-data-mining/main.tex +++ b/src/machine-learning-and-data-mining/main.tex @@ -28,5 +28,6 @@ \input{sections/_data_warehouse.tex} \input{sections/_data_lake.tex} \input{sections/_crisp.tex} + \input{sections/_machine_learning.tex} \end{document} \ No newline at end of file diff --git a/src/machine-learning-and-data-mining/sections/_machine_learning.tex b/src/machine-learning-and-data-mining/sections/_machine_learning.tex new file mode 100644 index 0000000..245e0bc --- /dev/null +++ b/src/machine-learning-and-data-mining/sections/_machine_learning.tex @@ -0,0 +1,172 @@ +\chapter{Machine learning} + +\begin{description} + \item[Machine learning] \marginnote{Machine learning} + Application of methods and algorithms to extract patterns from data. +\end{description} + +\section{Tasks} +\begin{description} + \item[Classification] Estimation of a finite number of classes. + \item[Regression] Estimation of a numeric value. + \item[Similarity matching] Identify similar individuals. + \item[Clustering] Grouping individuals based on their similarities. + \item[Co-occurrence grouping] Identify associations between entities based on the transactions in which they appear together. + \item[Profiling] Behavior description. 
+ \item[Link analysis] Analysis of connections (e.g. in a graph). + \item[Data reduction] Reduce the dimensionality of data with minimal information loss. + \item[Causal modeling] Understand the connections between events and actions. +\end{description} + + +\section{Categories} +\begin{description} + \item[Supervised learning] \marginnote{Supervised learning} + Problem where the target(s) is defined. + \item[Unsupervised learning] \marginnote{Unsupervised learning} + Problem where no specific target is known. + \item[Reinforcement learning] \marginnote{Reinforcement learning} + Learn a policy to generate a sequence of actions. +\end{description} + + + +\section{Data} + +\begin{description} + \item[Dataset] \marginnote{Dataset} + Set of $N$ individuals, each described by $D$ features. +\end{description} + + +\subsection{Data types} + +\begin{description} + \item[Categorical] Values with a discrete domain. + \begin{description} + \item[Nominal] \marginnote{Categorical nominal data} + The values are a set of non-ordered labels. + + \textbf{Operators.} $=$, $\neq$ + \begin{example} + Name, surname, zip code. + \end{example} + + \item[Ordinal] \marginnote{Categorical ordinal data} + The values are a set of totally ordered labels. + + \textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$ + \begin{example} + Non-numerical quality evaluations (excellent, good, fair, poor, bad). + \end{example} + \end{description} + + \item[Numerical] Values with a continuous domain. + \begin{description} + \item[Interval] \marginnote{Numerical interval data} + Numerical values without a univocal definition of 0 (i.e. 0 is not used as reference). + It is not reasonable to compare the magnitude of this type of data. + + \textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$ + \begin{example} + Celsius and Fahrenheit temperature scales, CGPA, time. 
+ + For instance, there is a $6.25\%$ increase from $16\text{°C}$ to $17\text{°C}$, but + converted to Fahrenheit, the increase is $2.96\%$ (from $60.8\text{°F}$ to $62.6\text{°F}$). + \end{example} + + \item[Ratio] \marginnote{Numerical ratio data} + Values with an absolute 0 point. + + \textbf{Operators.} $=$, $\neq$, $<$, $>$, $\leq$, $\geq$, $+$, $-$, $\times$, $\div$ + \begin{example} + Kelvin temperature scale, age, income, length. + + For instance, there is a $10\%$ increase from 100\$ to 110\$. + Converted to euro (1\geneuro = 1.06\$), the increase is still $10\%$ (from $94.34\geneuro$ to $103.77\geneuro$). + \end{example} + \end{description} +\end{description} + + +\subsection{Transformations} +\begin{center} + + \begin{tabular}{c|c|>{\raggedright\arraybackslash}m{8cm}} + \hline + \multicolumn{2}{c|}{\textbf{Data type}} & \textbf{Transformation} \\ + \hline + \multirow{2}{*}{Categorical} & Nominal & One-to-one transformations \\ + \cline{2-3} + & Ordinal & Order preserving transformations (i.e. monotonic functions) \\ + \hline + \multirow{2}{*}{Numerical} & Interval & Linear transformations \\ + \cline{2-3} + & Ratio & Any mathematical function, standardization, variation in percentage \\ + \hline + \end{tabular} +\end{center} + + +% \subsection{Dataset characteristics} +% \begin{description} +% \item[Dimensionality] +% \item[Sparsity] +% \item[Missing data] +% \item[Resolution] +% \end{description} + + +\subsection{Dataset format} +\begin{description} + \item[Relational table] \marginnote{Relational table} + The attributes of each record are the same. + + \item[Data matrix] \marginnote{Data matrix} + Matrix with $N$ rows (entries) and $D$ columns (attributes). + + \item[Sparse matrix] \marginnote{Sparse matrix} + Data matrix with lots of zeros. + \begin{example}[Bag-of-words] + Each row represents a document, each column represents a term. + The $i,j$-th cell contains the frequency of the $j$-th term in the $i$-th document. 
+ \end{example} + + \item[Transactional data] \marginnote{Transactional data} + Each record contains a set of objects (not necessarily a relational table). + + \item[Graph data] \marginnote{Graph data} + Set of nodes and edges. + + \item[Ordered data] \marginnote{Ordered data} + e.g. temporal data. +\end{description} + + +\subsection{Data quality} +\begin{description} + \item[Noise] \marginnote{Noise} + Alteration of the original values. + + \item[Outliers] \marginnote{Outliers} + Data that considerably differ from the majority of the dataset. + May be caused by noise or rare events. + + Box plots can be used to visually detect outliers. + + \item[Missing values] \marginnote{Missing values} + Data that have not been collected. + Sometimes they are not easily recognizable + (e.g. when special values are used, instead of \texttt{null}, to mark missing data). + + Can be handled in different ways: + \begin{itemize} + \item Ignore the records with missing values. + \item Estimate or default missing values. + \item Ignore the fact that some values are missing (not always applicable). + \item Insert all the possible values and weight them by their probability. + \end{itemize} + + \item[Duplicated data] \marginnote{Duplicated data} + Data that may be merged. +\end{description} \ No newline at end of file