diff --git a/src/machine-learning-and-data-mining/img/_olap_cube.pdf b/src/machine-learning-and-data-mining/img/_olap_cube.pdf new file mode 100644 index 0000000..6bd59c6 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_olap_cube.pdf differ diff --git a/src/machine-learning-and-data-mining/img/olap_drillacross.png b/src/machine-learning-and-data-mining/img/olap_drillacross.png new file mode 100644 index 0000000..a9cb9d9 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_drillacross.png differ diff --git a/src/machine-learning-and-data-mining/img/olap_drilldown.png b/src/machine-learning-and-data-mining/img/olap_drilldown.png new file mode 100644 index 0000000..6c79fe7 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_drilldown.png differ diff --git a/src/machine-learning-and-data-mining/img/olap_drillthrough.png b/src/machine-learning-and-data-mining/img/olap_drillthrough.png new file mode 100644 index 0000000..56ce8b0 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_drillthrough.png differ diff --git a/src/machine-learning-and-data-mining/img/olap_pivot.png b/src/machine-learning-and-data-mining/img/olap_pivot.png new file mode 100644 index 0000000..94221d1 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_pivot.png differ diff --git a/src/machine-learning-and-data-mining/img/olap_rollup.png b/src/machine-learning-and-data-mining/img/olap_rollup.png new file mode 100644 index 0000000..fe12692 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_rollup.png differ diff --git a/src/machine-learning-and-data-mining/img/olap_slicedice.png b/src/machine-learning-and-data-mining/img/olap_slicedice.png new file mode 100644 index 0000000..147631a Binary files /dev/null and b/src/machine-learning-and-data-mining/img/olap_slicedice.png differ diff --git a/src/machine-learning-and-data-mining/main.tex 
b/src/machine-learning-and-data-mining/main.tex index 965eb35..2d32b14 100644 --- a/src/machine-learning-and-data-mining/main.tex +++ b/src/machine-learning-and-data-mining/main.tex @@ -3,15 +3,16 @@ \title{Machine Learning and Data Mining} \date{2023 -- 2024} -\DeclareAcronym{oltp}{short=OLTP, long=On-Line Transaction Processing} +\DeclareAcronym{oltp}{short=OLTP, long=Online Transaction Processing} \DeclareAcronym{erp}{short=ERP, long=Enterprise Resource Planning} \DeclareAcronym{mis}{short=MIS, long=Management Information System} \DeclareAcronym{dss}{short=DSS, long=Decision Support System} \DeclareAcronym{eis}{short=EIS, long=Executive Information System} -\DeclareAcronym{olap}{short=OLAP, long=On-Line Analysical Processing} +\DeclareAcronym{olap}{short=OLAP, long=Online Analytical Processing} \DeclareAcronym{bi}{short=BI, long=Business Intelligence} \DeclareAcronym{dwh}{short=DWH, long=Data Warehouse} \DeclareAcronym{dm}{short=DM, long=Data Mart} +\DeclareAcronym{etl}{short=ETL, long=Extraction{,} Transformation{,} Loading} \begin{document} diff --git a/src/machine-learning-and-data-mining/sections/_bi.tex b/src/machine-learning-and-data-mining/sections/_bi.tex index f91f081..f34d1c4 100644 --- a/src/machine-learning-and-data-mining/sections/_bi.tex +++ b/src/machine-learning-and-data-mining/sections/_bi.tex @@ -19,4 +19,173 @@ \item[\Ac{dm}] \marginnote{\Acl{dm}} Subset of the primary \ac{dwh} with information relevant to a specific business area. -\end{description} \ No newline at end of file +\end{description} + + + +\section{\Acl{olap} (\Ac{olap})} + +\begin{description} + \item[\ac{olap} analyses] \marginnote{\Acl{olap} (\Ac{olap})} + Interactively navigate the information in a data warehouse. + Allows visualizing different levels of aggregation. + + \item[\ac{olap} session] + Navigation path created by the operations of a user. 
+\end{description} + +\begin{figure}[ht] + \centering + \includegraphics[width=0.35\textwidth]{img/_olap_cube.pdf} + \caption{\ac{olap} data cube} +\end{figure} + + +\subsection{Operators} + +\begin{description} + \item[Roll-up] \marginnote{Roll-up} + Increases the level of aggregation (i.e. \texttt{GROUP BY} in SQL). Some details are collapsed together. + + \item[Drill-down] \marginnote{Drill-down} + Reduces the level of aggregation. Some details are reintroduced. + + \item[Slice-and-dice] \marginnote{Slice-and-dice} + The slice operator reduces the number of dimensions (i.e. drops columns). + + The dice operator reduces the amount of data being analyzed (i.e. \texttt{LIMIT} in SQL). + + \item[Pivot] \marginnote{Pivot} + Changes the layout of the data to analyze it from a different viewpoint. + + \item[Drill-across] \marginnote{Drill-across} + Links concepts from different data sources (i.e. \texttt{JOIN} in SQL). + + \item[Drill-through] \marginnote{Drill-through} + Switches from multidimensional aggregated data to operational data (e.g. a spreadsheet). 
+\end{description} + +\begin{figure}[ht] + \begin{subfigure}{.33\textwidth} + \centering + \includegraphics[width=.60\linewidth]{img/olap_rollup.png} + \caption{\ac{olap} roll-up} + \end{subfigure}% + \begin{subfigure}{.33\textwidth} + \centering + \includegraphics[width=.60\linewidth]{img/olap_drilldown.png} + \caption{\ac{olap} drill-down} + \end{subfigure}% + \begin{subfigure}{.33\textwidth} + \centering + \includegraphics[width=.80\linewidth]{img/olap_slicedice.png} + \caption{\ac{olap} slice-and-dice} + \end{subfigure} + \\ + \begin{subfigure}{.5\textwidth} + \centering + \includegraphics[width=.35\linewidth]{img/olap_pivot.png} + \caption{\ac{olap} pivot} + \end{subfigure}% + \begin{subfigure}{.5\textwidth} + \centering + \includegraphics[width=.35\linewidth]{img/olap_drillacross.png} + \caption{\ac{olap} drill-across} + \end{subfigure} + \\ + \begin{subfigure}{\textwidth} + \centering + \includegraphics[width=.60\linewidth]{img/olap_drillthrough.png} + \caption{\ac{olap} drill-through} + \end{subfigure} +\end{figure} + + + +\section{\Acl{etl} (\Ac{etl})} +\marginnote{\Acl{etl} (\Ac{etl})} +The \Ac{etl} process extracts, integrates and cleans operational data that will be loaded into a data warehouse. + + +\subsection{Extraction} + +Extracted operational data can be: +\begin{descriptionlist} + \item[Structured] \marginnote{Structured data} + with a predefined data model (e.g. relational DB, CSV) + + \item[Unstructured] \marginnote{Unstructured data} + without a predefined data model (e.g. social media content) +\end{descriptionlist} + +Extraction can be of two types: +\begin{descriptionlist} + \item[Static] \marginnote{Static extraction} + The entirety of the operational data is extracted to populate the + data warehouse for the first time. + + \item[Incremental] \marginnote{Incremental extraction} + Only changes applied since the last extraction are considered. + Can be based on a timestamp or a trigger. 
+\end{descriptionlist} + + +\subsection{Cleaning} + +Operational data may contain: +\begin{descriptionlist} + \item[Duplicate data] + \item[Missing data] + \item[Improper use of fields] (e.g. saving the phone number in the \texttt{notes} field) + \item[Wrong values] (e.g. 30th of February) + \item[Inconsistency] (e.g. use of different abbreviations) + \item[Typos] +\end{descriptionlist} + +Methods to increase the quality of the data are: +\begin{descriptionlist} + \item[Dictionary-based techniques] \marginnote{Dictionary-based cleaning} + Lookup tables to substitute abbreviations, synonyms or typos. + Applicable if the domain is known and limited. + + \item[Approximate merging] \marginnote{Approximate merging} + Merging data that do not have a common key. + \begin{description} + \item[Approximate join] + Use non-key attributes to join two tables (e.g. using the name and surname instead of an identifier). + + \item[Similarity approach] + Use similarity functions (e.g. edit distance) to merge multiple instances of the same information + (e.g. typo in customer surname). + \end{description} + + \item[Ad-hoc algorithms] \marginnote{Ad-hoc algorithms} +\end{descriptionlist} + + +\subsection{Transformation} +Data are transformed to respect the format of the data warehouse: +\begin{descriptionlist} + \item[Conversion] \marginnote{Conversion} + modifications of types and formats (e.g. date format) + + \item[Enrichment] \marginnote{Enrichment} + creating new information by using existing attributes (e.g. compute profit from receipts and expenses) + + \item[Separation and concatenation] \marginnote{Separation and concatenation} + Denormalization of the data: introduces redundancies (i.e. breaks normal form\footnote{\url{https://en.wikipedia.org/wiki/Database_normalization}}) + to speed up operations. 
+\end{descriptionlist} + + +\subsection{Loading} +Adding data into a data warehouse: +\begin{descriptionlist} + \item[Refresh] \marginnote{Refresh loading} + The entire \ac{dwh} is rewritten. + + \item[Update] \marginnote{Update loading} + Only the changes are added to the \ac{dwh}. Old data is not modified. +\end{descriptionlist} + +