diff --git a/src/machine-learning-and-data-mining/img/_storage.pdf b/src/machine-learning-and-data-mining/img/_storage.pdf new file mode 100644 index 0000000..874697f Binary files /dev/null and b/src/machine-learning-and-data-mining/img/_storage.pdf differ diff --git a/src/machine-learning-and-data-mining/img/delta_lake.png b/src/machine-learning-and-data-mining/img/delta_lake.png new file mode 100644 index 0000000..bc30e69 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/delta_lake.png differ diff --git a/src/machine-learning-and-data-mining/img/kappa_lake.png b/src/machine-learning-and-data-mining/img/kappa_lake.png new file mode 100644 index 0000000..1829921 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/kappa_lake.png differ diff --git a/src/machine-learning-and-data-mining/img/lambda_lake.png b/src/machine-learning-and-data-mining/img/lambda_lake.png new file mode 100644 index 0000000..e8c5c32 Binary files /dev/null and b/src/machine-learning-and-data-mining/img/lambda_lake.png differ diff --git a/src/machine-learning-and-data-mining/img/storage.drawio b/src/machine-learning-and-data-mining/img/storage.drawio new file mode 100644 index 0000000..45745f5 --- /dev/null +++ b/src/machine-learning-and-data-mining/img/storage.drawio @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/src/machine-learning-and-data-mining/main.tex b/src/machine-learning-and-data-mining/main.tex index 3e2f4de..6cfcf6a 100644 --- a/src/machine-learning-and-data-mining/main.tex +++ b/src/machine-learning-and-data-mining/main.tex @@ -14,6 +14,7 @@ \DeclareAcronym{dm}{short=DM, long=Data Mart} \DeclareAcronym{etl}{short=ETL, long=Extraction{,} Transformation{,} Loading} \DeclareAcronym{dfm}{short=DFM, long=Dimensional Fact Model} +\DeclareAcronym{cdc}{short=CDC, long=Change Data Capture} \begin{document} diff --git a/src/machine-learning-and-data-mining/sections/_bi.tex 
b/src/machine-learning-and-data-mining/sections/_bi.tex index 3ddc65d..fd52e63 100644 --- a/src/machine-learning-and-data-mining/sections/_bi.tex +++ b/src/machine-learning-and-data-mining/sections/_bi.tex @@ -107,8 +107,8 @@ -\section{\Acl{etl} (\Ac{etl})} -\marginnote{\Acl{etl} (\Ac{etl})} +\section{\Acl{etl} (\Acs{etl})} +\marginnote{\Acl{etl} (\Acs{etl})} The \Ac{etl} process extracts, integrates and cleans operational data that will be loaded into a data warehouse. @@ -352,4 +352,215 @@ There are mainly two strategies: \includegraphics[width=\textwidth]{img/logical_snowflake_schema.png} \caption{Example of snowflake schema} \end{figure} +\end{descriptionlist} + + + +\section{Data lake} +\begin{description} + \item[Dark data] \marginnote{Dark data} + Acquired and stored data that are never used for decision-making processes. + + \item[Data lake] \marginnote{Data lake} + Repository to store raw (unstructured) data. + It has the following features: + \begin{itemize} + \item Does not enforce a schema on write. + \item Allows flexible access and applies schemas on read. + \item Single source of truth. + \item Low cost and scalable. + \end{itemize} + + \item[Storage] + Stored data can be classified as: + \begin{descriptionlist} + \item[Hot] \marginnote{Hot storage} + A low volume of highly requested data that require low latency. + More expensive HW/SW. + \item[Cold] \marginnote{Cold storage} + A large amount of data that does not have latency requirements. + Less expensive. 
+ \end{descriptionlist} + + \begin{figure}[ht] + \centering + \includegraphics[width=0.5\textwidth]{img/_storage.pdf} + \caption{Data storage technologies} + \end{figure} +\end{description} + + +\subsection{Traditional vs insight-driven data systems} +\begin{tabular}{c | p{0.4\textwidth} | p{0.4\textwidth}} + & \textbf{\makecell[c]{Traditional (data warehouse)}} & \textbf{\makecell[c]{Insight-driven (data lake)}} \\ + \hline + \textbf{Sources} & Structured data & Structured, semi-structured and unstructured data \\ + \hline + \textbf{Storage} & Limited ingestion and storage capability & Virtually unlimited ingestion and storage capability \\ + \hline + \textbf{Schema} & Schema designed upfront & Schema not fixed \\ + \hline + \textbf{Transformations} & \ac{etl} upfront & Transformations on query \\ + \hline + \textbf{Analytics} & SQL, \ac{bi} tools, full-text search & Traditional methods, self-service \ac{bi}, big data, machine learning, \dots \\ + \hline + \textbf{Price} & High storage cost & Low storage cost \\ + \textbf{Performance} & Fast queries & Scalability/speed/cost tradeoffs \\ + \hline + \textbf{Quality} & High data quality & Depends on the use case \\ +\end{tabular} + + +\subsection{Data architecture evolution} +\begin{description} + \item[Traditional data warehouse] \marginnote{Traditional data warehouse} + (i.e. in-house data warehouse) + \begin{itemize} + \item Structured data with predefined schemas. + \item High setup and maintenance cost. Not scalable. + \item Relational high-quality data. + \item Slow data ingestion. + \end{itemize} + + \item[Modern cloud data warehouse] \marginnote{Modern cloud data warehouse} + \phantom{} + \begin{itemize} + \item Structured and semi-structured data. + \item Low setup and maintenance cost. Scalable and easier disaster recovery. + \item Relational high-quality data and mixed data. + \item Fast data ingestion if supported. + \end{itemize} + + \item[On-premise big data] \marginnote{On-premise big data} + (i.e. 
+ in-house data lake) + \begin{itemize} + \item Any type of data with schemas on read. + \item High setup and maintenance cost. + \item Fast data ingestion. + \end{itemize} + + \item[Cloud data lake] \marginnote{Cloud data lake} + \phantom{} + \begin{itemize} + \item Any type of data with schemas on read. + \item Low setup and maintenance cost. Scalable and easier disaster recovery. + \item Fast data ingestion. + \end{itemize} +\end{description} + + +\subsection{Components} +\begin{description} + \item[Data ingestion] \marginnote{Data ingestion} + \phantom{} + \begin{descriptionlist} + \item[Workload migration] + Inserting all the data from an existing source. + \item[Incremental ingestion] + Inserting changes since the last ingestion. + \item[Streaming ingestion] + Continuously inserting data. + \end{descriptionlist} + + \begin{description} + \item[\Acl{cdc} (\Acs{cdc})] \marginnote{\Acl{cdc} (\Acs{cdc})} + Mechanism to detect changes and insert the new data into the data lake (possibly in real-time). + \end{description} + + \item[Storage] + \phantom{} + \begin{descriptionlist} + \item[Raw] \marginnote{Raw storage} + Immutable data useful for disaster recovery. + \item[Optimized] \marginnote{Optimized storage} + Optimized raw data for faster queries. + \item[Analytics] \marginnote{Analytics storage} + Ready-to-use data. + \end{descriptionlist} + + \begin{description} + \item[Columnar storage] \phantom{} + \begin{itemize} + \item Homogeneous data are stored contiguously. + \item Speeds up methods that process entire columns (i.e. all the values of a feature). + \item Insertion becomes slower. + \end{itemize} + + \item[Data catalog] + Methods to add descriptive metadata to a data lake. + This is useful to prevent an unorganized data lake (data swamp). + \end{description} + + + \item[Processing and analytics] \marginnote{Processing and analytics} + \phantom{} + \begin{descriptionlist} + \item[Interactive analytics] + Interactive queries to large volumes of data. 
+ The results are stored back in the data lake. + \item[Big data analytics] + Data aggregations and transformations. + \item[Real-time analytics] + Streaming analysis. + \end{descriptionlist} +\end{description} + + +\subsection{Architectures} + +\begin{description} + \item[Lambda lake] \marginnote{Lambda lake} + \phantom{} + \begin{description} + \item[Batch layer] Receives and stores the data. Prepares the batch views for the serving layer. + \item[Serving layer] Indexes batch views for faster queries. + \item[Speed layer] Receives the data and prepares real-time views. The views are also stored in the serving layer. + \end{description} + \begin{figure}[ht] + \centering + \includegraphics[width=0.5\textwidth]{img/lambda_lake.png} + \caption{Lambda lake architecture} + \end{figure} + + \item[Kappa lake] \marginnote{Kappa lake} + The data are stored in a long-term store. + Computations only happen in the speed layer (avoids lambda lake redundancy between batch layer and speed layer). + \begin{figure}[ht] + \centering + \includegraphics[width=0.5\textwidth]{img/kappa_lake.png} + \caption{Kappa lake architecture} + \end{figure} + + \item[Delta lake] \marginnote{Delta lake} + Framework that adds features on top of an existing data lake. + \begin{itemize} + \item ACID transactions + \item Scalable metadata handling + \item Data versioning + \item Unified batch and streaming + \item Schema enforcement + \end{itemize} + \begin{figure}[ht] + \centering + \includegraphics[width=0.7\textwidth]{img/delta_lake.png} + \caption{Delta lake architecture} + \end{figure} +\end{description} + + +\subsection{Metadata} +\marginnote{Metadata} +Metadata are used to organize a data lake. +Useful metadata are: +\begin{descriptionlist} + \item[Source] Origin of the data. + \item[Schema] Structure of the data. + \item[Format] File format or encoding. + \item[Quality metrics] (e.g. percentage of missing values). + \item[Lifecycle] Retention policies and archiving rules. 
+ \item[Ownership] + \item[Lineage] History of applied transformations or dependencies. + \item[Access control] + \item[Classification] Sensitivity level of the data. + \item[Usage information] Record of who accessed the data and how it is used. \end{descriptionlist} \ No newline at end of file