Add ML/DM data lake

This commit is contained in:
2023-10-14 14:42:09 +02:00
parent 4fa1e2e09c
commit 0c83b7bbc2
7 changed files with 245 additions and 2 deletions

Binary file not shown.

Binary file not shown.

After

Width:  |  Height:  |  Size: 322 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

View File

@ -0,0 +1,31 @@
<mxfile host="app.diagrams.net" modified="2023-10-13T17:44:38.951Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0" etag="0k6DN-mG6fDlB8POdY3R" version="22.0.4" type="device">
<diagram name="Pagina-1" id="Obl2eNAEIfPRNowj_f7H">
<mxGraphModel dx="1195" dy="622" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="j0uoPLtJFFh1yWsyPPyp-1" value="" style="endArrow=classic;html=1;rounded=0;strokeWidth=2;startArrow=classic;startFill=1;fontSize=20;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="180" y="400" as="sourcePoint" />
<mxPoint x="680" y="400" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-2" value="Data warehouse" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="180" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-3" value="&lt;div align=&quot;right&quot; style=&quot;font-size: 20px;&quot;&gt;Data lake&lt;br style=&quot;font-size: 20px;&quot;&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="530" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-4" value="Data hub" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="360" y="360" width="150" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-5" value="Hot" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="180" y="410" width="60" height="30" as="geometry" />
</mxCell>
<mxCell id="j0uoPLtJFFh1yWsyPPyp-6" value="Cold" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=20;" vertex="1" parent="1">
<mxGeometry x="620" y="410" width="60" height="30" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

View File

@ -14,6 +14,7 @@
\DeclareAcronym{dm}{short=DM, long=Data Mart}
\DeclareAcronym{etl}{short=ETL, long=Extraction{,} Transformation{,} Loading}
\DeclareAcronym{dfm}{short=DFM, long=Dimensional Fact Model}
\DeclareAcronym{cdc}{short=CDC, long=Change Data Capture}
\begin{document}

View File

@ -107,8 +107,8 @@
\section{\Acl{etl} (\Ac{etl})}
\marginnote{\Acl{etl} (\Ac{etl})}
\section{\Acl{etl} (\Acs{etl})}
\marginnote{\Acl{etl} (\Acs{etl})}
The \Ac{etl} process extracts, integrates and cleans operational data that will be loaded into a data warehouse.
@ -352,4 +352,215 @@ There are mainly two strategies:
\includegraphics[width=\textwidth]{img/logical_snowflake_schema.png}
\caption{Example of snowflake schema}
\end{figure}
\end{descriptionlist}
\section{Data lake}
\begin{description}
\item[Dark data] \marginnote{Dark data}
Acquired and stored data that are never used for decision-making processes.
\item[Data lake] \marginnote{Data lake}
Repository to store raw (unstructured) data.
It has the following features:
\begin{itemize}
\item Does not enforce a schema on write.
\item Allows flexible access and applies schemas on read.
\item Single source of truth.
\item Low cost and scalable.
\end{itemize}
\item[Storage]
Stored data can be classified as:
\begin{descriptionlist}
\item[Hot] \marginnote{Hot storage}
A low volume of highly requested data that require low latency.
More expensive HW/SW.
\item[Cold] \marginnote{Cold storage}
A large amount of data that do not have latency requirements.
Less expensive.
\end{descriptionlist}
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/_storage.pdf}
\caption{Data storage technologies}
\end{figure}
\end{description}
\subsection{Traditional vs insight-driven data systems}
\begin{tabular}{c | p{0.4\textwidth} | p{0.4\textwidth}}
& \textbf{\makecell[c]{Traditional (data warehouse)}} & \textbf{\makecell[c]{Insight-driven (data lake)}} \\
\hline
\textbf{Sources} & Structured data & Structured, semi-structured and unstructured data \\
\hline
\textbf{Storage} & Limited ingestion and storage capability & Virtually unlimited ingestion and storage capability \\
\hline
\textbf{Schema} & Schema designed upfront & Schema not fixed \\
\hline
\textbf{Transformations} & \ac{etl} upfront & Transformations on query \\
\hline
\textbf{Analytics} & SQL, \ac{bi} tools, full-text search & Traditional methods, self-service \ac{bi}, big data, machine learning, \dots \\
\hline
\textbf{Price} & High storage cost & Low storage cost \\
\hline
\textbf{Performance} & Fast queries & Scalability/speed/cost tradeoffs \\
\hline
\textbf{Quality} & High data quality & Depends on the use case \\
\end{tabular}
\subsection{Data architecture evolution}
\begin{description}
\item[Traditional data warehouse] \marginnote{Traditional data warehouse}
(i.e.\ in-house data warehouse)
\begin{itemize}
\item Structured data with predefined schemas.
\item High setup and maintenance cost. Not scalable.
\item Relational high-quality data.
\item Slow data ingestion.
\end{itemize}
\item[Modern cloud data warehouse] \marginnote{Modern cloud data warehouse}
\phantom{}
\begin{itemize}
\item Structured and semi-structured data.
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
\item Relational high-quality data and mixed data.
\item Fast data ingestion if supported.
\end{itemize}
\item[On-premise big data] \marginnote{On-premise big data}
(i.e.\ in-house data lake)
\begin{itemize}
\item Any type of data with schemas on read.
\item High setup and maintenance cost.
\item Fast data ingestion.
\end{itemize}
\item[Cloud data lake] \marginnote{Cloud data lake}
\phantom{}
\begin{itemize}
\item Any type of data with schemas on read.
\item Low setup and maintenance cost. Scalable and easier disaster recovery.
\item Fast data ingestion.
\end{itemize}
\end{description}
\subsection{Components}
\begin{description}
\item[Data ingestion] \marginnote{Data ingestion}
\phantom{}
\begin{descriptionlist}
\item[Workload migration]
Inserting all the data from an existing source.
\item[Incremental ingestion]
Inserting changes since the last ingestion.
\item[Streaming ingestion]
Continuously inserting data.
\end{descriptionlist}
\begin{description}
\item[\Acl{cdc} (\Acs{cdc})] \marginnote{\Acl{cdc} (\Acs{cdc})}
Mechanism to detect changes and insert the new data into the data lake (possibly in real-time).
\end{description}
\item[Storage]
\phantom{}
\begin{descriptionlist}
\item[Raw] \marginnote{Raw storage}
Immutable data useful for disaster recovery.
\item[Optimized] \marginnote{Optimized storage}
Raw data optimized for faster queries.
\item[Analytics] \marginnote{Analytics storage}
Ready-to-use data.
\end{descriptionlist}
\begin{description}
\item[Columnar storage] \phantom{}
\begin{itemize}
\item Homogeneous data are stored contiguously.
\item Speeds up methods that process entire columns (i.e.\ all the values of a feature).
\item Insertion becomes slower.
\end{itemize}
\item[Data catalog]
Methods to add descriptive metadata to a data lake.
This is useful to prevent an unorganized data lake (data swamp).
\end{description}
\item[Processing and analytics] \marginnote{Processing and analytics}
\phantom{}
\begin{descriptionlist}
\item[Interactive analytics]
Interactive queries on large volumes of data.
The results are stored back in the data lake.
\item[Big data analytics]
Data aggregations and transformations.
\item[Real-time analytics]
Streaming analysis.
\end{descriptionlist}
\end{description}
\subsection{Architectures}
\begin{description}
\item[Lambda lake] \marginnote{Lambda lake}
\phantom{}
\begin{description}
\item[Batch layer] Receives and stores the data. Prepares the batch views for the serving layer.
\item[Serving layer] Indexes batch views for faster queries.
\item[Speed layer] Receives the data and prepares real-time views. The views are also stored in the serving layer.
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/lambda_lake.png}
\caption{Lambda lake architecture}
\end{figure}
\item[Kappa lake] \marginnote{Kappa lake}
The data are stored in a long-term store.
Computations only happen in the speed layer (avoids lambda lake redundancy between batch layer and speed layer).
\begin{figure}[ht]
\centering
\includegraphics[width=0.5\textwidth]{img/kappa_lake.png}
\caption{Kappa lake architecture}
\end{figure}
\item[Delta lake] \marginnote{Delta lake}
Framework that adds features on top of an existing data lake.
\begin{itemize}
\item ACID transactions
\item Scalable metadata handling
\item Data versioning
\item Unified batch and streaming
\item Schema enforcement
\end{itemize}
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/delta_lake.png}
\caption{Delta lake architecture}
\end{figure}
\end{description}
\subsection{Metadata}
\marginnote{Metadata}
Metadata are used to organize a data lake.
Useful metadata are:
\begin{descriptionlist}
\item[Source] Origin of the data.
\item[Schema] Structure of the data.
\item[Format] File format or encoding.
\item[Quality metrics] (e.g.\ percentage of missing values).
\item[Lifecycle] Retention policies and archiving rules.
\item[Ownership]
\item[Lineage] History of applied transformations or dependencies.
\item[Access control]
\item[Classification] Sensitivity level of the data.
\item[Usage information] Record of who accessed the data and how it is used.
\end{descriptionlist}