Moved SMM in year1

2023-12-27 17:49:28 +01:00
parent c98859ed9e
commit 3dc77a448a
33 changed files with 1 addition and 1 deletion

@@ -0,0 +1 @@
../../ainotes.cls


@@ -0,0 +1,88 @@
<mxfile host="app.diagrams.net" modified="2023-09-22T09:37:27.395Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0" etag="3qzh6VvLSaXopiRghqnY" version="21.7.0" type="device">
<diagram name="Pagina-1" id="mETDQKEhh33VIil_YAIY">
<mxGraphModel dx="819" dy="401" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="AFoxFzemWGuV3oYDkwgm-1" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="200" y="300" width="150" height="150" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-2" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="280" y="340" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-3" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="280" y="400" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-4" value="U&amp;nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="240" y="330" width="40" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-5" value="&lt;div align=&quot;right&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;U&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;+ΔU&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="230" y="390" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-6" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="420" y="300" width="150" height="150" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-7" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="480" y="340" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-8" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="480" y="400" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-9" value="&amp;nbsp;V" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="490" y="330" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-10" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;V&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;+ΔV&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="490" y="390" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-11" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-2" target="AFoxFzemWGuV3oYDkwgm-7" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="410" y="420" as="sourcePoint" />
<mxPoint x="460" y="370" as="targetPoint" />
<Array as="points">
<mxPoint x="390" y="310" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-12" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="300" y="355" as="sourcePoint" />
<mxPoint x="530" y="355" as="targetPoint" />
<Array as="points">
<mxPoint x="390" y="360" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-13" value="&lt;font face=&quot;Times New Roman&quot; size=&quot;1&quot;&gt;&lt;i&gt;&lt;font style=&quot;font-size: 15px;&quot;&gt;f&lt;/font&gt;&lt;/i&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="330" y="290" width="110" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-15" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-7" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="450" y="380" as="sourcePoint" />
<mxPoint x="500" y="330" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-16" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;exitX=0.5;exitY=0;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-2" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="270" y="410" as="sourcePoint" />
<mxPoint x="320" y="360" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-17" value="&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;Δ&lt;/font&gt;U&amp;nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="240" y="360" width="40" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-18" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;ΔV&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="490" y="360" width="30" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-19" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;Inherent error &lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="580" y="355" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="x--qwbr77Wqyja1BnvlK-2" value="" style="endArrow=classic;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;exitX=0;exitY=0.5;exitDx=0;exitDy=0;strokeWidth=2;" edge="1" parent="1" source="AFoxFzemWGuV3oYDkwgm-19" target="AFoxFzemWGuV3oYDkwgm-18">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="600" y="375" as="sourcePoint" />
<mxPoint x="450" y="370" as="targetPoint" />
</mxGeometry>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>


@@ -0,0 +1,11 @@
{
"name": "Statistical and Mathematical Methods for Artificial Intelligence",
"year": 1,
"semester": 1,
"pdfs": [
{
"name": null,
"path": "smm.pdf"
}
]
}

@@ -0,0 +1,208 @@
\chapter{Finite numbers}
\section{Sources of error}
\begin{description}
\item[Measurement error] \marginnote{Measurement error}
Limited precision of the measuring instrument.
\item[Arithmetic error] \marginnote{Arithmetic error}
Propagation of rounding errors in each step of an algorithm.
\item[Truncation error] \marginnote{Truncation error}
Approximation of an infinite procedure with a finite number of iterations.
\item[Inherent error] \marginnote{Inherent error}
Caused by the finite representation of the data (floating-point).
\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{img/_inherent_error.pdf}
\caption{Inherent error visualization}
\end{figure}
\end{description}
\section{Error measurement}
Let $x$ be a value and $\hat{x}$ its approximation. Then:
\begin{descriptionlist}
\item[Absolute error]
\[
E_{a} = \hat{x} - x
\marginnote{Absolute error}
\]
Note that, without knowing the magnitude of $x$, the absolute error is not informative.
\item[Relative error]
\[
E_{r} = \frac{\hat{x} - x}{x}
\marginnote{Relative error}
\]
\end{descriptionlist}
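As a quick sanity check, both errors can be computed directly (a Python sketch; the values are illustrative):
\begin{lstlisting}
x     = 12.333333  # true value
x_hat = 12.333     # approximation
E_a = x_hat - x         # absolute error
E_r = (x_hat - x) / x   # relative error
print(E_a, E_r)         # ~-3.3e-4, ~-2.7e-5
\end{lstlisting}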
\section{Representation in base \texorpdfstring{$\beta$}{B}}
Let $\beta \in \mathbb{N}_{> 1}$ be the base.
Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
\begin{equation}
\label{eq:finnum_b_representation}
x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots + d_n\beta^{-n})\beta^p
\end{equation}
where:
\begin{itemize}
\item $0 \leq d_i \leq \beta-1$
\item $d_1 \neq 0$
\item starting from an index $i$, not all $d_j$ ($j \geq i$) are equal to $\beta-1$
\end{itemize}
%
\Cref{eq:finnum_b_representation} can be represented using the normalized scientific notation as: \marginnote{Normalized scientific notation}
\[
x = \pm (0.d_1d_2\dots) \beta^p
\]
where $0.d_1d_2\dots$ is the \textbf{mantissa} and $p$ the \textbf{exponent}. \marginnote{Mantissa\\Exponent}
\section{Floating-point}
A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the parameters: \marginnote{Floating-point}
\begin{itemize}
\item $\beta$: base
\item $t$: precision (number of digits in the mantissa)
\item $[L, U]$: range of the exponent
\end{itemize}
Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
\begin{eqnarray}
x = \pm (0.d_1d_2 \dots d_t) \beta^p & L \leq p \leq U
\end{eqnarray}
We denote with $\texttt{fl}(x)$ the representation of $x \in \mathbb{R}$ in a given floating-point system.
\begin{example}
In $\mathcal{F}(10, 5, -3, 3)$, $x=12.\bar{3}$ is represented as:
\begin{equation*}
\texttt{fl}(x) = + 0.12333 \cdot 10^2
\end{equation*}
\end{example}
\subsection{Numbers distribution}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the total amount of representable numbers is:
\begin{equation*}
2(\beta-1) \beta^{t-1} (U-L+1)+1
\end{equation*}
%
Representable numbers are more sparse towards the exponent upper bound and more dense towards the lower bound.
It must be noted that there is an underflow area around 0.
\begin{figure}[h]
\centering
\includegraphics[width=0.8\textwidth]{img/floatingpoint_range.png}
\caption{Floating-point numbers in $\mathcal{F}(2, 3, -1, 2)$}
\end{figure}
\subsection{Number representation}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
\begin{descriptionlist}
\item[Exact representation]
if $p \in [L, U]$ and $d_i=0$ for $i>t$.
\item[Approximation] \marginnote{Truncation\\Rounding}
if $p \in [L, U]$ but $d_i$ may not be 0 for $i>t$.
In this case, the representation is obtained by truncating or rounding the value.
\item[Underflow] \marginnote{Underflow}
if $p < L$. In this case, the value is approximated to 0.
\item[Overflow] \marginnote{Overflow}
if $p > U$. In this case, an exception is usually raised.
\end{descriptionlist}
\subsection{Machine precision}
Machine precision $\varepsilon_{\text{mach}}$ determines the accuracy of a floating-point system. \marginnote{Machine precision}
Depending on the approximation approach, machine precision can be computed as:
\begin{descriptionlist}
\item[Truncation] $\varepsilon_{\text{mach}} = \beta^{1-t}$
\item[Rounding] $\varepsilon_{\text{mach}} = \frac{1}{2}\beta^{1-t}$
\end{descriptionlist}
Therefore, rounding results in more accurate representations.
With truncation, $\varepsilon_{\text{mach}}$ equals the gap between $1$ and the next representable number (\Cref{fig:finnum_eps}).
\begin{figure}[h]
\centering
\includegraphics[width=0.2\textwidth]{img/machine_eps.png}
\caption{Visualization of $\varepsilon_{\text{mach}}$ in $\mathcal{F}(2, 3, -1, 2)$}
\label{fig:finnum_eps}
\end{figure}\\
%
Alternatively, $\varepsilon_{\text{mach}}$ can be defined as the smallest positive number such that:
\begin{equation*}
\texttt{fl}(1 + \varepsilon_{\text{mach}}) > 1.
\end{equation*}
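This last characterization suggests a simple way to estimate $\varepsilon_{\text{mach}}$ empirically. The following Python sketch (assuming IEEE double precision, i.e. $\beta = 2$, $t = 53$) halves a candidate until adding it to 1 no longer changes the result:
\begin{lstlisting}
eps = 1.0
while 1.0 + eps / 2 > 1.0:  # fl(1 + eps/2) still differs from 1
    eps = eps / 2
print(eps)  # 2.220446049250313e-16, i.e. 2**(-52)
\end{lstlisting}
Note that the loop returns $\beta^{1-t} = 2^{-52}$, the gap between $1$ and the next representable number; with rounding, values down to half this gap still satisfy $\texttt{fl}(1 + \varepsilon) > 1$.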
\subsection{IEEE standard}
IEEE 754 defines two floating-point formats:
\begin{descriptionlist}
\item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{\texttt{float32}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}
\hline
1 (sign) & 8 (exponent) & 23 (mantissa) \\
\hline
\end{tabular}
\end{center}
\item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{\texttt{float64}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}
\hline
1 (sign) & 11 (exponent) & 52 (mantissa) \\
\hline
\end{tabular}
\end{center}
\end{descriptionlist}
As the first digit of the mantissa is always 1, it does not need to be stored.
Moreover, special configurations are reserved to represent \texttt{Inf} and \texttt{NaN}.
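These formats can be inspected programmatically; for instance, with NumPy (\texttt{numpy.finfo} exposes the stored mantissa bits, the machine epsilon, and the exponent range):
\begin{lstlisting}
import numpy as np

for dtype in (np.float32, np.float64):
    info = np.finfo(dtype)
    # stored mantissa bits (t-1), machine epsilon, exponent range
    print(dtype.__name__, info.nmant, info.eps, info.minexp, info.maxexp)
\end{lstlisting}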
\subsection{Floating-point arithmetic}
Let:
\begin{itemize}
\item $+: \mathbb{R} \times \mathbb{R} \rightarrow \mathbb{R}$ be a real numbers operation.
\item $\oplus: \mathcal{F} \times \mathcal{F} \rightarrow \mathcal{F}$ be the corresponding operation in a floating-point system.
\end{itemize}
%
To compute $x \oplus y$, a machine:
\begin{enumerate}
\item Calculates $x + y$ in a high precision register
(still approximated, but more precise than the floating-point system used to store the result)
\item Stores the result as $\texttt{fl}(x + y)$
\end{enumerate}
A floating-point operation causes a small rounding error:
\[
\left\vert \frac{(x \oplus y) - (x + y)}{x+y} \right\vert < \varepsilon_{\text{mach}}
\]
%
However, some operations may be subject to the \textbf{cancellation} problem which causes information loss.
\marginnote{Cancellation}
\begin{example}
Given $x = 1$ and $y = 1 \cdot 10^{-17}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, L, U)$.
It is assumed that $U$ and $L$ are sufficient for this example.
\begin{equation*}
\begin{split}
z & = \texttt{fl}(x) + \texttt{fl}(y) \\
& = 0.1 \cdot 10^1 + 0.1 \cdot 10^{-16} \\
& = (0.1 + 0.\overbrace{0\dots0}^{\mathclap{17\text{ zeros}}}1) \cdot 10^1 \\
& = 0.1\overbrace{0\dots0}^{\mathclap{16\text{ zeros}}}1 \cdot 10^1
\end{split}
\end{equation*}
Then, we have that $\texttt{fl}(z) = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}} \cdot 10^1 = 1 = x$.
\end{example}
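The same cancellation can be reproduced in IEEE double precision, where $10^{-17}$ falls below half the gap between $1$ and the next representable number (a Python sketch):
\begin{lstlisting}
x, y = 1.0, 1e-17
print(x + y == x)   # True: y is lost when the sum is rounded
print((x + y) - x)  # 0.0 instead of 1e-17
\end{lstlisting}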

@@ -0,0 +1,342 @@
\chapter{Gradient methods}
\section{Minimum of a function}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
\begin{descriptionlist}
\item[Stationary point] \marginnote{Stationary point}
$\vec{x}^*$ is a stationary point of $f$ iff:
\[ \nabla f(\vec{x}^*) = \nullvec \]
\item[Local minimum] \marginnote{Local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff:
\[ \exists \varepsilon > 0 \text{ s.t. }
f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
\item[Strict local minimum] \marginnote{Strict local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff:
\[ \exists \varepsilon > 0 \text{ s.t. }
f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
\item[Global minimum] \marginnote{Global minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff:
\[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
\item[Strict global minimum] \marginnote{Strict global minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a strict global minimum of $f$ iff:
\[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
\end{descriptionlist}
Note that maximization can be reduced to minimization: $\max f(\vec{x}) = -\min \{ -f(\vec{x}) \}$.
\subsection{Optimality conditions}
\begin{description}
\item[First-order condition] \marginnote{First-order condition}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
\[ \text{If } \vec{x}^* \text{ local minimum of } f \Rightarrow \nabla f(\vec{x}^*) = \nullvec \]
\item[Second-order condition] \marginnote{Second-order condition}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and twice differentiable.
\[
\text{If } \nabla f(\vec{x}^*) = \nullvec \text{ and } \nabla^2 f(\vec{x}^*) \text{ positive definite} \Rightarrow
\vec{x}^* \text{ strict local minimum of } f
\]
\end{description}
As the second-order condition requires computing the Hessian matrix, which is expensive, in practice only the first-order condition is checked.
\section{Descent methods}
\marginnote{Descent methods}
Descent methods are iterative methods that have the property:
\[ f(\vec{x}_k) < f(\vec{x}_{k-1}) \]
The iteration is defined as:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
where $\vec{p}_{k-1} \in \mathbb{R}^N$ is the search direction and \marginnote{Search direction\\Step length}
$\alpha_{k-1} \in \mathbb{R}$ is the step length.
Note: descent methods usually converge to a local minimum.
\begin{figure}
\centering
\includegraphics[width=0.5\linewidth]{img/_gradient_contour.pdf}
\caption{Descent method steps in $\mathbb{R}^2$ (i.e. moving across contour lines)}
\end{figure}
\subsection{Choice of the search direction}
\begin{description}
\item[Descent direction] \marginnote{Descent direction}
$\vec{p} \in \mathbb{R}^N$ is a descent direction of $f$ in $\vec{x}$ if:
\[ \exists \bar{\alpha} > 0, \forall \alpha \in [0, \bar{\alpha}]: f(\vec{x} + \alpha \vec{p}) < f(\vec{x}) \]
\end{description}
\begin{theorem}
Let $\vec{p} \in \mathbb{R}^N$, $\vec{p} \neq \nullvec$.
\[ \text{If } \vec{p}^T \nabla f(\vec{x}) < 0 \Rightarrow \vec{p} \text{ descent direction of } f \text{ in } \vec{x} \]
\end{theorem}
\begin{theorem}
For all $\vec{x}$, $\vec{p} = -\nabla f(\vec{x})$ is a descent direction of $f$ in $\vec{x}$.
\end{theorem}
\begin{proof}
\[
\begin{split}
\vec{p}^T \nabla f(\vec{x}) < 0 &\iff -(\nabla f(\vec{x}))^T \nabla f(\vec{x}) < 0 \\
&\iff - \Vert \nabla f(\vec{x}) \Vert_2^2 < 0
\end{split}
\]
This holds as the norm is always positive.
\end{proof}
\begin{description}
\item[Gradient-like methods] \marginnote{Gradient-like methods}
Gradient-like methods are descent methods that use $-\nabla f$ as search direction.
\end{description}
\subsection{Choice of the step length}
\begin{description}
\item[Constant]
In machine learning, it is common to set a constant value for the step (learning rate),
but it can be proved that this does not guarantee convergence.
\item[Backtracking procedure] \marginnote{Backtracking procedure}
$\alpha_k$ is chosen such that it satisfies the Armijo (sufficient decrease) condition, the first of the Wolfe conditions\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
\begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
def backtracking($\tau$, $c_1$):
    $\alpha_k$ = 1  # Initial guess
    # Shrink the step (with $\tau > 1$) until moving along $-\nabla f$
    # yields a sufficient decrease of $f$
    while $f(\vec{x}_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ $-$ $c_1 \alpha_k \nabla f(\vec{x}_k)^T \nabla f(\vec{x}_k)$:
        $\alpha_k$ = $\alpha_k$ / $\tau$
    return $\alpha_k$
\end{lstlisting}
It can be proved that, by using the backtracking procedure, gradient methods converge to a local minimum.
\end{description}
\subsection{Stopping condition}
\marginnote{Stopping condition}
We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, when $\nabla f(\vec{x}_k) \approx \nullvec$.
We can verify this by checking the norm of the gradient against a tolerance $\tau$:
\begin{descriptionlist}
\item[Absolute condition] $\Vert \nabla f(x_k) \Vert_2 < \tau$
\item[Relative condition] $\frac{\Vert \nabla f(x_k) \Vert_2}{\Vert \nabla f(x_0) \Vert_2} < \tau$
\end{descriptionlist}
A generic gradient-like method can then be defined as:
\begin{lstlisting}[mathescape=true]
def gradientMethod($f$, $\vec{x}_0$):
    $k$ = 0
    while not stoppingCondition($f$, $\vec{x}_k$, $\vec{x}_0$):
        $\vec{p}_k$ = $-\nabla f(\vec{x}_k)$  # Search direction
        $\alpha_k$ = backtracking($\dots$)    # Step length
        $\vec{x}_{k+1}$ = $\vec{x}_k$ + $\alpha_k \vec{p}_k$
        $k$ = $k$ + 1
    return $\vec{x}_k$
\end{lstlisting}
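A concrete NumPy version of the two procedures above may look as follows (a sketch: the quadratic objective and all constants are illustrative):
\begin{lstlisting}
import numpy as np

def backtracking(f, grad_f, x, tau=2.0, c1=1e-4):
    # Shrink alpha until the Armijo sufficient-decrease condition holds
    alpha, g = 1.0, grad_f(x)
    while f(x - alpha * g) > f(x) - c1 * alpha * np.dot(g, g):
        alpha = alpha / tau
    return alpha

def gradient_method(f, grad_f, x0, tol=1e-6, max_iter=1000):
    x = x0
    for _ in range(max_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:  # absolute stopping condition
            break
        alpha = backtracking(f, grad_f, x)
        x = x - alpha * g            # step along -grad f
    return x

# Example: minimize f(x) = ||x||^2, minimum at the origin
f = lambda x: np.dot(x, x)
grad_f = lambda x: 2 * x
print(gradient_method(f, grad_f, np.array([3.0, -4.0])))
\end{lstlisting}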
\subsection{Problems}
\begin{description}
\item[Choice of the initialization point] \marginnote{Initialization point}
The starting point of an iterative method is a user-defined parameter.
For simple problems, it is usually chosen randomly in $[-1, +1]$.
For complex problems, the choice of the initialization point is critical as
it may cause numerical instabilities or bad results.
Heuristics can be used to select an adequate starting point.
\item[Flat regions and local optima] \marginnote{Flat regions and local optima}
Flat regions slow down the learning speed,
while a local optimum causes the method to converge to a poor solution.
\begin{figure}[ht]
\centering
\includegraphics[width=0.9\textwidth]{img/_descent_local_flat.pdf}
\caption{Flat regions and local minima}
\end{figure}
\item[Differential curvature]
Different magnitudes of the partial derivatives may cause the problem of
vanishing and exploding gradient. \marginnote{Vanishing gradient\\Exploding gradient}
This causes the learning process to require more iterations to adjust the direction.
In practice, as the gradient of complex functions is only an instantaneous direction of best decrease and
does not represent the direction to the minimum in the long term,
many updates are required for a gradient method to converge.
A method to mitigate this issue is to use feature normalization techniques.
\item[Non-differentiable objective function]
If the objective function has a small number of non-differentiable points,
the gradient descent method can be applied with minor modifications.
If many points are non-differentiable, the gradients will not be informative enough
to determine a decrease direction.
\item[Difficult topologies]
\marginnote{Cliff}
A cliff in the objective function causes problems when evaluating the gradient at the edge.
With a small step size, there is a slowdown in convergence.
With a large step size, there is an overshoot that may cause the algorithm to diverge.
\marginnote{Valley}
A valley in the objective function causes a gradient method to bounce between its sides,
reaching a point where no significant progress can be made.
\begin{figure}[ht]
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.30\linewidth]{img/cliff.png}
\caption{Cliff region}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.30\linewidth]{img/valley.png}
\caption{Ping pong tournament in a valley}
\end{subfigure}
\end{figure}
\end{description}
\section{Convex functions}
\begin{description}
\item[Convex set] \marginnote{Convex set}
Informally, a set is convex if, for any two points of the set,
the points lying on the segment connecting them are also part of the set.
\begin{figure}[ht]
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.25\linewidth]{img/convex_set.png}
\caption{Convex set}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.25\linewidth]{img/non_convex_set.png}
\caption{Non-convex set}
\end{subfigure}
\end{figure}
\item[Convex function] \marginnote{Convex function}
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
$f$ is convex if:
\[
\forall \vec{x}_1, \vec{x}_2 \in \Omega, \forall t \in [0, 1]:
f(t\vec{x}_1 + (1-t)\vec{x}_2) \leq t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
\]
In other words, the segment connecting two points of the function lies above the graph.
\begin{figure}[ht]
\centering
\includegraphics[width=0.55\textwidth]{img/convex_function.png}
\caption{Convex function}
\end{figure}
\item[Strictly convex function] \marginnote{Strictly convex function}
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
$f$ is strictly convex if:
\[
\forall \vec{x}_1, \vec{x}_2 \in \Omega \text{ with } \vec{x}_1 \neq \vec{x}_2, \forall t \in (0, 1):
f(t\vec{x}_1 + (1-t)\vec{x}_2) < t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
\]
\end{description}
\subsection{Properties}
% \marginnote{Convex properties}
\begin{itemize}
\item $\text{if } f \text{ convex} \Rightarrow \text{any local minimum of } f \text{ is also global}$
\item $\text{if } f \text{ strictly convex} \Rightarrow \text{the global minimum of } f \text{ is unique}$
\item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{any stationary point of } f \text{ is a global minimum}$
\end{itemize}
\subsection{Quadratic functions}
\marginnote{Quadratic function}
A quadratic function has the form:
\[ f(\vec{x}) = \frac{1}{2}\vec{x}^T\matr{A}\vec{x} - \vec{x}^T\vec{b} + c \]
where $\matr{A} \in \mathbb{R}^{n \times n}$, $\vec{b} \in \mathbb{R}^n$ and $c \in \mathbb{R}$.
\begin{theorem}
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive semidefinite,
then $f$ is convex.
\end{theorem}
\begin{theorem}
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive definite,
then $f$ is strictly convex.
\end{theorem}
\begin{theorem}
\marginnote{Least squares quadratic function}
The least squares problem $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is a quadratic function.
\end{theorem}
\begin{proof}
\[
\begin{split}
(\matr{A}\vec{x} - \vec{b})^T(\matr{A}\vec{x} - \vec{b}) &= (\vec{x}^T\matr{A}^T - \vec{b}^T)(\matr{A}\vec{x} - \vec{b}) \\
&= \vec{x}^T\matr{A}^T\matr{A}\vec{x} - \vec{b}^T\matr{A}\vec{x} - \vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \\
\end{split}
\]
As $\vec{b}^T\matr{A}\vec{x} = \vec{x}^T\matr{A}^T\vec{b}$, we have:
\[ \vec{x}^T\matr{A}^T\matr{A}\vec{x} - 2\vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \]
Let $\matr{B} = \matr{A}^T\matr{A}$, $\vec{q} = \matr{A}^T\vec{b}$ and $c = \vec{b}^T\vec{b}$,
we have the quadratic form:
\[ \vec{x}^T\matr{B}\vec{x} - 2\vec{x}^T\vec{q} + c \]
$\matr{B}$ is symmetric positive semidefinite (i.e. $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is convex).
Moreover, when $\matr{A}$ is full-rank, $\matr{B}$ is symmetric positive definite (i.e. strictly convex).
\end{proof}
\section{Gradient descent with momentum}
\marginnote{Momentum}
The momentum is an additional term to keep track of previous iterations:
\[
\Delta \vec{x}_k = \vec{x}_k - \vec{x}_{k-1} = \gamma \Delta \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1})
\]
where $\gamma \in [0, 1]$. An iteration is therefore defined as:
\[
\vec{x}_k = \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1}) + \gamma \Delta\vec{x}_{k-1}
\]
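In code, the only change w.r.t. plain gradient descent is keeping track of the previous increment (a sketch; $\alpha$ and $\gamma$ are illustrative constants):
\begin{lstlisting}
import numpy as np

def gradient_descent_momentum(grad_f, x0, alpha=0.1, gamma=0.9, n_iter=100):
    x, delta = x0, np.zeros_like(x0)
    for _ in range(n_iter):
        delta = gamma * delta - alpha * grad_f(x)  # momentum update
        x = x + delta
    return x
\end{lstlisting}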
\section{Stochastic gradient descent (SGD)}
\marginnote{Stochastic gradient descent}
SGD is a variant of gradient descent that replaces the exact gradient with a cheaper stochastic approximation.
Given $N$ data points, the loss can be defined as the sum of the individual losses:
\[ L(\vec{x}) = \sum_{n=1}^{N} L_n(\vec{x}) \]
where $\vec{x}$ is the vector of parameters.
The corresponding gradient can be computed as:
\[ \nabla L(\vec{x}) = \sum_{n=1}^{N} \nabla L_n(\vec{x}) \]
\marginnote{Mini-batch}
SGD reduces the amount of computation by approximating the gradient using only a subset (mini-batch) $B \subseteq \{1, \dots, N\}$ of the terms:
\[ \nabla L(\vec{x}) \approx \sum_{i \in B} \nabla L_i(\vec{x}) \]
\begin{theorem}
Under some assumptions and with an appropriate decrease in learning rate,
SGD is guaranteed to converge to a local minimum.
\end{theorem}
Different sizes of the mini-batch result in different behavior:
\begin{descriptionlist}
\item[Large mini-batches] accurate estimates of the gradient.
\item[Small mini-batches] faster computation.
\end{descriptionlist}
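A minimal sketch of mini-batch SGD (the interface of \texttt{grad\_Li} and all hyperparameters are illustrative):
\begin{lstlisting}
import numpy as np

def sgd(grad_Li, x0, N, alpha=0.01, batch_size=32, n_epochs=10):
    # grad_Li(x, i) returns the gradient of the i-th individual loss
    x = x0
    for _ in range(n_epochs):
        idx = np.random.permutation(N)  # reshuffle the dataset
        for start in range(0, N, batch_size):
            B = idx[start:start + batch_size]  # mini-batch indices
            x = x - alpha * sum(grad_Li(x, i) for i in B)
    return x
\end{lstlisting}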

@@ -0,0 +1,344 @@
\chapter{Linear algebra}
\section{Vector space}
A \textbf{vector space} over $\mathbb{R}$ is a nonempty set $V$, whose elements are called vectors, with two operations:
\marginnote{Vector space}
\begin{center}
\begin{tabular}{l c}
Addition & $+ : V \times V \rightarrow V$ \\
Scalar multiplication & $\cdot : \mathbb{R} \times V \rightarrow V$
\end{tabular}
\end{center}
A vector space has the following properties:
\begin{enumerate}
\item Addition is commutative and associative
\item A null vector exists: $\exists \nullvec \in V$ s.t. $\forall \vec{u} \in V: \nullvec + \vec{u} = \vec{u} + \nullvec = \vec{u}$
\item An identity element for scalar multiplication exists: $\forall \vec{u} \in V: 1\vec{u} = \vec{u}$
\item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$.\\
$\vec{a}$ is denoted as $-\vec{u}$.
\item Distributive properties:
\[ \forall \alpha \in \mathbb{R}, \forall \vec{u}, \vec{w} \in V: \alpha(\vec{u} + \vec{w}) = \alpha \vec{u} + \alpha \vec{w} \]
\[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha + \beta)\vec{u} = \alpha \vec{u} + \beta \vec{u} \]
\item Associative property:
\[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha \beta)\vec{u} = \alpha (\beta \vec{u}) \]
\end{enumerate}
%
A subset $U \subseteq V$ of a vector space $V$ is a \textbf{subspace} iff $U$ is a vector space.
\marginnote{Subspace}
\subsection{Basis}
\marginnote{Basis}
Let $V$ be a vector space of dimension $n$.
A basis $\beta = \{ \vec{v}_1, \dots, \vec{v}_n \}$ of $V$ is a set of $n$ linearly independent vectors of $V$.\\
Each element of $V$ can be represented as a linear combination of the vectors in the basis $\beta$:
\[ \forall \vec{w} \in V: \vec{w} = \lambda_1\vec{v}_1 + \dots + \lambda_n\vec{v}_n \text{ where } \lambda_i \in \mathbb{R} \]
%
The canonical basis of a vector space is a basis where each vector represents a dimension $i$ \marginnote{Canonical basis}
(i.e. 1 in position $i$ and 0 in all other positions).
\begin{example}
The canonical basis $\beta$ of $\mathbb{R}^3$ is $\beta = \{ (1, 0, 0), (0, 1, 0), (0, 0, 1) \}$
\end{example}
\subsection{Dot product}
The dot product of two vectors $\vec{x}, \vec{y} \in \mathbb{R}^n$ is defined as: \marginnote{Dot product}
\begin{equation*}
\left\langle \vec{x}, \vec{y} \right\rangle =
\vec{x}^T \vec{y} = \sum_{i=1}^{n} x_i \cdot y_i
\end{equation*}
\section{Matrix}
This is a {\tiny(very formal definition of)} matrix: \marginnote{Matrix}
\begin{equation*}
\matr{A} =
\begin{pmatrix}
a_{11} & a_{12} & \dots & a_{1n} \\
a_{21} & a_{22} & \dots & a_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m1} & a_{m2} & \dots & a_{mn}
\end{pmatrix}
\end{equation*}
\subsection{Invertible matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is invertible (non-singular) if: \marginnote{Non-singular matrix}
\begin{equation*}
\exists \matr{B} \in \mathbb{R}^{n \times n}: \matr{AB} = \matr{BA} = \matr{I}
\end{equation*}
where $\matr{I}$ is the identity matrix. $\matr{B}$ is denoted as $\matr{A}^{-1}$.
\subsection{Kernel}
The null space (kernel) of a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ is a subspace such that: \marginnote{Kernel}
\begin{equation*}
\text{Ker}(\matr{A}) = \{ \vec{x} \in \mathbb{R}^n : \matr{A}\vec{x} = \nullvec \}
\end{equation*}
%
\begin{theorem} \label{th:kernel_invertible}
A square matrix $\matr{A}$ with $\text{\normalfont Ker}(\matr{A}) = \{\nullvec\}$ is non-singular.
\end{theorem}
\subsection{Similar matrices} \marginnote{Similar matrices}
Two matrices $\matr{A}$ and $\matr{D}$ are \textbf{similar} if there exists an invertible matrix $\matr{P}$ such that:
\[ \matr{D} = \matr{P}^{-1} \matr{A} \matr{P} \]
\section{Norms}
\subsection{Vector norms}
The norm of a vector is a function: \marginnote{Vector norm}
\begin{equation*}
\Vert \cdot \Vert: \mathbb{R}^n \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\vec{x}, \vec{y} \in \mathbb{R}^n$:
\begin{itemize}
\item $\Vert \vec{x} \Vert \geq 0$
\item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = \nullvec$
\item $\Vert \lambda \vec{x} \Vert = \vert \lambda \vert \cdot \Vert \vec{x} \Vert$
\item $\Vert \vec{x} + \vec{y} \Vert \leq \Vert \vec{x} \Vert + \Vert \vec{y} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
\item[2-norm] $\Vert \vec{x} \Vert_2 = \sqrt{ \sum_{i=1}^{n} x_i^2 }$
\item[1-norm] $\Vert \vec{x} \Vert_1 = \sum_{i=1}^{n} \vert x_i \vert$
\item[$\infty$-norm] $\Vert \vec{x} \Vert_{\infty} = \max_{1 \leq i \leq n} \vert x_i \vert$
\end{descriptionlist}
%
In general, the different norms of a vector have comparable magnitudes.
However, as the following example shows, comparing two vectors may give different outcomes depending on the chosen norm.
\begin{example}
Let $\vec{x} = (1, 1000)$ and $\vec{y} = (999, 1000)$. Their norms are:
\begin{center}
\begin{tabular}{l l}
$\Vert \vec{x} \Vert_{2} = \sqrt{1000001}$ & $\Vert \vec{y} \Vert_{2} = \sqrt{1998001}$ \\
$\Vert \vec{x} \Vert_{\infty} = 1000$ & $\Vert \vec{y} \Vert_{\infty} = 1000$ \\
\end{tabular}
\end{center}
\end{example}
\subsection{Matrix norms}
The norm of a matrix is a function: \marginnote{Matrix norm}
\begin{equation*}
\Vert \cdot \Vert: \mathbb{R}^{m \times n} \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\matr{A}, \matr{B} \in \mathbb{R}^{m \times n}$:
\begin{itemize}
\item $\Vert \matr{A} \Vert \geq 0$
\item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \matr{0}$
\item $\Vert \lambda \matr{A} \Vert = \vert \lambda \vert \cdot \Vert \matr{A} \Vert$
\item $\Vert \matr{A} + \matr{B} \Vert \leq \Vert \matr{A} \Vert + \Vert \matr{B} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
\item[2-norm]
$\Vert \matr{A} \Vert_2 = \sqrt{ \rho(\matr{A}^T\matr{A}) }$,\\
where $\rho(\matr{X})$ is the largest absolute value of the eigenvalues of $\matr{X}$ (spectral radius).
\item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$ (i.e. max sum of the columns in absolute value)
\item[Frobenius norm] $\Vert \matr{A} \Vert_F = \sqrt{ \sum_{i=1}^{m} \sum_{j=1}^{n} a_{i,j}^2 }$
\end{descriptionlist}
\section{Symmetric, positive definite matrices}
\begin{description}
\item[Symmetric matrix] \marginnote{Symmetric matrix}
A square matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is symmetric $\iff \matr{A} = \matr{A}^T$
\item[Positive semidefinite matrix] \marginnote{Positive semidefinite matrix}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive semidefinite iff
\begin{equation*}
\forall \vec{x} \in \mathbb{R}^n : \vec{x}^T \matr{A} \vec{x} \geq 0
\end{equation*}
\item[Positive definite matrix] \marginnote{Positive definite matrix}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive definite iff
\begin{equation*}
\forall \vec{x} \in \mathbb{R}^n \smallsetminus \{0\}: \vec{x}^T \matr{A} \vec{x} > 0
\end{equation*}
%
It has the following properties:
\begin{enumerate}
\item The null space of $\matr{A}$ contains only the null vector: $\text{Ker}(\matr{A}) = \{ \nullvec \}$,
which implies that $\matr{A}$ is non-singular (\Cref{th:kernel_invertible}).
\item The diagonal elements of $\matr{A}$ are all positive.
\end{enumerate}
\end{description}
\section{Orthogonality}
\begin{description}
\item[Angle between vectors] \marginnote{Angle between vectors}
The angle $\omega$ between two vectors $\vec{x}$ and $\vec{y}$ can be obtained from:
\begin{equation*}
\cos\omega = \frac{\left\langle \vec{x}, \vec{y} \right\rangle }{\Vert \vec{x} \Vert_2 \cdot \Vert \vec{y} \Vert_2}
\end{equation*}
\item[Orthogonal vectors] \marginnote{Orthogonal vectors}
Two vectors $\vec{x}$ and $\vec{y}$ are orthogonal ($\vec{x} \perp \vec{y}$) when:
\[ \left\langle \vec{x}, \vec{y} \right\rangle = 0 \]
\item[Orthonormal vectors] \marginnote{Orthonormal vectors}
Two vectors $\vec{x}$ and $\vec{y}$ are orthonormal when:
\[ \vec{x} \perp \vec{y} \text{ and } \Vert \vec{x} \Vert = \Vert \vec{y} \Vert=1 \]
\begin{theorem}
The canonical basis of a vector space is orthonormal.
\end{theorem}
\item[Orthogonal matrix] \marginnote{Orthogonal matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is orthogonal if its columns are \underline{orthonormal} vectors.
It has the following properties:
\begin{enumerate}
\item $\matr{A}\matr{A}^T = \matr{I} = \matr{A}^T\matr{A}$, which implies $\matr{A}^{-1} = \matr{A}^T$.
\item The length of a vector is unchanged when mapped through an orthogonal matrix:
\[ \Vert \matr{A}\vec{x} \Vert^2 = \Vert \vec{x} \Vert^2 \]
\item The angle between two vectors is unchanged when both are mapped through an orthogonal matrix:
\[
\cos\omega = \frac{(\matr{A}\vec{x})^T(\matr{A}\vec{y})}{\Vert \matr{A}\vec{x} \Vert \cdot \Vert \matr{A}\vec{y} \Vert} =
\frac{\vec{x}^T\vec{y}}{\Vert \vec{x} \Vert \cdot \Vert \vec{y} \Vert}
\]
\end{enumerate}
Note: an orthogonal matrix represents a rotation (possibly combined with a reflection).
\item[Orthogonal basis] \marginnote{Orthogonal basis}
Let $V$ be an $n$-dimensional vector space and $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ a basis of $V$.
$\beta$ is an orthogonal basis if:
\[ \vec{b}_i \perp \vec{b}_j \text{ for } i \neq j \text{ (i.e.} \left\langle \vec{b}_i, \vec{b}_j \right\rangle = 0 \text{)} \]
\item[Orthonormal basis] \marginnote{Orthonormal basis}
Let $V$ be an $n$-dimensional vector space and $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ an orthogonal basis of $V$.
$\beta$ is an orthonormal basis if:
\[ \Vert \vec{b}_i \Vert_2 = 1 \text{ (or} \left\langle \vec{b}_i, \vec{b}_i \right\rangle = 1 \text{)} \]
\item[Orthogonal complement] \marginnote{Orthogonal complement}
Let $V$ be an $n$-dimensional vector space and $U \subseteq V$ an $m$-dimensional subspace.
The orthogonal complement $U^\perp$ of $U$ is a $(n-m)$-dimensional subspace of $V$ such that it
contains all the vectors orthogonal to every vector in $U$:
\[ \forall \vec{w} \in V: \vec{w} \in U^\perp \iff (\forall \vec{u} \in U: \vec{w} \perp \vec{u}) \]
%
Note that $U \cap U^\perp = \{ \nullvec \}$ and
it is possible to represent all vectors in $V$ as a linear combination of both the basis of $U$ and $U^\perp$.
When $U^\perp$ is one-dimensional (i.e. $m = n-1$), the vector $\vec{w} \in U^\perp$ s.t. $\Vert \vec{w} \Vert = 1$ is the \textbf{normal vector} of $U$. \marginnote{Normal vector}
%
\begin{figure}[ht]
\centering
\includegraphics[width=0.4\textwidth]{img/_orthogonal_complement.pdf}
\caption{Orthogonal complement of a subspace $U \subseteq \mathbb{R}^3$}
\end{figure}
\end{description}
\section{Projections}
Projections are methods to map high-dimensional data into a lower-dimensional space
while minimizing the compression loss.\\
\marginnote{Orthogonal projection}
Let $V$ be a vector space and $U \subseteq V$ a subspace of $V$.
A linear mapping $\pi: V \rightarrow U$ is a (orthogonal) projection if:
\[ \pi^2 = \pi \circ \pi = \pi \]
In other words, applying $\pi$ multiple times gives the same result (i.e. idempotency).\\
$\pi$ can be expressed as a transformation matrix $\matr{P}_\pi$ such that:
\[ \matr{P}_\pi^2 = \matr{P}_\pi \]
\subsection{Projection onto general subspaces} \marginnote{Projection onto subspace basis}
To project a vector $\vec{x} \in \mathbb{R}^n$ into a lower-dimensional subspace $U \subseteq \mathbb{R}^n$,
it is possible to use the basis of $U$.\\
%
Let $m = \text{dim}(U)$ be the dimension of $U$ and
$\matr{B} = (\vec{b}_1, \dots, \vec{b}_m) \in \mathbb{R}^{n \times m}$ an ordered basis of $U$.
A projection $\pi_U(\vec{x})$ represents $\vec{x}$ as a linear combination of the basis:
\[ \pi_U(\vec{x}) = \sum_{i=1}^{m} \lambda_i \vec{b}_i = \matr{B}\vec{\uplambda} \]
where $\vec{\uplambda} = (\lambda_1, \dots, \lambda_m)^T \in \mathbb{R}^{m}$ contains the new coordinates of $\vec{x}$.
It is found by minimizing the distance between $\pi_U(\vec{x})$ and $\vec{x}$,
which leads to the normal equations $\matr{B}^T\matr{B}\vec{\uplambda} = \matr{B}^T\vec{x}$.
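A NumPy sketch of this construction, projecting an illustrative vector onto a one-dimensional subspace of $\mathbb{R}^3$:
\begin{lstlisting}
import numpy as np

B = np.array([[1.0], [1.0], [0.0]])  # basis of a 1-D subspace U of R^3
x = np.array([2.0, 3.0, 4.0])

lam = np.linalg.solve(B.T @ B, B.T @ x)  # normal equations
print(B @ lam)  # projection of x onto U: [2.5, 2.5, 0.]
\end{lstlisting}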
\section{Eigenvectors and eigenvalues}
Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
$\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if: \marginnote{Eigenvector}
\[ \matr{A}\vec{x} = \lambda\vec{x} \]
It is equivalent to say that:
\begin{itemize}
\item $\lambda$ is an eigenvalue of $\matr{A} \in \mathbb{R}^{n \times n}$
\item $\exists \vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ s.t. $\matr{A}\vec{x} = \lambda\vec{x}$ \\
Equivalently the system $(\matr{A} - \lambda \matr{I}_n)\vec{x} = \nullvec$ is non-trivial ($\vec{x} \neq \nullvec$).
\item $\text{rank}(\matr{A} - \lambda \matr{I}_n) < n$
\item $\det(\matr{A} - \lambda \matr{I}_n) = 0$ (i.e. $(\matr{A} - \lambda \matr{I}_n)$ is singular {\footnotesize(i.e. not invertible)})
\end{itemize}
Note that eigenvectors are not unique.
Given an eigenvector $\vec{x}$ of $\matr{A}$ with eigenvalue $\lambda$,
we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is an eigenvector of $\matr{A}$:
\[ \matr{A}(c\vec{x}) = c(\matr{A}\vec{x}) = c\lambda\vec{x} = \lambda(c\vec{x}) \]
\begin{theorem} \marginnote{Eigenvalues and positive definiteness}
$\matr{A} \in \mathbb{R}^{n \times n}$ is symmetric positive definite $\iff$
its eigenvalues are all positive.
\end{theorem}
\begin{description}
\item[Eigenspace] \marginnote{Eigenspace}
Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalue $\lambda$.
This set is a subspace of $\mathbb{R}^n$.
\item[Eigenspectrum] \marginnote{Eigenspectrum}
Set of all eigenvalues of $\matr{A} \in \mathbb{R}^{n \times n}$.
\end{description}
\begin{description}
\item[Geometric multiplicity] \marginnote{Geometric multiplicity}
Given an eigenvalue $\lambda$ of a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
The geometric multiplicity of $\lambda$ is the number of linearly independent eigenvectors associated to $\lambda$.
\end{description}
\begin{theorem} \marginnote{Linearly independent eigenvectors}
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
If its $n$ eigenvectors $\vec{x}_1, \dots, \vec{x}_n$ are associated to distinct eigenvalues,
then $\vec{x}_1, \dots, \vec{x}_n$ are linearly independent (i.e. they form a basis of $\mathbb{R}^n$).
\begin{descriptionlist}
\item[Defective matrix] \marginnote{Defective matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is defective if it has fewer than $n$ linearly independent eigenvectors.
\end{descriptionlist}
\end{theorem}
\begin{theorem}[Spectral theorem] \label{th:spectral_theorem} \marginnote{Spectral theorem}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
Its eigenvectors form an orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
\end{theorem}
\subsection{Diagonalizability}
\marginnote{Diagonalizable matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is diagonalizable if it is similar to a diagonal matrix $\matr{D} \in \mathbb{R}^{n \times n}$:
\[ \exists \matr{P} \in \mathbb{R}^{n \times n} \text{ s.t. } \matr{P} \text{ invertible and } \matr{D} = \matr{P}^{-1}\matr{A}\matr{P} \]
\begin{theorem}
Similar matrices have the same eigenvalues.
\end{theorem}
\begin{theorem} \marginnote{Symmetric matrix diagonalizability}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is always diagonalizable.
\end{theorem}
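Both results can be checked numerically (a sketch using \texttt{numpy.linalg.eigh}, NumPy's eigensolver for symmetric matrices; the matrix is illustrative):
\begin{lstlisting}
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])  # symmetric
w, P = np.linalg.eigh(A)    # eigenvalues, orthonormal eigenvectors
print(np.allclose(P.T @ A @ P, np.diag(w)))  # True: D = P^{-1} A P
print(np.allclose(P.T @ P, np.eye(2)))       # True: P is orthogonal
\end{lstlisting}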

@@ -0,0 +1,242 @@
\chapter{Linear systems}
A linear system:
\begin{equation*}
\begin{cases}
a_{1,1}x_1 + a_{1,2}x_2 + \dots + a_{1,n}x_n = b_1\\
a_{2,1}x_1 + a_{2,2}x_2 + \dots + a_{2,n}x_n = b_2\\
\hspace*{7em} \vdots \\
a_{m,1}x_1 + a_{m,2}x_2 + \dots + a_{m,n}x_n = b_m\\
\end{cases}
\end{equation*}
can be represented as:
\[ \matr{A}\vec{x} = \vec{b} \]
where:
\[
\matr{A} =
\begin{pmatrix}
a_{1,1} & a_{1, 2} & \hdots & a_{1,n} \\
a_{2,1} & a_{2, 2} & \hdots & a_{2,n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m,1} & a_{m, 2} & \hdots & a_{m,n}
\end{pmatrix} \in \mathbb{R}^{m \times n}
\hspace*{2em}
%
\vec{x} =
\begin{pmatrix}
x_1 \\
x_2 \\
\vdots \\
x_n
\end{pmatrix} \in \mathbb{R}^n
\hspace*{2em}
%
\vec{b} =
\begin{pmatrix}
b_1 \\
b_2 \\
\vdots \\
b_m
\end{pmatrix} \in \mathbb{R}^m
\]
\section{Square linear systems}
\marginnote{Square linear system}
A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
has a unique solution iff any of the following (equivalent) conditions holds:
\begin{enumerate}
\item $\matr{A}$ is non-singular (invertible)
\item $\text{rank}(\matr{A}) = n$ (full rank)
\item $\matr{A}\vec{x} = \nullvec$ only admits the solution $\vec{x} = \nullvec$
\end{enumerate}
The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
\[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.
Therefore, numerical methods are usually more suited.
The two main families of methods are:
\begin{itemize}
\item Direct methods.
\item Iterative methods.
\end{itemize}
\section{Direct methods}
\marginnote{Direct methods}
Direct methods compute the solution of a linear system in a finite number of steps.
Compared to iterative methods, they are more precise but more expensive.
The most common approach consists in factorizing the matrix $\matr{A}$.
\subsection{Gaussian factorization}
\marginnote{Gaussian factorization\\(LU decomposition)}
Given a square linear system $\matr{A}\vec{x} = \vec{b}$,
the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
\begin{itemize}
\item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix.
\item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix.
\end{itemize}
%
The system can be decomposed into:
\[
\begin{split}
\matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
& \iff
\begin{cases}
\matr{L}\vec{y} = \vec{b} \\
\vec{y} = \matr{U}\vec{x}
\end{cases}
\end{split}
\]
To find the solution, it is sufficient to solve in order:
\begin{enumerate}
\item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
\item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
\end{enumerate}
The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
$O(\frac{n^3}{3})$ is the time complexity of the LU factorization.
$O(n^2)$ is the complexity to directly solve a system with a triangular matrix (forward or backward substitutions).
\subsection{Gaussian factorization with pivoting}
\marginnote{Gaussian factorization with pivoting}
During the computation of $\matr{A} = \matr{L}\matr{U}$
(using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
a division by 0 may occur.
A method to prevent this problem, and to reduce the algorithmic error (e.g. overflows), is to change the order of the rows of $\matr{A}$ before decomposing it.
This is achieved by using a permutation matrix $\matr{P}$, which is obtained as a permutation of the identity matrix.
The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
The system can be decomposed into:
\[
\begin{split}
\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
& \iff
\begin{cases}
\matr{L}\vec{y} = \matr{P}\vec{b} \\
\vec{y} = \matr{U}\vec{x}
\end{cases}
\end{split}
\]
An alternative formulation (which is what \texttt{SciPy} uses)
is defined as:
\[\matr{A} = \matr{P}\matr{L}\matr{U} \iff \matr{P}^T\matr{A} = \matr{L}\matr{U} \]
It must be noted that $\matr{P}$ is orthogonal, so $\matr{P}^T = \matr{P}^{-1}$.
The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can be found as above.
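With SciPy, the whole procedure reads as follows (a sketch with an illustrative system; \texttt{scipy.linalg.lu} returns the $\matr{A} = \matr{P}\matr{L}\matr{U}$ form):
\begin{lstlisting}
import numpy as np
from scipy.linalg import lu, solve_triangular

A = np.array([[2.0, 1.0], [4.0, 3.0]])
b = np.array([3.0, 7.0])

P, L, U = lu(A)                               # A = P L U
y = solve_triangular(L, P.T @ b, lower=True)  # forward substitution
x = solve_triangular(U, y)                    # backward substitution
print(x)  # [1. 1.]
\end{lstlisting}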
\subsection{Cholesky factorization}
Given a symmetric positive definite matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
It is possible to decompose $\matr{A}$ as:
\[ \matr{A} = \matr{L}\matr{L}^T \]
where $\matr{L}$ is lower triangular.
A square system where $\matr{A}$ is symmetric positive definite can be solved as above using the Cholesky factorization.
This method has time complexity $O(\frac{n^3}{6})$.
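The SciPy counterpart uses \texttt{cho\_factor}/\texttt{cho\_solve} (a sketch; the symmetric positive definite matrix is illustrative):
\begin{lstlisting}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.array([[4.0, 2.0], [2.0, 3.0]])  # symmetric positive definite
b = np.array([6.0, 5.0])

c, low = cho_factor(A)         # A = L L^T
print(cho_solve((c, low), b))  # [1. 1.]
\end{lstlisting}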
\section{Iterative methods}
\marginnote{Iterative methods}
Iterative methods solve a linear system by computing a sequence that converges to the exact solution.
Compared to direct methods, they are less precise but computationally faster and more suited for large systems.
The overall idea is to build a sequence of vectors $\vec{x}_k$
that converges to the exact solution $\vec{x}^*$:
\[ \lim_{k \rightarrow \infty} \vec{x}_k = \vec{x}^* \]
Generally, the first vector $\vec{x}_0$ is given (or guessed). Subsequent vectors are computed w.r.t. the previous iteration
as $\vec{x}_k = g(\vec{x}_{k-1})$.
The two most common families of iterative methods are:
\begin{descriptionlist}
\item[Stationary methods] \marginnote{Stationary methods}
compute the sequence as:
\[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
where $\matr{B}$ is called iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system.
The time complexity per iteration is $O(n^2)$.
\item[Gradient-like methods] \marginnote{Gradient-like methods}
have the form:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
where $\alpha_{k-1} \in \mathbb{R}$ and the vector $\vec{p}_{k-1}$ is called direction.
\end{descriptionlist}
\subsection{Stopping criteria}
\marginnote{Stopping criteria}
One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
The most common approaches are:
\begin{descriptionlist}
\item[Residual based]
The algorithm is terminated when the current solution is close enough to the exact solution.
The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
Given a tolerance $\varepsilon$, the algorithm may stop when:
\begin{itemize}
\item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute)
\item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative)
\end{itemize}
\item[Update based]
The algorithm is terminated when the difference between iterations is very small.
Given a tolerance $\tau$, the algorithm stops when:
\[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
\end{descriptionlist}
Obviously, as the sequence is truncated, a truncation error is introduced when using iterative methods.
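As a concrete stationary method, the sketch below implements Jacobi iteration, where $\matr{B} = -\matr{D}^{-1}(\matr{A} - \matr{D})$ and $\vec{d} = \matr{D}^{-1}\vec{b}$ with $\matr{D}$ the diagonal of $\matr{A}$ (convergence is guaranteed only for suitable matrices, e.g. strictly diagonally dominant ones; the data is illustrative):
\begin{lstlisting}
import numpy as np

def jacobi(A, b, x0, eps=1e-8, max_iter=1000):
    D = np.diag(A)      # diagonal of A (as a vector)
    R = A - np.diag(D)  # off-diagonal part
    x = x0
    for _ in range(max_iter):
        x = (b - R @ x) / D  # one stationary iteration
        if np.linalg.norm(b - A @ x) <= eps * np.linalg.norm(b):
            break            # relative residual criterion
    return x

A = np.array([[4.0, 1.0], [2.0, 5.0]])  # strictly diagonally dominant
b = np.array([5.0, 7.0])
print(jacobi(A, b, np.zeros(2)))  # ~[1. 1.]
\end{lstlisting}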
\section{Condition number}
Inherent error causes inaccuracies during the resolution of a system.
This problem is independent of the algorithm and is estimated using exact arithmetic.
Given a system $\matr{A}\vec{x} = \vec{b}$, we perturb $\matr{A}$ and/or $\vec{b}$ and study the resulting inherent error.
For instance, if we perturb $\vec{b}$, we obtain the following system:
\[ \matr{A}\tilde{\vec{x}} = (\vec{b} + \Delta\vec{b}) \]
After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
By comparing $\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert}$ and $\frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}$,
we can quantify the error introduced by the perturbation.
It can be shown that:
\[
\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert} \leq
\Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \cdot \frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}
\]
\]
Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as: \marginnote{Condition number}
\[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \]
A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned}
(i.e. a small perturbation of the input causes a large change in the output).
Otherwise, it is \textbf{well-conditioned}. \marginnote{Well-conditioned}
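NumPy computes $K(\matr{A})$ directly via \texttt{np.linalg.cond} (2-norm by default); a classic ill-conditioned example is the Hilbert matrix:
\begin{lstlisting}
import numpy as np
from scipy.linalg import hilbert

print(np.linalg.cond(np.eye(4)))   # 1.0: well-conditioned
print(np.linalg.cond(hilbert(4)))  # ~1.6e4: ill-conditioned
\end{lstlisting}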
\section{Linear least squares problem}
A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
does not generally have a solution.
\marginnote{Linear least squares}
Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
This problem is usually formulated as:
\[
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
\begin{descriptionlist}
\item[$\text{rank}(\matr{A}) = n$]
The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
\marginnote{Normal equation}
It is found by solving the normal equation:
\[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
$\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization.
\item[$\text{rank}(\matr{A}) < n$]
The system admits infinitely many solutions.
Of all the solutions $S$, we are interested in the one with minimum norm:
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
\end{descriptionlist}
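For the full-rank case, the sketch below solves the normal equation via Cholesky and cross-checks it against \texttt{numpy.linalg.lstsq}, which also handles the rank-deficient (minimum-norm) case; the data is illustrative:
\begin{lstlisting}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]])  # m = 3 > n = 2, full rank
b = np.array([1.0, 2.0, 2.0])

x = cho_solve(cho_factor(A.T @ A), A.T @ b)  # normal equation
x_ref, *_ = np.linalg.lstsq(A, b, rcond=None)
print(np.allclose(x, x_ref))  # True
\end{lstlisting}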

@@ -0,0 +1,306 @@
\chapter{Machine learning}
\section{Models}
\begin{description}
\item[Function model] \marginnote{Function model}
The model (predictor) is a deterministic function:
\[ f: \mathbb{R}^D \rightarrow \mathbb{R} \]
In this course, only linear functions are considered:
\[ f_\vec{\uptheta}(\vec{x}) = \uptheta_0 + \uptheta_1 x_1 + \dots + \uptheta_D x_D = \vec{\uptheta}^T \vec{x} \]
where $\vec{x} = \begin{pmatrix} 1, x_1, \dots, x_D \end{pmatrix}$ is the input vector and
$\vec{\uptheta} = \begin{pmatrix} \uptheta_0, \dots, \uptheta_D \end{pmatrix}$ is the parameter vector.
\item[Probabilistic model] \marginnote{Probabilistic model}
The model is a multivariate probabilistic distribution that
is able to quantify uncertainty in noisy data.
\end{description}
\section{Learning}
\subsection{Empirical risk minimization}
\marginnote{Empirical risk minimization}
Used for function models.
The parameters of the predictor are obtained by solving an optimization problem that minimizes the distance
between the predictions and the ground truth.
Let $(\vec{x}_n, y_n)$ be a dataset of $N$ elements
where $\vec{x}_n \in \mathbb{R}^D$ are the examples and $y_n \in \mathbb{R}$ are the labels.
We want to estimate a predictor $f_\vec{\uptheta}(\vec{x}) = \vec{\uptheta}^T \vec{x}$ with parameters $\vec{\uptheta}$
such that, with the ideal parameters $\vec{\uptheta}^*$, it fits the data well:
\[ f_{\vec{\uptheta}^*}(\vec{x}_n) \approx y_n \]
We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n)$.
\begin{description}
\item[Loss function] \marginnote{Loss function}
A loss function $\ell(y_n, \hat{y}_n)$ indicates how a predictor fits the data.
An assumption commonly made in machine learning is that
the dataset $(\vec{x}_n, y_n)$ is independent and identically distributed.
Therefore, the empirical mean is a good estimate of the population mean.
\item[Empirical risk] \marginnote{Empirical risk}
Given the example matrix $\matr{X} = \begin{pmatrix} \vec{x}_1, \dots, \vec{x}_N \end{pmatrix} \in \mathbb{R}^{N \times D}$
and the label vector $\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix} \in \mathbb{R}^N$.
The empirical risk is given by the average loss:
\[ \textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}, \vec{y}) = \frac{1}{N} \sum_{n=1}^{N} \ell(y_n, \hat{y}_n) \]
\begin{description}
\item[Least-squares loss] \marginnote{Least-squares loss}
The least-squares loss is defined as:
\[ \ell(y_n, \hat{y}_n) = (y_n - \hat{y}_n)^2 \]
Therefore, the minimization task is:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - f_\vec{\uptheta}(\vec{x}_n))^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - \vec{\uptheta}^T\vec{x}_n)^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\]
\end{description}
\item[Expected risk] \marginnote{Expected risk}
The expected risk is defined as:
\[ \textbf{R}_\text{true}(f_\vec{\uptheta}) = \mathbb{E}_{\vec{x}, y}[\ell(y, f_\vec{\uptheta}(\vec{x}))] \]
where the parameters $\vec{\uptheta}$ are fixed and the expectation is taken over the true data distribution (in practice, it is approximated on a test set).
\item[Overfitting] \marginnote{Overfitting}
\sloppy
A predictor $f_\vec{\uptheta}$ is overfitting when $\textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}_\text{train}, \vec{y}_\text{train})$
underestimates $\textbf{R}_\text{true}(f_\vec{\uptheta})$ (i.e. the loss on the training set is low, but the loss on the test set is high).
\item[Regularization] \marginnote{Regularization}
Method that adds a penalty term to the loss in order to
find a compromise between the accuracy and the complexity of the solution:
\[ \bar{\ell}(y_n, \hat{y}_n) = \ell(y_n, \hat{y}_n) + \lambda \mathcal{R}(\vec{\uptheta}) \]
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the regularizer (penalty term).
\begin{description}
\item[Regularized least squares] \marginnote{Regularized least squares}
A simple regularization term for the least squares problem is $\Vert \vec{\uptheta} \Vert^2$.
The problem becomes (a NumPy sketch follows this list):
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D}
\{ \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
\end{description}
\end{description}
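A minimal NumPy sketch of regularized least squares on synthetic data. Note that, because of the $\frac{1}{N}$ factor in the empirical risk, the associated normal equation is $(\matr{X}^T\matr{X} + N\lambda\matr{I})\vec{\uptheta} = \matr{X}^T\vec{y}$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 50, 3
X = rng.normal(size=(N, D))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=N)

lam = 0.1
# Minimizer of (1/N) ||y - X theta||^2 + lam ||theta||^2
theta = np.linalg.solve(X.T @ X + N * lam * np.eye(D), X.T @ y)
\end{verbatim}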
\subsection{Maximum likelihood estimation (MLE)}
% \marginnote{Maximum likelihood estimation (MLE)}
Used for probabilistic models.
The parameters are chosen as those under which the observed labels are most likely given the inputs.
\begin{description}
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
\sloppy
Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$,
the negative log-likelihood of $\bm{x}$ is:
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
Note that:
\begin{itemize}
\item The minus is added as we are converting the problem of maximizing the likelihood to a minimization problem.
\item The logarithm is useful for numerical stability.
\end{itemize}
$\mathcal{L}_{\bm{x}}(\vec{\uptheta})$ indicates how likely it is to observe $\bm{x}$ with
$\vec{\uptheta}$ as the parameters of the predictor.
Given a dataset $(\bm{x}_n, y_n)$ of $N$ independent and identically distributed (i.i.d.) elements,
optimizing the likelihood allows us to find the parameters that most likely generated the dataset.
As the samples are independent, we have that:
\[ p_\vec{\uptheta}(\vec{y} \vert \matr{X}) = \prod_{n=1}^{N} p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
where $\matr{X} = \begin{pmatrix} \bm{x}_1, \dots, \bm{x}_N \end{pmatrix}$ and
$\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix}$.
Moreover, as the samples are identically distributed,
each factor $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same form.
By applying the logarithm, we have that the negative log-likelihood of an i.i.d. dataset is defined as:
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
and to find good parameters $\vec{\uptheta}$, we solve the problem:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n)
\]
\begin{description}
\item[Gaussian likelihood] \marginnote{Gaussian likelihood}
Consider a linear model $\bm{x}^T\vec{\uptheta}$ as predictor and
assume that the likelihood has a Gaussian distribution:
\[ p_\vec{\uptheta}(y_n \,\vert\, \bm{x}_n) = \mathcal{N}(y_n \,\vert\, \bm{x}_n^T\vec{\uptheta}, \sigma^2) \]
where the Gaussian distribution has mean $\bm{x}_n^T\vec{\uptheta}$ (i.e. $f_\vec{\uptheta}(\bm{x}_n)$)
and variance $\sigma^2$ for the $n$-th data point.
The negative log-likelihood is:
\[
\begin{split}
\mathcal{L}(\vec{\uptheta}) &= -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \\
&= -\sum_{n=1}^{N} \log \mathcal{N}(y_n \vert \bm{x}_n^T\vec{\uptheta}, \sigma^2) \\
&= -\sum_{n=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) \right) \\
&= -\sum_{n=1}^{N} \log\exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}} \\
&= \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}
\end{split}
\]
The minimization problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) &=
\min_{\vec{\uptheta} \in \mathbb{R}^D}
\overbrace{\frac{1}{2\sigma^2}}^{\mathclap{\text{constant}}}
\sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 -
\overbrace{\sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}}^{\mathclap{\text{constant}}} \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\end{split}
\]
which corresponds to the least squares problem.
\end{description}
\begin{figure}[ht]
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
\caption{When the parameters are good, the label will be near the mean (i.e. predictor)}
\end{subfigure}
\hspace*{1em}
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_bad.png}
\caption{When the parameters are bad, the label will be far from the mean}
\end{subfigure}
\caption{Geometric interpretation of the Gaussian likelihood}
\end{figure}
\end{description}
\subsection{Maximum a posteriori estimation (MAP)}
\marginnote{Maximum a posteriori (MAP)}
Maximum a posteriori estimation maximizes the posterior, i.e. the reverse conditional with respect to MLE:
\[
\max_{\vec{\uptheta} \in \mathbb{R}^D} p(\vec{\uptheta} \vert \matr{X}, \vec{y}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{\uptheta} \vert \matr{X}, \vec{y})
\]
In other words, it maximizes the probability of a set of parameters $\vec{\uptheta}$ given the observation of the dataset $(\matr{X}, \vec{y})$.
By applying Bayes' theorem, the problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D}
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
\end{split}
\]
\begin{description}
\item[Gaussian posteriori] \marginnote{Gaussian posteriori}
By assuming that the conditional probability of the dataset follows a Gaussian distribution (as in MLE),
the problem becomes:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \} =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 -\log p(\vec{\uptheta}) \}
\]
Moreover, assuming that $\vec{\uptheta} \sim \mathcal{N}(\nullvec, \sigma^2\matr{I})$, we have that, up to an additive constant:
\[ -\log p(\vec{\uptheta}) = \frac{1}{2\sigma^2} \Vert \vec{\uptheta} \Vert^2 \]
Therefore, absorbing the multiplicative constants into $\lambda$, the problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
MAP can thus be seen as a regularized version of MLE, where the prior acts as the regularizer.
\end{description}
\section{Linear regression}
\marginnote{Linear regression}
Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is Gaussian noise,
we want to estimate the function $f$.
\begin{description}
\item[Model]
We use as the predictor:
\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
Because of the noise, we use a probabilistic model with likelihood:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
\item[Parameter estimation]
To estimate $\vec{\uptheta}$, we can use MLE:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} -\log p_\vec{\uptheta}(\vec{y} \vert \matr{X}) \]
\end{description}
\subsection{Maximum likelihood estimation with features}
\marginnote{MLE with features}
Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
Therefore, it is possible to apply any transformation to the inputs of the predictor $f$ such that:
\[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ is a transformation and
$\vec{\uptheta} \in \mathbb{R}^K$ are the parameters.
Given a dataset of $N$ entries $\vec{x}_n \in \mathbb{R}^D$ with labels $y_n \in \mathbb{R}$
and a transformation function $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$,
the transformed features can be expressed through a feature matrix $\matr{\Phi} \in \mathbb{R}^{N \times K}$:
\[
\matr{\Phi} =
\begin{pmatrix}
(\phi(\vec{x}_1))^T \\ \vdots \\ (\phi(\vec{x}_N))^T
\end{pmatrix}
=
\begin{pmatrix}
\phi_0(\vec{x}_1) & \cdots & \phi_{K-1}(\vec{x}_1) \\
\vdots & \ddots & \vdots \\
\phi_0(\vec{x}_N) & \cdots & \phi_{K-1}(\vec{x}_N) \\
\end{pmatrix}
\]
The negative log-likelihood can be defined as:
\[
-\log p_\vec{\uptheta}(\vec{y} \,\vert\, \matr{X}) =
\frac{1}{2\sigma^2} (\vec{y} - \matr{\Phi}\vec{\uptheta})^T (\vec{y} - \matr{\Phi}\vec{\uptheta}) + \text{constant}
\]
As the problem is convex and $\matr{\Phi}$ is (usually) full rank, it can be solved directly using the normal equation:
\[
\matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \matr{\Phi}^T \vec{y} \iff
\vec{\uptheta} = (\matr{\Phi}^T \matr{\Phi})^{-1} \matr{\Phi}^T \vec{y}
\]
Alternatively, the negative log-likelihood can also be minimized with a gradient method (a NumPy sketch of the direct solution follows the RMSE definition below).
\begin{description}
\item[Root mean square error (RMSE)] \marginnote{Root mean square error (RMSE)}
RMSE is computed as:
\[
\sqrt{ \frac{1}{N} \Vert \vec{y} - \matr{\Phi}\vec{\uptheta} \Vert^2 } =
\sqrt{ \frac{1}{N} \sum_{n=1}^{N}(y_n - (\phi(\vec{x}_n))^T\vec{\uptheta})^2 }
\]
Unlike MSE, RMSE is expressed on the same scale as the labels,
which makes errors easier to interpret and to compare across datasets of different sizes.
By comparing the RMSE of the train and test sets, it is possible to check if a model is overfitting.
\end{description}
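As a concrete sketch (with illustrative data and an illustrative feature map), the fit and its RMSE can be computed with NumPy:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(-1.0, 1.0, 30)
y = np.sin(np.pi * x) + 0.1 * rng.normal(size=x.size)

# Feature matrix Phi with phi(x) = (1, x, sin(pi x))
Phi = np.column_stack([np.ones_like(x), x, np.sin(np.pi * x)])
# Least squares fit (the solution of the normal equation).
theta, *_ = np.linalg.lstsq(Phi, y, rcond=None)

rmse = np.sqrt(np.mean((y - Phi @ theta) ** 2))
\end{verbatim}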
\begin{description}
\item[Polynomial regression] \marginnote{Polynomial regression}
The transformation function $\phi: \mathbb{R} \rightarrow \mathbb{R}^K$ is defined as:
\[
\phi(x) =
\begin{pmatrix}
\phi_0(x) \\ \phi_1(x) \\ \phi_2(x) \\ \vdots \\ \phi_{K-1}(x)
\end{pmatrix}
=
\begin{pmatrix}
1 \\ x \\ x^2 \\ \vdots \\ x^{K-1}
\end{pmatrix}
\]
The predictor is then defined as:
\[
\begin{split}
f(x) &= (\phi(x))^T \vec{\uptheta} \\
&= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i = \sum_{i=0}^{K-1} x^i \vartheta_i
\end{split}
\]
\end{description}

View File

@ -0,0 +1,226 @@
\chapter{Matrix decomposition}
\section{Eigendecomposition}
\marginnote{Eigendecomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
If the eigenvectors of $\matr{A}$ form a basis of $\mathbb{R}^n$,
then $\matr{A} \in \mathbb{R}^{n \times n}$ can be decomposed into:
\[ \matr{A} = \matr{P}\matr{D}\matr{P}^{-1} \]
where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and
$\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.
Note that a symmetric matrix can always be decomposed in this form (\Cref{th:spectral_theorem}).
\section{Singular value decomposition}
\marginnote{Singular value decomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r \in [0, \min\{m, n\}]$.
The singular value decomposition (SVD) of $\matr{A}$ is always possible and has form:
\[
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
\]
\[
=
\begin{pmatrix}
\begin{pmatrix} \\ \vec{u}_1 \\ \\ \end{pmatrix} &
\dots &
\begin{pmatrix} \\ \vec{u}_m \\ \\ \end{pmatrix}
\end{pmatrix}
\begin{pmatrix}
\sigma_1 & 0 & 0 \\
0 & \ddots & 0 \\
0 & 0 & \sigma_{\min\{m, n\}} \\
\end{pmatrix}
\begin{pmatrix}
\begin{pmatrix} & \vec{v}_1 & \end{pmatrix} \\
\vdots \\
\begin{pmatrix} & \vec{v}_n & \end{pmatrix} \\
\end{pmatrix}
\]
where:
\begin{itemize}
\item
$\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix whose columns $\vec{u}_i$ are called left-singular vectors.
\item
$\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix whose columns $\vec{v}_i$ are called right-singular vectors.
\item
$\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a matrix with $\matr{\Sigma}_{i,j} = 0$ for $i \neq j$ (i.e. it would be diagonal if it were square) and
the singular values $\sigma_i, i = 1 \dots \min\{m, n\}$ on the diagonal.
By convention $\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_r \geq 0$.
Note that singular values $\sigma_j = 0$ for $(r + 1) \leq j \leq \min\{m, n\}$
(i.e. singular values at indexes after $\text{rank}(\matr{A})$ are always 0).
\end{itemize}
\marginnote{Singular value equation}
We can also represent SVD as a \textbf{singular value equation}, which resembles the eigenvalue equation:
\[ \matr{A}\vec{v}_i = \sigma_i\vec{u}_i \text{ for } i = 1, \dots, r \]
This is derived from:
\[
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}\matr{V}^T\matr{V}
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}
\]
\subsection{Singular values and eigenvalues}
\marginnote{Eigendecomposition of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$}
Given $\matr{A} \in \mathbb{R}^{m \times n}$, we can obtain the eigenvalues and eigenvectors
of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$ through SVD.
For $\matr{A}^T\matr{A}$, we can compute:
\[
\begin{split}
\matr{A}^T\matr{A} & = (\matr{U}\matr{\Sigma}\matr{V}^T)^T(\matr{U}\matr{\Sigma}\matr{V}^T) \text{ using } (\matr{A}\matr{B})^T = \matr{B}^T\matr{A}^T \\
& = (\matr{V}\matr{\Sigma}^T\matr{U}^T)(\matr{U}\matr{\Sigma}\matr{V}^T) \\
& = \matr{V}\matr{\Sigma}^T\matr{\Sigma}\matr{V}^T \\
& = \matr{V}\matr{\Sigma}^2\matr{V}^T
\end{split}
\]
As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
\begin{itemize}
\item The diagonal entries of $\matr{\Sigma}^2$ (i.e. the squared singular values of $\matr{A}$) are the eigenvalues of $\matr{A}^T\matr{A}$.
\item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
\end{itemize}
The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
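A quick numerical check with NumPy (the matrix is illustrative):
\begin{verbatim}
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 3.0], [0.0, 2.0]])
U, s, Vt = np.linalg.svd(A)

# Eigenvalues of A^T A are the squared singular values of A
# (np.linalg.eigh returns them in ascending order).
evals, evecs = np.linalg.eigh(A.T @ A)
assert np.allclose(np.sort(s ** 2), evals)
\end{verbatim}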
\subsection{Singular values and 2-norm}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
we have that $\matr{A}^T\matr{A} = \matr{A}^2 = \matr{A}\matr{A}^T$ (as $\matr{A}^T = \matr{A}$).
The eigenvalues of $\matr{A}^2$ are $\lambda_1^2, \dots,\lambda_n^2$, where $\lambda_i$ are eigenvalues of $\matr{A}$.
Alternatively, the eigenvalues of $\matr{A}^2$ are the squared singular values of $\matr{A}$: $\lambda_i^2 = \sigma_i^2$.
Moreover, the eigenvalues of $\matr{A}^{-1}$ are $\frac{1}{\lambda_1}, \dots, \frac{1}{\lambda_n}$.
\marginnote{2-norm using SVD}
We can compute the 2-norm as:
\[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
\[
\Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} =
\sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} =
\sqrt{\max \left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2} \right\}} = \frac{1}{\sigma_r}
\]
Furthermore, we can compute the condition number of $\matr{A}$ as:
\[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} \]
\subsection{Application: Matrix approximation}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ and its SVD decomposition $\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T$,
we can construct a rank-1 matrix (dyad) $\matr{A}_i \in \mathbb{R}^{m \times n}$ as: \marginnote{Dyad}
\[ \matr{A}_i = \vec{u}_i \vec{v}_i^T \]
where $\vec{u}_i \in \mathbb{R}^m$ is the $i$-th column of $\matr{U}$ and
$\vec{v}_i \in \mathbb{R}^n$ is the $i$-th column of $\matr{V}$.
Then, we can compose $\matr{A}$ as a sum of dyads:
\[ \matr{A} = \sum_{i=1}^{r} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{r} \sigma_i \matr{A}_i \]
\marginnote{Rank-$k$ approximation}
By considering only the first $k < r$ singular values, we can obtain a rank-$k$ approximation of $\matr{A}$:
\[ \hat{\matr{A}}(k) = \sum_{i=1}^{k} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{k} \sigma_i \matr{A}_i \]
\begin{theorem}[Eckart-Young]
Given $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r$.
For any $k \leq r$ (this theorem is interesting for $k < r$), the rank-$k$ approximation is:
\[
\hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2
\]
\end{theorem}
In other words, among all matrices of rank $k$, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$ in 2-norm.
Moreover, the error of the rank-$k$ approximation is:
\[
\Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 =
\left\Vert \sum_{i=1}^{r} \sigma_i \matr{A}_i - \sum_{j=1}^{k} \sigma_j \matr{A}_j \right\Vert_2 =
\left\Vert \sum_{i=k+1}^{r} \sigma_i \matr{A}_i \right\Vert_2 =
\sigma_{k+1}
\]
\subsubsection{Image compression}
Each dyad requires storing $1 + m + n$ numbers (for $\sigma_i$, $\vec{u}_i$ and $\vec{v}_i$, respectively).
A rank-$k$ approximation therefore requires storing $k(1 + m + n)$ numbers.
Therefore, the compression factor is given by: \marginnote{Compression factor}
\[
c_k = 1 - \frac{k(1 + m + n)}{mn}
\]
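A minimal NumPy sketch of the rank-$k$ approximation, with a random matrix standing in for the image:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
img = rng.normal(size=(64, 96))                 # stand-in for an image
U, s, Vt = np.linalg.svd(img, full_matrices=False)

k = 10
approx = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]  # rank-k approximation
m, n = img.shape
c_k = 1 - k * (1 + m + n) / (m * n)             # compression factor

# Eckart-Young: the 2-norm error is the (k+1)-th singular value.
assert np.isclose(np.linalg.norm(img - approx, 2), s[k])
\end{verbatim}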
\begin{figure}[h]
\centering
\includegraphics[width=0.60\textwidth]{img/_rank_k_approx.pdf}
\caption{Approximation of an image}
\end{figure}
\subsection{Application: Linear least squares problem} \label{sec:lls}
Given a least squares problem:
\[
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
When $\text{rank}(\matr{A}) < n$, the system admits infinite solutions.
Of all the solutions $S$, we are interested in the one with minimum norm:
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
This problem can be solved using SVD:
\[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
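A direct implementation of this formula (on an illustrative rank-deficient system):
\begin{verbatim}
import numpy as np

A = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])  # rank 1 < n
b = np.array([1.0, 2.0, 3.0])

U, s, Vt = np.linalg.svd(A)
r = int(np.sum(s > 1e-12))                 # numerical rank
x_star = sum((U[:, i] @ b) / s[i] * Vt[i] for i in range(r))

# Same result as the pseudoinverse (minimum-norm) solution.
assert np.allclose(x_star, np.linalg.pinv(A) @ b)
\end{verbatim}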
\subsection{Application: Polynomial interpolation}
\marginnote{Polynomial interpolation}
Given a set of $m$ data points $(x_i, y_i), i=1, \dots, m$,
we want to find a polynomial of degree $n$ ($m > n$) that approximates them.
In other words, we want to find a function:
\[ f(x) = c_0 + c_1 x + c_2 x^2 + \dots + c_n x^n \]
that minimizes the residual vector $\vec{r} = (r_1, \dots, r_m)$,
where $r_i = \vert y_i - f(x_i) \vert$.
We can formulate this as a linear system:
\[
\vec{r} = \vec{y} - \matr{A}\vec{c} =
\begin{pmatrix}
y_1 \\
\vdots \\
y_m
\end{pmatrix}
-
\begin{pmatrix}
1 & x_1 & x_1^2 & \dots & x_1^n \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x_m & x_m^2 & \dots & x_m^n
\end{pmatrix}
\begin{pmatrix}
c_0 \\
\vdots \\
c_n
\end{pmatrix}
\]
that can be solved as a linear least squares problem:
\[ \min_{\vec{c} \in \mathbb{R}^{n+1}} \Vert \vec{y} - \matr{A}\vec{c} \Vert_2^2 \]
\begin{figure}[h]
\centering
\includegraphics[width=0.40\textwidth]{img/linear_regression.png}
\caption{Interpolation using a polynomial of degree 1}
\end{figure}
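For polynomials, this least squares fit is also available directly. A sketch of the degree-1 case shown in the figure (the data is made up):
\begin{verbatim}
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.1, 0.9, 2.1, 2.9, 4.2])

# Degree-1 least squares fit; coefficients from highest to lowest power.
c1, c0 = np.polyfit(x, y, deg=1)
\end{verbatim}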
\section{Eigendecomposition vs SVD}
\begin{center}
\begin{tabular}{m{16em} | m{16em}}
\hline
\multicolumn{1}{c|}{\textbf{Eigendecomposition}} & \multicolumn{1}{c}{\textbf{SVD}} \\
\multicolumn{1}{c|}{$\matr{A} = \matr{P}\matr{D}\matr{P}^{-1}$} & \multicolumn{1}{c}{$\matr{A}=\matr{U}\matr{\Sigma}\matr{V}^T$} \\
\hline
Only defined for square matrices $\matr{A} \in \mathbb{R}^{n \times n}$ with eigenvectors that form a basis of $\mathbb{R}^n$
& Always exists \\
\hline
$\matr{P}$ is not necessarily orthogonal & $\matr{U}$ and $\matr{V}$ are orthogonal \\
\hline
The elements on the diagonal of $\matr{D}$ may be in $\mathbb{C}$
& The elements on the diagonal of $\matr{\Sigma}$ are all non-negative reals \\
\hline
\multicolumn{2}{c}{For symmetric positive semidefinite matrices, eigendecomposition and SVD coincide} \\
\hline
\end{tabular}
\end{center}

View File

@ -0,0 +1,522 @@
\chapter{Probability and statistics}
\begin{description}
\item[Probability]
Model of a process where the underlying uncertainty is captured by random variables.
\item[Statistics]
Determines the underlying process that explains an observation.
\end{description}
\section{Probability}
\begin{description}
\item[State space] \marginnote{State space}
Set $\Omega$ of all the possible results of an experiment.
\begin{example}
A coin is tossed two times.
$\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$
\end{example}
\item[Event] \marginnote{Event}
Set of possible results (i.e. $A$ is an event if $A \subseteq \Omega$).
\item[Probability] \marginnote{Probability}
Let $\mathcal{E}$ be the set of all the possible events (i.e. the power set of $\Omega$).
Probability is a function that assigns a value to each event:
\[ \prob{\cdot}: \mathcal{E} \rightarrow [0, 1] \]
\begin{example}
Let $\Omega$ be as above.
Given an event $A = \{ (\text{T}, \text{H}), (\text{H}, \text{T}) \}$,
its probability is: $\prob{A} = \frac{2}{4} = \frac{1}{2}$
\end{example}
\item[Conditional probability] \marginnote{Conditional probability}
Probability of an event $B$, knowing that another event $A$ happened:
\[ \prob{B \vert A} = \frac{\prob{A \cap B}}{\prob{A}} \text{, with } \prob{A} \neq 0 \]
\begin{example}
A coin is tossed three times.
Given the events $A = \{ \text{at least two tails} \}$ and $B = \{ \text{at least one heads and one tails} \}$,
we have that:
\begin{minipage}{\linewidth}
\centering
\small
$\Omega = \{
(\text{T}, \text{T}, \text{T}), (\text{T}, \text{T}, \text{H}), (\text{T}, \text{H}, \text{T})
(\text{T}, \text{H}, \text{H}), (\text{H}, \text{T}, \text{T}), (\text{H}, \text{T}, \text{H})
(\text{H}, \text{H}, \text{T}), (\text{H}, \text{H}, \text{H})
\}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{A} = \frac{4}{8} = \frac{1}{2}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{B} = \frac{6}{8} = \frac{3}{4}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{A \cap B} = \frac{3}{8}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{A \vert B} = \frac{3/8}{3/4} = \frac{1}{2}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{B \vert A} = \frac{3/8}{1/2} = \frac{3}{4}$
\end{minipage}
\end{example}
\item[Independent events] \marginnote{Independent events}
Two events $A$ and $B$ are independent if:
\[ \prob{A \cap B} = \prob{A}\prob{B} \]
It follows that:
\begin{minipage}{.48\linewidth}
\centering
$\prob{A \vert B} = \prob{A}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{B \vert A} = \prob{B}$
\end{minipage}
In general, $n$ events $A_1, \dots, A_n$ are (mutually) independent if the product rule holds for every subset of them; in particular:
\[ \prob{A_1 \cap \dots \cap A_n} = \prod_{i=1}^{n} \prob{A_i} \]
\end{description}
\section{Random variables}
\begin{description}
\item[Random variable (RV)] \marginnote{Random variable}
A random variable $X$ is a function:
\[ X: \Omega \rightarrow \mathbb{R} \]
\item[Target space/Support] \marginnote{Target space}
Given a random variable $X$,
the target space (or support) $\mathcal{T}_X$ of $X$ is the set of all its possible values:
\[ \mathcal{T}_X = \{ x \mid x = X(\omega), \forall \omega \in \Omega \} \]
\end{description}
\subsection{Discrete random variables}
\begin{description}
\item[Discrete random variable] \marginnote{Discrete random variable}
A random variable $X$ is discrete if its target space $\mathcal{T}_X$ is finite or countably infinite.
\begin{example}
A coin is tossed twice.
Given the random variable $X(\omega) = \{ \text{number of heads} \}$.
We have that $\mathcal{T}_X = \{ 0, 1, 2 \}$, therefore $X$ is discrete.
\end{example}
\begin{example}
Roll a die until 6 comes out.
Given the random variable $Y(\omega) = \{ \text{number of rolls until 6 comes out} \}$.
We have that $\mathcal{T}_Y = \{ 1, 2, \dots \} = \mathbb{N} \smallsetminus \{0\}$,
therefore $Y$ is discrete as $\mathcal{T}_Y$ is a countable set.
\end{example}
\item[Probability mass function (PMF)] \marginnote{Probability mass function (PMF)}
Given a discrete random variable $X$, its probability mass function is a function $p_X: \mathcal{T}_X \rightarrow [0, 1]$ such that:
\[ p_X(x) = \prob{X = x}, \forall x \in \mathcal{T}_X \]
A PMF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\sum_{x \in \mathcal{T}_X} p_X(x) = 1$
\item Let $A \subseteq \Omega$, $\prob{X = x \in A} = \sum_{x \in A} p_X(x)$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PMF $p_X$.
\begin{example}
Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
Given a random variable $X = \{ \text{number of heads} \}$ with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
Its PMF is:
\[
\begin{split}
p_X(0) &= \prob{X = 0} = \frac{1}{4} \\
p_X(1) &= \prob{X = 1} = \frac{2}{4} \\
p_X(2) &= \prob{X = 2} = \frac{1}{4}
\end{split}
\]
\end{example}
\end{description}
\subsection{Continuous random variables}
\begin{description}
\item[Continuous random variable] \marginnote{Continuous random variable}
A random variable $X$ is continuous if its target space $\mathcal{T}_X$ is uncountably infinite (e.g. an interval of $\mathbb{R}$).
Usually, $\mathcal{T}_X$ is an interval or a union of intervals.
\begin{example}
Given a random variable $Z = \{ \text{Time before the arrival of a client} \}$.
$Z$ is continuous as $\mathcal{T}_Z = [a, b] \subseteq [0, +\infty[$ is an uncountable set.
\end{example}
\item[Probability density function (PDF)] \marginnote{Probability density function (PDF)}
Given a continuous random variable $X$,
its probability density function is a function $p_X: \mathcal{T}_X \rightarrow \mathbb{R}$ such that:
\[ \prob{X \in A} = \int_{A} p_X(x) \,dx \]
\[ \prob{a \leq X \leq b} = \int_{a}^{b} p_X(x) \,dx \]
Note that $\prob{X = a} = \prob{a \leq X \leq a} = \int_{a}^{a} p_X(x) \,dx = 0$.
A PDF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\int_{x \in \mathcal{T}_X} p_X(x) \,dx = 1$
\item $\prob{X \in A} = \int_{A} p_X(x) \,dx$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PDF $p_X$.
\end{description}
\section{Discrete joint distribution}
\begin{description}
\item[Univariate distribution] \marginnote{Univariate distribution}
Distribution with one random variable.
\item[Multivariate distribution] \marginnote{Multivariate distribution}
Distribution with multiple random variables.
\item[Joint probability] \marginnote{Joint probability}
Let $X$ and $Y$ be random variables respectively with target space $\mathcal{T}_X$ and $\mathcal{T}_Y$.
The joint probability of $X$ and $Y$ has target space $\mathcal{T}_{XY} = \mathcal{T}_X \times \mathcal{T}_Y$
and its PMF is:
\[ p_{XY}(x_i, y_j) = \prob{X = x_i \cap Y = y_j} \]
$p_X(x)$ and $p_Y(y)$ are the \textbf{marginal probabilities}. \marginnote{Marginal probability}
\begin{example}
Let $X$ and $Y$ be random variables respectively with five and three possible states.
\begin{center}
\includegraphics[width=0.4\textwidth]{img/_joint_probability_example.pdf}
\end{center}
We denote with:
\begin{itemize}
\item $N$ the number of events.
\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p_{XY}(x_i, y_j) = \frac{n_{ij}}{N}$).
\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column.
\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row.
\end{itemize}
The marginal probabilities are:\\
\begin{minipage}{.48\linewidth}
\centering
\[ p_X(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
\[ p_Y(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
\end{minipage}
The conditional probabilities can be computed as:
\[ \prob{Y = y_j \vert X = x_i} = \frac{p_{XY}(x_i, y_j)}{p_X(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
\[ \prob{X = x_i \vert Y = y_j} = \frac{p_{XY}(x_i, y_j)}{p_Y(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
\end{example}
\end{description}
\section{Rules of probability}
\subsection{Sum rule}
\marginnote{Sum rule\\Marginalization property}
Given two random variables $X$ and $Y$, the sum rule states that:
\[
p_X(\bm{x}) =
\begin{cases}
\sum_{\bm{y} \in \mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) & \text{if } Y \text{ is discrete} \\
\int_{\mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } Y \text{ is continuous}
\end{cases}
\]
The sum rule relates the joint distribution and the marginal distribution.
More generally, the sum rule can be applied to any subset of the random variables of a joint distribution.
Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$,
the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\smallsetminus i} \]
\subsection{Product rule}
\marginnote{Product rule}
\[ p(\bm{x}, \bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) = p(\bm{x} \vert \bm{y}) p(\bm{y}) \]
\section{Bayes' theorem}
\begin{theorem}
\marginnote{Bayes' theorem}
Given two random variables $X$ and $Y$:
\[
\overbrace{p(\bm{x} \vert \bm{y})}^{\mathclap{\text{posterior}}} =
\frac
{ \overbrace{p(\bm{y} \vert \bm{x})}^{\mathclap{\text{likelihood }}} \overbrace{p(\bm{x})}^{\mathclap{\text{ prior}}} }
{\underbrace{p(\bm{y})}_{\mathclap{\text{evidence}}}}
\]
where:
\begin{descriptionlist}
\item[Prior] \marginnote{Prior}
is the prior knowledge of the unobserved data $\bm{x}$.
\item[Likelihood] \marginnote{Likelihood}
describes the relation between $\bm{x}$ and $\bm{y}$.
\item[Posterior] \marginnote{Posterior}
represents the quantity of interest (i.e. knowledge on $\bm{x}$ after observing $\bm{y}$).
\item[Evidence/Marginal likelihood] \marginnote{Evidence/Marginal likelihood}
normalizes the posterior. It is defined independently from $\bm{x}$ (i.e. is constant) as:
\[ p(\bm{y}) = \int p(\bm{y} \vert \bm{x}) p(\bm{x}) \,d\bm{x} \]
\end{descriptionlist}
\end{theorem}
\begin{proof}
This is a direct consequence of the product rule:
\[
p(\bm{x} \vert \bm{y}) p(\bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) \iff
p(\bm{x} \vert \bm{y}) = \frac{p(\bm{y} \vert \bm{x}) p(\bm{x})}{p(\bm{y})}
\]
\end{proof}
Note: sometimes, instead of the full posterior, the maximum is considered (with loss of information):
\[ \max_x p(x \vert y) = \max_x \frac{p(y \vert x) p(x)}{\underbrace{p(y)}_{\mathclap{\text{constant}}}} = \max_x p(y \vert x) p(x) \]
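A small numeric illustration of Bayes' theorem with made-up discrete distributions:
\begin{verbatim}
# p(x): prior over the hidden state x.
prior = {"rain": 0.3, "sun": 0.7}
# p(y = "wet grass" | x): likelihood of the observation.
likelihood = {"rain": 0.9, "sun": 0.2}

evidence = sum(likelihood[x] * prior[x] for x in prior)   # p(y)
posterior = {x: likelihood[x] * prior[x] / evidence for x in prior}
# posterior["rain"] = 0.27 / 0.41 ~ 0.66
\end{verbatim}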
\section{Statistics}
\begin{description}
\item[Statistic] \marginnote{Statistic}
A statistic of a random variable is a deterministic function defined on it.
\end{description}
\subsection{Mean}
\begin{description}
\item[Expected value (univariate)] \marginnote{Expected value (univariate)}
Given a function $g$ of a random variable $X \sim p(x)$,
its expected value is:
\[
\mathbb{E}_X[g(x)] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} g(x)p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} g(x)p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
\item[Expected value (multivariate)] \marginnote{Expected value (multivariate)}
A multivariate random variable $X$ can be seen as
a vector of univariate random variables $\begin{pmatrix} X_1, \dots, X_D \end{pmatrix}^T$.
Its expected value can be computed element-wise as:
\[
\mathbb{E}_X[g(\bm{x})] =
\begin{pmatrix} \mathbb{E}_{X_1}[g(x_1)] \\ \vdots \\ \mathbb{E}_{X_D}[g(x_D)] \end{pmatrix} \in \mathbb{R}^D
\]
\item[Mean] \marginnote{Mean}
Given a random variable $X \sim p(x)$,
the mean of $X$ is its expected value with $g$ defined as the identity:
\[
\mathbb{E}_X[x] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} x \cdot p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} x \cdot p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
\end{description}
\subsection{Variance}
\begin{description}
\item[Covariance (univariate)] \marginnote{Covariance (univariate)}
Given two univariate random variables $X$ and $Y$, their covariance is:
\[ \text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[(x - \mathbb{E}_X[x])(y - \mathbb{E}_Y[y])] \]
\begin{lemma}
$\text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[xy] - \mathbb{E}_{X}[x]\mathbb{E}_{Y}[y]$
\end{lemma}
\item[Variance (univariate)] \marginnote{Variance (univariate)}
The variance of a univariate random variable is given by:
\[ \mathbb{V}_X[x] = \text{Cov}_X[x, x] \]
Its square root is the standard deviation $\sigma(x)$.
\item[Covariance (multivariate)] \marginnote{Covariance (multivariate)}
Given two multivariate random variables
$X$ and $Y$ with states $\bm{x} \in \mathbb{R}^D$ and $\bm{y} \in \mathbb{R}^E$,
their covariance is:
\[
\text{Cov}_{XY}[\bm{x}, \bm{y}] = \text{Cov}_{XY}[\bm{y}, \bm{x}]^T =
\mathbb{E}_{XY}[\bm{xy}^T] - \mathbb{E}_{X}[\bm{x}]\mathbb{E}_{Y}[\bm{y}]^T \in \mathbb{R}^{D \times E}
\]
\item[Variance (multivariate)] \marginnote{Variance (multivariate)}
Given a multivariate random variable $X$ with
states $\bm{x} \in \mathbb{R}^D$ and mean vector $\bm{\mu} \in \mathbb{R}^D$.
Its variance is given by:
\[
\begin{split}
\mathbb{V}_X[\bm{x}] &= \text{Cov}_X[\bm{x}, \bm{x}] \\
&= \mathbb{E}_X[\bm{xx}^T] - \mathbb{E}_X[\bm{x}]\mathbb{E}_X[\bm{x}]^T \\
&=
\begin{pmatrix}
\text{Cov}[x_1, x_1] & \text{Cov}[x_1, x_2] & \cdots & \text{Cov}[x_1, x_D] \\
\text{Cov}[x_2, x_1] & \text{Cov}[x_2, x_2] & \cdots & \text{Cov}[x_2, x_D] \\
\vdots & \vdots & \ddots & \vdots \\
\text{Cov}[x_D, x_1] & \text{Cov}[x_D, x_2] & \cdots & \text{Cov}[x_D, x_D] \\
\end{pmatrix} \in \mathbb{R}^{D \times D}
\end{split}
\]
This matrix is called covariance matrix and is symmetric positive semidefinite.
\item[Correlation] \marginnote{Correlation}
Given two random variables $X$ and $Y$, their correlation is:
\[ \text{corr}[x, y] = \frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}} \in [-1, 1] \]
\begin{itemize}
\item When $\text{corr}[x, y] \rightarrow +1$, $x$ and $y$ are expected to grow together.
\item When $\text{corr}[x, y] \rightarrow -1$, $x$ grows when $y$ decreases and vice versa.
\item When $\text{corr}[x, y] \rightarrow 0$, $x$ and $y$ are not correlated.
\end{itemize}
\end{description}
\subsection{Empirical mean and variance}
In practice, it is not always possible to compute statistics on the whole population.
Empirical estimates are instead computed on a finite sample of the population,
modeled as $N$ i.i.d. random variables $X_1, \dots, X_N$.
\begin{description}
\item[Empirical mean] \marginnote{Empirical mean}
\[ \bar{x} = \frac{1}{N} \sum_{n=1}^{N}x_n \]
\item[Empirical variance] \marginnote{Empirical variance}
\[ \sigma^2 = \frac{1}{N} \sum_{n=1}^{N}(x_n - \bar{x})^2 \]
\end{description}
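Both statistics are immediate to compute; a NumPy sketch (note that NumPy's default variance is also the $\frac{1}{N}$ version):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
samples = rng.normal(loc=2.0, scale=3.0, size=10_000)

x_bar = samples.mean()                    # empirical mean, ~2
sigma2 = np.mean((samples - x_bar) ** 2)  # empirical variance, ~9
assert np.isclose(sigma2, samples.var())  # np default uses 1/N too
\end{verbatim}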
\section{Random variables properties}
\subsection{Manipulations}
\begin{itemize}
\item $\mathbb{E}[\bm{x} + \bm{y}] = \mathbb{E}[\bm{x}] + \mathbb{E}[\bm{y}]$
\marginnote{Manipulations of random variables}
\item $\mathbb{E}[\bm{x} - \bm{y}] = \mathbb{E}[\bm{x}] - \mathbb{E}[\bm{y}]$
\item $\mathbb{V}[\bm{x} + \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] + \text{Cov}[\bm{x}, \bm{y}] + \text{Cov}[\bm{y}, \bm{x}]$
\item $\mathbb{V}[\bm{x} - \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] - \text{Cov}[\bm{x}, \bm{y}] - \text{Cov}[\bm{y}, \bm{x}]$
\end{itemize}
\subsection{Statistical independence}
\marginnote{Statistical independence}
Two random variables $X$ and $Y$ are statistically independent iff:
\[ p(\bm{x}, \bm{y}) = p(\bm{x})p(\bm{y}) \]
\begin{theorem}
If $X$ and $Y$ are statistically independent, then:
\begin{itemize}
\item $p(\bm{x} \vert \bm{y}) = p(\bm{x})$ and $p(\bm{y} \vert \bm{x}) = p(\bm{y})$
\item $\mathbb{V}_{XY}[\bm{x} + \bm{y}] = \mathbb{V}_X[\bm{x}] + \mathbb{V}_Y[\bm{y}]$
\item $\text{Cov}_{XY}[\bm{x}, \bm{y}] = \nullvec$
\end{itemize}
\end{theorem}
\subsection{Conditional independence}
\marginnote{Conditional independence}
Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
\[ p(\bm{x}, \bm{y} \vert \bm{z}) = p(\bm{x} \vert \bm{z}) p(\bm{y} \vert \bm{z}) \, \forall \bm{z} \in \mathcal{T}_Z \]
\subsection{Inner product}
\marginnote{Inner product of random variables}
Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
\[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
The covariance matrix is symmetric positive definite.
Moreover, we have that:
\begin{itemize}
\item $\Vert X \Vert = \sqrt{\langle X, X \rangle} = \sqrt{\text{Cov}[x, x]} = \sqrt{\mathbb{V}[x]} = \sigma[x]$
\item
$\cos\theta = \frac{\langle X, Y \rangle}{\Vert X \Vert \cdot \Vert Y \Vert} =
\frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}}$, where $\theta$ is the angle between $X$ and $Y$.
\item $X \perp Y \iff \langle X, Y \rangle = 0 \iff \text{Cov}[x, y] = 0 \iff X \text{ and } Y \text{ uncorrelated}$
\end{itemize}
\section{Common distributions}
\subsection{Discrete random variables}
\begin{descriptionlist}
\item[Uniform distribution] \marginnote{Uniform distribution}
Given a discrete random variable $X$ with $\vert \mathcal{T}_X \vert = N$,
$X$ has a uniform distribution if:
\[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
\item[Poisson distribution] \marginnote{Poisson distribution}
Given a discrete random variable $X$ with mean $\lambda > 0$,
$X$ has a Poisson distribution if:
\[ p_X(x) = e^{-\lambda} \frac{\lambda^x}{x!}, \forall x \in \mathcal{T}_X \]
A Poisson distribution has $\mathbb{E}[x] = \lambda$ and $\mathbb{V}[x] = \lambda$.
\end{descriptionlist}
\subsection{Continuous random variables}
\begin{descriptionlist}
\item[Continuous uniform distribution] \marginnote{Continuous uniform distribution}
Given a continuous random variable $X$ with $\mathcal{T}_X = [a, b]$,
$X$ has a continuous uniform distribution if:
\[ p_X(x) = \frac{1}{b-a}, \forall x \in \mathcal{T}_X \]
\item[Normal distribution] \marginnote{Normal distribution}
Given a continuous random variable $X$ and the parameters $\mu$ (mean) and $\sigma^2$ (variance).
$X$ has a normal distribution if:
\[ p_X(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{\frac{-(x-\mu)^2}{2\sigma^2}} , \forall x \in \mathcal{T}_X\]
In the multivariate case, it is defined as:
\[
p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}, \matr{\Sigma}) =
(2\pi)^{-\frac{D}{2}} \vert \matr{\Sigma} \vert^{-\frac{1}{2}} e^{(-\frac{1}{2}(\bm{x} - \bm{\mu})^T\matr{\Sigma}^{-1}(\bm{x}-\bm{\mu}))}
\in \mathbb{R}
\]
where $\bm{\mu}$ is the mean vector and $\matr{\Sigma}$ the covariance matrix.
\begin{description}
\item[Standard normal distribution] \marginnote{Standard normal distribution}
Normal distribution with $\mu = 0$ and $\sigma = 1$ (univariate) or
$\bm{\mu} = \nullvec$ and $\matr{\Sigma} = \matr{I}$ (multivariate).
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.40\textwidth]{img/normal_distribution.png}
\caption{Normal distributions and standard normal distribution}
\end{figure}
\begin{theorem}[Linearity]
\marginnote{Gaussian sum and linear transformations}
Given $X$ and $Y$ independent Gaussian random variables with
$p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}_x, \matr{\Sigma}_x)$ and
$p(\bm{y}) = \mathcal{N}(\bm{y} \vert \bm{\mu}_y, \matr{\Sigma}_y)$.
It holds that (a Monte Carlo check is sketched after this list):
\[ p(a\bm{x} + b\bm{y}) = \mathcal{N}(a\bm{\mu}_x + b\bm{\mu}_y, a^2\matr{\Sigma}_x + b^2\matr{\Sigma}_y) \]
\end{theorem}
\end{descriptionlist}
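A Monte Carlo check of the linearity theorem (the parameters are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
a, b = 2.0, -1.0
x = rng.normal(loc=1.0, scale=2.0, size=100_000)  # mu=1, sigma^2=4
y = rng.normal(loc=3.0, scale=1.0, size=100_000)  # mu=3, sigma^2=1

z = a * x + b * y
# Theory: mean = 2*1 - 1*3 = -1, variance = 4*4 + 1*1 = 17
print(z.mean(), z.var())  # ~ -1.0, ~ 17.0
\end{verbatim}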

View File

@ -0,0 +1,360 @@
\chapter{Vector calculus}
\section{Gradient of real-valued multivariate functions}
\begin{description}
\item[Gradient] \marginnote{Gradient}
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
the gradient is a row vector containing the partial derivatives of $f$:
\[
\nabla f(\vec{x}) =
\begin{pmatrix}
\frac{\partial f(\vec{x})}{\partial x_1} & \frac{\partial f(\vec{x})}{\partial x_2} & \dots & \frac{\partial f(\vec{x})}{\partial x_n}
\end{pmatrix}
\in \mathbb{R}^{1 \times n}
\]
\item[Hessian] \marginnote{Hessian matrix}
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
the Hessian matrix $\matr{H} \in \mathbb{R}^{n \times n}$ contains the second derivatives of $f$:
\[
\matr{H} =
\begin{pmatrix}
\frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \dots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\
\frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \dots & \vdots \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial^2 f}{\partial x_n \partial x_1} & \dots & \dots & \frac{\partial^2 f}{\partial x_n^2}
\end{pmatrix}
\]
In other words, $H_{i,j} = \frac{\partial^2 f}{\partial x_i \partial x_j}$.
Moreover, $\matr{H}$ is symmetric when the second derivatives are continuous
(a finite-difference check of the gradient is sketched after this list).
\end{description}
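A finite-difference check of an analytic gradient (the function is illustrative):
\begin{verbatim}
import numpy as np

def f(x):       # f(x1, x2) = x1^2 x2 + sin(x2)
    return x[0] ** 2 * x[1] + np.sin(x[1])

def grad_f(x):  # analytic gradient
    return np.array([2 * x[0] * x[1], x[0] ** 2 + np.cos(x[1])])

x0 = np.array([1.0, 0.5])
h = 1e-6        # central differences along each coordinate
fd = np.array([(f(x0 + h * e) - f(x0 - h * e)) / (2 * h)
               for e in np.eye(2)])
assert np.allclose(fd, grad_f(x0), atol=1e-5)
\end{verbatim}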
\subsection{Partial differentiation rules}
\begin{description}
\item[Product rule] \marginnote{Product rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x})g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} g(\vec{x}) + f(\vec{x}) \frac{\partial g}{\partial \vec{x}}
\]
\item[Sum rule] \marginnote{Sum rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x}) + g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} + \frac{\partial g}{\partial \vec{x}}
\]
\item[Chain rule] \marginnote{Chain rule}
Let $f: \mathbb{R}^n \rightarrow \mathbb{R}$ and $\vec{g}$ a vector of $n$ functions $g_i: \mathbb{R}^m \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f \circ \vec{g})(\vec{x}) =
\frac{\partial}{\partial \vec{x}} \Big( f(\vec{g}(\vec{x})) \Big) =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial \vec{x}}
\]
For instance, consider a $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables
$g_1(t), g_2(t): \mathbb{R} \rightarrow \mathbb{R}$ that are functions of $t$.
The gradient of $f$ with respect to $t$ is:
\[
\frac{\text{d}f}{\text{d}t} =
% \frac{\partial f}{\partial (g_1, g_2)} \frac{\partial (g_1, g_2)}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
= \frac{\partial f}{\partial g_1} \frac{\partial g_1}{\partial t} + \frac{\partial f}{\partial g_2} \frac{\partial g_2}{\partial t}
\]
In other words, the first matrix represents the gradient of $f$ w.r.t. its variables and
the second matrix contains in the $i$-th row the gradient of $g_i$.
Therefore, if $g_i$ are in turn multivariate functions $g_1(s, t), g_2(s, t): \mathbb{R}^2 \rightarrow \mathbb{R}$,
the chain rule can be applied as follows:
\[
\frac{\text{d}f}{\text{d}(s, t)} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial s} & \frac{\partial g_1}{\partial t} \\
\frac{\partial g_2}{\partial s} & \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\begin{example}
Let $f(x_1, x_2) = x_1^2 + 2x_2$, where $x_1 = \sin(t)$ and $x_2 = \cos(t)$.
\[
\begin{split}
\frac{\text{d}f}{\text{d}t} & =
\frac{\partial f}{\partial x_1}\frac{\partial x_1}{\partial t} + \frac{\partial f}{\partial x_2}\frac{\partial x_2}{\partial t} \\
& = (2x_1)(\cos(t)) + (2)(-\sin(t)) \\
& = 2\sin(t)\cos(t) - 2\sin(t)
\end{split}
\]
\end{example}
\begin{example}
Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ \vec{g})(t) = f(\vec{g}(t))$ where:
\[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(g_1, g_2) = \exp(g_1 g_2^2) \]
\[
\vec{g}: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as }
\vec{g}(t) = \begin{pmatrix} g_1 \\ g_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
\]
The gradient of $h$ with respect to $t$ can be computed as:
\[
\frac{\text{d} h}{\text{d} t} =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\[
=
\begin{pmatrix} \exp(g_1 g_2^2)g_2^2 & 2\exp(g_1 g_2^2)g_1 g_2 \end{pmatrix}
\begin{pmatrix} \cos(t) + (-t\sin(t)) \\ \sin(t) + t\cos(t) \end{pmatrix}
\]
\end{example}
\begin{example}[Gradient of a least squares loss] \marginnote{Least squares loss gradient}
Given a linear model defined on $\vec{\uptheta}$:
\[ \vec{y} = \matr{\Phi}\vec{\uptheta} \]
with $\vec{\uptheta} \in \mathbb{R}^D$, $\matr{\Phi} \in \mathbb{R}^{N \times D}$ and $\vec{y} \in \mathbb{R}^N$.
We can define the least squares loss function as:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 \]
\[ \vec{e}(\vec{\uptheta}) = \vec{y} - \matr{\Phi}\vec{\uptheta} \]
It must be noted that:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 = \vec{e}^T\vec{e} = \sum_{i=1}^{N} \vec{e}_i^2 \]
To compute the gradient of $L$ with respect to $\vec{\uptheta}$, we can use the chain rule:
\[
\begin{split}
\nabla L(\vec{\uptheta}) &= \frac{\partial L}{\partial \vec{e}} \frac{\partial \vec{e}}{\partial \vec{\uptheta}}
= (2\vec{e}^T) (-\matr{\Phi}) \\
& = -2(\vec{y}^T - \vec{\uptheta}^T \matr{\Phi}^T)\matr{\Phi} \\
& = -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi})
\end{split}
\]
Note that if we enforce $\nabla L(\vec{\uptheta}) = \nullvec$, we obtain the normal equation of \Cref{sec:lls}:
\[
\begin{split}
\nabla L = 0 &\iff -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi}) = \nullvec \\
&\iff \vec{y}^T \matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi} = \nullvec \\
&\iff \matr{\Phi}^T \vec{y} - \matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \nullvec
\end{split}
\]
\end{example}
\end{description}
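Using the gradient derived above, plain gradient descent reaches the normal-equation solution. A minimal sketch with synthetic data and a hand-picked step size:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 100, 3
Phi = rng.normal(size=(N, D))
y = Phi @ np.array([1.0, -1.0, 2.0]) + 0.05 * rng.normal(size=N)

theta = np.zeros(D)
lr = 1e-3
for _ in range(5_000):
    grad = -2 * (Phi.T @ y - Phi.T @ Phi @ theta)  # gradient of L
    theta -= lr * grad

# Matches the solution of Phi^T Phi theta = Phi^T y.
assert np.allclose(theta, np.linalg.solve(Phi.T @ Phi, Phi.T @ y),
                   atol=1e-5)
\end{verbatim}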
\section{Gradient of vector-valued multivariate functions}
\begin{description}
\item[Vector-valued function]
Function $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$ with $n \geq 1$ and $m > 1$.
Given $\vec{x} \in \mathbb{R}^n$, the output can be represented as:
\[
\vec{f}(\vec{x}) =
\begin{pmatrix}
f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})
\end{pmatrix} \in \mathbb{R}^m
\]
where $f_i: \mathbb{R}^n \rightarrow \mathbb{R}$.
\item[Jacobian] \marginnote{Jacobian matrix}
Given $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$, the Jacobian matrix $\matr{J} \in \mathbb{R}^{m \times n}$
contains the first-order derivatives of $\vec{f}$:
\[
\matr{J} = \nabla\vec{f}(\vec{x}) =
\begin{pmatrix}
\frac{\partial \vec{f}(\vec{x})}{\partial x_1} & \dots & \frac{\partial \vec{f}(\vec{x})}{\partial x_n}
\end{pmatrix} =
\begin{pmatrix}
\frac{\partial f_1(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_1(\vec{x})}{\partial x_n} \\
\vdots & \ddots & \vdots \\
\frac{\partial f_m(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_m(\vec{x})}{\partial x_n} \\
\end{pmatrix}
\]
In other words, $J_{i,j} = \frac{\partial f_i}{\partial x_j}$.
Note that the Jacobian matrix is a generalization of the gradient in the real-valued case.
\end{description}
\section{Backpropagation}
\marginnote{Backpropagation}
Backpropagation is used to tune the parameters of a neural network.
A neural network can be seen as a composition of many functions:
\[ \vec{y} = (\vec{f}_K \circ \vec{f}_{K-1} \circ \dots \circ \vec{f}_1)(\vec{x}) = \vec{f}_K(\vec{f}_{K-1}(\cdots \vec{f}_1(\vec{x}) \cdots)) \]
Each $\vec{f}_i$ takes as input the output of the previous layer $\vec{x}_{i-1}$ and has the form:
\[ \vec{f}_i(\vec{x}_{i-1}) = \sigma_i(\matr{A}_{i-1}\vec{x}_{i-1} + \vec{b}_{i-1}) \]
where $\sigma_i$ is an activation function\footnote{\url{https://en.wikipedia.org/wiki/Activation_function}} (a function to add nonlinearity),
while $\matr{A}_{i-1}$ (linear mapping) and $\vec{b}_{i-1}$ (biases) are the parameters of $\vec{f}_i$.
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_forward_pass.pdf}
\caption{Forward pass}
\end{figure}
We can more compactly denote a neural network with input $\vec{x}$ and $K$ layers as:
\[
\begin{split}
\vec{f}_0 &= \vec{x} \\
\vec{f}_i &= \sigma_i(\matr{A}_{i-1} \vec{f}_{i-1} + \vec{b}_{i-1}) \text{ } i=1, \dots, K
\end{split}
\]
Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimize the squared loss:
\[ L(\vec{\uptheta}) = \Vert \vec{y} - \vec{f}_K(\vec{\uptheta}, \vec{x}) \Vert^2 \]
where $\vec{\uptheta} = \{ \matr{A}_{0}, \vec{b}_{0}, \dots, \matr{A}_{K-1}, \vec{b}_{K-1} \}$ are the parameters of each layer.
This can be done by using the chain rule to compute the partial derivatives of $L$ with respect to the parameters $\vec{\uptheta}_j = \{ \matr{A}_j, \vec{b}_j \}$:
\[
\begin{split}
\frac{\partial L}{\partial \vec{\uptheta}_{K-1}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{\uptheta}_{K-1}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-2}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \frac{\partial \vec{f}_{K-1}}{\partial \vec{\uptheta}_{K-2}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-3}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{K-1}}{\partial \vec{f}_{K-2}} \frac{\partial \vec{f}_{K-2}}{\partial \vec{\uptheta}_{K-3}}}^{\mathclap{\text{New}}} \\
\vdots \\
\frac{\partial L}{\partial \vec{\uptheta}_{i}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \dots}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{i+2}}{\partial \vec{f}_{i+1}} \frac{\partial \vec{f}_{i+1}}{\partial \vec{\uptheta}_{i}}}^{\mathclap{\text{New}}}
\end{split}
\]
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_backward_pass.pdf}
\caption{Backward pass}
\end{figure}
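A minimal backward pass for a two-layer network with a sigmoid activation (shapes and data are illustrative):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

rng = np.random.default_rng(0)
x, y = rng.normal(size=3), rng.normal(size=2)
A0, b0 = rng.normal(size=(4, 3)), np.zeros(4)
A1, b1 = rng.normal(size=(2, 4)), np.zeros(2)

# Forward pass (K = 2).
h = sigmoid(A0 @ x + b0)
y_hat = A1 @ h + b1
loss = np.sum((y - y_hat) ** 2)

# Backward pass: reuse the factors shared across layers.
dL_dyhat = -2 * (y - y_hat)
dL_dA1, dL_db1 = np.outer(dL_dyhat, h), dL_dyhat
dL_dh = A1.T @ dL_dyhat
dL_dz0 = dL_dh * h * (1 - h)      # sigmoid'(z) = s(z)(1 - s(z))
dL_dA0, dL_db0 = np.outer(dL_dz0, x), dL_dz0
\end{verbatim}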
\section{Automatic differentiation}
It is recommended to read the example at the end of this section first.\\
\marginnote{Automatic differentiation}
Automatic differentiation makes it possible to numerically compute
the gradient of complex functions by combining elementary functions, intermediate variables and the chain rule through a computation graph.
When the gradient has many components, it also allows computing it more efficiently.
Let $f$ be a function,
$x_1, \dots, x_d$ the input variables of $f$,
$x_{d+1}, \dots, x_{D-1}$ the intermediate variables and
$x_D$ the output variable.
The computation graph can be expressed as:
\[
\forall i \in \{ d+1, \dots, D \}: x_i = g_i(x_{\text{Pa}(x_i)})
\]
where $g_i$ are elementary functions and $x_{\text{Pa}(x_i)}$ are the parent nodes of $x_i$ in the graph.
In other words, each intermediate variable is expressed as an elementary function of its preceding nodes.
The derivatives of $f$ can then be computed step-by-step going backward as:
\[ \frac{\partial f}{\partial x_D} = 1 \text{, as by definition } f = x_D \]
\[
\frac{\partial f}{\partial x_i} = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial x_c}{\partial x_i}
= \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial g_c}{\partial x_i}
\]
where $\text{Pa}(x_c)$ is the set of parent nodes of $x_c$ in the graph.
In other words, to compute the partial derivative of $f$ w.r.t. $x_i$,
we apply the chain rule by computing
the partial derivative of $f$ w.r.t. the variables following $x_i$ in the graph (as the computation goes backward).
Automatic differentiation is applicable to any function that can be expressed as a computation graph
whose elementary functions are differentiable.
Note that backpropagation is a special case of automatic differentiation.
\begin{example}
Given the function:
\[ f(x) = \sqrt{x^2 + \exp(x^2)} + \cos(x^2 + \exp(x^2)) \]
and the elementary functions $\{ (\cdot)^2, \exp(\cdot), +, \sqrt{\cdot}, \cos(\cdot) \}$,
$f$ can be decomposed in the following intermediate variables:\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
a &= x^2 \\
b &= \exp(a) \\
c &= a + b \\
d &= \sqrt{c} \\
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
e &= \cos(c) \\
f &= d + e \\
\end{split}
\]
\end{minipage}\\
Which corresponds to the following computation graph:
\begin{center}
\includegraphics[width=0.75\textwidth]{img/auto_diff.png}
\end{center}
We can then compute the derivatives of the intermediate variables w.r.t. their inputs (i.e. inbound edges):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial a}{\partial x} &= 2x \\
\frac{\partial b}{\partial a} &= \exp(a) \\
\frac{\partial c}{\partial a} &= 1 \\
\frac{\partial c}{\partial b} &= 1
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial d}{\partial c} &= \frac{1}{2\sqrt{c}} \\
\frac{\partial e}{\partial c} &= -\sin(c) \\
\frac{\partial f}{\partial d} &= 1 \\
\frac{\partial f}{\partial e} &= 1
\end{split}
\]
\end{minipage}\\
Finally, we can compute $\frac{\partial f}{\partial x}$ by going backward from the output ($f$) to the input ($x$):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial d} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial e} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial c} &=
\frac{\partial f}{\partial d}\frac{\partial d}{\partial c} + \frac{\partial f}{\partial e}\frac{\partial e}{\partial c} \\
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c}\frac{\partial c}{\partial b} \\
\frac{\partial f}{\partial a} &=
\frac{\partial f}{\partial b}\frac{\partial b}{\partial a} + \frac{\partial f}{\partial c}\frac{\partial c}{\partial a} \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a}\frac{\partial a}{\partial x}
\end{split}
\]
\end{minipage}\\
In other words, to compute the partial derivative of $f$ w.r.t. a variable $x_i$,
all variables that follow $x_i$ in the graph are considered.
Now, by substituting we obtain:
\[
\begin{split}
\frac{\partial f}{\partial c} &= 1 \cdot \frac{1}{2\sqrt{c}} + 1 \cdot (-\sin(c)) \\
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial a} &= \frac{\partial f}{\partial b} \cdot \exp(a) + \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a} \cdot 2x
\end{split}
\]
\end{example}
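The example above translates directly into code. A sketch of the forward and backward passes, checked against central differences:
\begin{verbatim}
import numpy as np

def f_and_grad(x):
    # Forward pass: the intermediate variables of the graph.
    a = x ** 2
    b = np.exp(a)
    c = a + b
    d = np.sqrt(c)
    e = np.cos(c)
    f = d + e
    # Backward pass: sum the contributions of each node's children.
    df_dd, df_de = 1.0, 1.0
    df_dc = df_dd / (2 * np.sqrt(c)) + df_de * (-np.sin(c))
    df_db = df_dc * 1.0
    df_da = df_db * np.exp(a) + df_dc * 1.0
    return f, df_da * 2 * x

val, grad = f_and_grad(1.5)
h = 1e-6
fd = (f_and_grad(1.5 + h)[0] - f_and_grad(1.5 - h)[0]) / (2 * h)
assert np.isclose(grad, fd, atol=1e-4)
\end{verbatim}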

View File

@ -0,0 +1,21 @@
\documentclass[11pt]{ainotes}
\title{Statistical and Mathematical Methods for Artificial Intelligence}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
\begin{document}
\makenotesfront
\input{sections/_finite_numbers.tex}
\input{sections/_linear_algebra.tex}
\input{sections/_linear_systems.tex}
\input{sections/_matrix_decomp.tex}
\input{sections/_vector_calculus.tex}
\input{sections/_gradient_methods.tex}
\input{sections/_probability.tex}
\input{sections/_machine_learning.tex}
\eoc
\end{document}