Moved SMM in year1

2023-12-27 17:49:28 +01:00
parent c98859ed9e
commit 3dc77a448a
33 changed files with 1 addition and 1 deletion

@@ -0,0 +1 @@
../../ainotes.cls


@@ -0,0 +1,88 @@
<mxfile host="app.diagrams.net" modified="2023-09-22T09:37:27.395Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0" etag="3qzh6VvLSaXopiRghqnY" version="21.7.0" type="device">
<diagram name="Pagina-1" id="mETDQKEhh33VIil_YAIY">
<mxGraphModel dx="819" dy="401" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="AFoxFzemWGuV3oYDkwgm-1" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="200" y="300" width="150" height="150" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-2" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="280" y="340" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-3" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="280" y="400" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-4" value="U&amp;nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="240" y="330" width="40" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-5" value="&lt;div align=&quot;right&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;U&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;+ΔU&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="230" y="390" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-6" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
<mxGeometry x="420" y="300" width="150" height="150" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-7" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="480" y="340" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-8" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
<mxGeometry x="480" y="400" width="10" height="10" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-9" value="&amp;nbsp;V" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="490" y="330" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-10" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;V&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;+ΔV&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="490" y="390" width="50" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-11" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-2" target="AFoxFzemWGuV3oYDkwgm-7" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="410" y="420" as="sourcePoint" />
<mxPoint x="460" y="370" as="targetPoint" />
<Array as="points">
<mxPoint x="390" y="310" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-12" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="300" y="355" as="sourcePoint" />
<mxPoint x="530" y="355" as="targetPoint" />
<Array as="points">
<mxPoint x="390" y="360" />
</Array>
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-13" value="&lt;font face=&quot;Times New Roman&quot; size=&quot;1&quot;&gt;&lt;i&gt;&lt;font style=&quot;font-size: 15px;&quot;&gt;f&lt;/font&gt;&lt;/i&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="330" y="290" width="110" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-15" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-7" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="450" y="380" as="sourcePoint" />
<mxPoint x="500" y="330" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-16" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;exitX=0.5;exitY=0;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-2" edge="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="270" y="410" as="sourcePoint" />
<mxPoint x="320" y="360" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-17" value="&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;Δ&lt;/font&gt;U&amp;nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
<mxGeometry x="240" y="360" width="40" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-18" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;&amp;nbsp;&lt;/font&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;ΔV&amp;nbsp; &lt;br&gt;&lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="490" y="360" width="30" height="30" as="geometry" />
</mxCell>
<mxCell id="AFoxFzemWGuV3oYDkwgm-19" value="&lt;div align=&quot;left&quot;&gt;&lt;font style=&quot;font-size: 15px;&quot; face=&quot;Times New Roman&quot;&gt;Inherent error &lt;/font&gt;&lt;/div&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
<mxGeometry x="580" y="355" width="90" height="40" as="geometry" />
</mxCell>
<mxCell id="x--qwbr77Wqyja1BnvlK-2" value="" style="endArrow=classic;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;exitX=0;exitY=0.5;exitDx=0;exitDy=0;strokeWidth=2;" edge="1" parent="1" source="AFoxFzemWGuV3oYDkwgm-19" target="AFoxFzemWGuV3oYDkwgm-18">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="600" y="375" as="sourcePoint" />
<mxPoint x="450" y="370" as="targetPoint" />
</mxGeometry>
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>


@@ -0,0 +1,11 @@
{
"name": "Statistical and Mathematical Methods for Artificial Intelligence",
"year": 1,
"semester": 1,
"pdfs": [
{
"name": null,
"path": "smm.pdf"
}
]
}

@@ -0,0 +1,208 @@
\chapter{Finite numbers}
\section{Sources of error}
\begin{description}
\item[Measurement error] \marginnote{Measurement error}
Limited precision of the measuring instrument.
\item[Arithmetic error] \marginnote{Arithmetic error}
Propagation of rounding errors in each step of an algorithm.
\item[Truncation error] \marginnote{Truncation error}
Approximation of an infinite procedure with a finite number of iterations.
\item[Inherent error] \marginnote{Inherent error}
Caused by the finite representation of the data (floating-point).
\begin{figure}[h]
\centering
\includegraphics[width=0.6\textwidth]{img/_inherent_error.pdf}
\caption{Inherent error visualization}
\end{figure}
\end{description}
\section{Error measurement}
Let $x$ be a value and $\hat{x}$ its approximation. Then:
\begin{descriptionlist}
\item[Absolute error]
\[
E_{a} = \hat{x} - x
\marginnote{Absolute error}
\]
Note that, without knowing the magnitude of $x$, the absolute error is not informative.
\item[Relative error]
\[
E_{r} = \frac{\hat{x} - x}{x}
\marginnote{Relative error}
\]
\end{descriptionlist}
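As a quick sanity check, both errors can be computed directly (a Python sketch; the values are illustrative):
\begin{lstlisting}
x     = 12.333333  # true value
x_hat = 12.333     # approximation
E_a = x_hat - x         # absolute error
E_r = (x_hat - x) / x   # relative error
print(E_a, E_r)         # ~-3.3e-4, ~-2.7e-5
\end{lstlisting}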
\section{Representation in base \texorpdfstring{$\beta$}{B}}
Let $\beta \in \mathbb{N}_{> 1}$ be the base.
Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
\begin{equation}
\label{eq:finnum_b_representation}
x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots + d_n\beta^{-n})\beta^p
\end{equation}
where:
\begin{itemize}
\item $0 \leq d_i \leq \beta-1$
\item $d_1 \neq 0$
\item starting from an index $i$, not all $d_j$ ($j \geq i$) are equal to $\beta-1$
\end{itemize}
%
\Cref{eq:finnum_b_representation} can be represented using the normalized scientific notation as: \marginnote{Normalized scientific notation}
\[
x = \pm (0.d_1d_2\dots) \beta^p
\]
where $0.d_1d_2\dots$ is the \textbf{mantissa} and $p$ the \textbf{exponent}. \marginnote{Mantissa\\Exponent}
\section{Floating-point}
A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the parameters: \marginnote{Floating-point}
\begin{itemize}
\item $\beta$: base
\item $t$: precision (number of digits in the mantissa)
\item $[L, U]$: range of the exponent
\end{itemize}
Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
\begin{eqnarray}
x = \pm (0.d_1d_2 \dots d_t) \beta^p & L \leq p \leq U
\end{eqnarray}
We denote with $\texttt{fl}(x)$ the representation of $x \in \mathbb{R}$ in a given floating-point system.
\begin{example}
In $\mathcal{F}(10, 5, -3, 3)$, $x=12.\bar{3}$ is represented as:
\begin{equation*}
\texttt{fl}(x) = + 0.12333 \cdot 10^2
\end{equation*}
\end{example}
\subsection{Numbers distribution}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the total amount of representable numbers is:
\begin{equation*}
2(\beta-1) \beta^{t-1} (U-L+1)+1
\end{equation*}
%
Representable numbers are more sparse towards the exponent upper bound and more dense towards the lower bound.
It must be noted that there is an underflow area around 0.
\begin{figure}[h]
\centering
\includegraphics[width=0.8\textwidth]{img/floatingpoint_range.png}
\caption{Floating-point numbers in $\mathcal{F}(2, 3, -1, 2)$}
\end{figure}
\subsection{Number representation}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
\begin{descriptionlist}
\item[Exact representation]
if $p \in [L, U]$ and $d_i=0$ for $i>t$.
\item[Approximation] \marginnote{Truncation\\Rounding}
if $p \in [L, U]$ but $d_i$ may not be 0 for $i>t$.
In this case, the representation is obtained by truncating or rounding the value.
\item[Underflow] \marginnote{Underflow}
if $p < L$. In this case, the value is approximated to 0.
\item[Overflow] \marginnote{Overflow}
if $p > U$. In this case, an exception is usually raised.
\end{descriptionlist}
\subsection{Machine precision}
Machine precision $\varepsilon_{\text{mach}}$ determines the accuracy of a floating-point system. \marginnote{Machine precision}
Depending on the approximation approach, machine precision can be computed as:
\begin{descriptionlist}
\item[Truncation] $\varepsilon_{\text{mach}} = \beta^{1-t}$
\item[Rounding] $\varepsilon_{\text{mach}} = \frac{1}{2}\beta^{1-t}$
\end{descriptionlist}
Therefore, rounding results in more accurate representations.
With truncation, $\varepsilon_{\text{mach}}$ equals the gap between $1$ and the next representable number (\Cref{fig:finnum_eps}).
\begin{figure}[h]
\centering
\includegraphics[width=0.2\textwidth]{img/machine_eps.png}
\caption{Visualization of $\varepsilon_{\text{mach}}$ in $\mathcal{F}(2, 3, -1, 2)$}
\label{fig:finnum_eps}
\end{figure}\\
%
Alternatively, $\varepsilon_{\text{mach}}$ can be defined as the smallest positive number such that:
\begin{equation*}
\texttt{fl}(1 + \varepsilon_{\text{mach}}) > 1.
\end{equation*}
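This last characterization suggests a simple way to estimate $\varepsilon_{\text{mach}}$ empirically. The following Python sketch (assuming IEEE double precision, i.e. $\beta = 2$, $t = 53$) halves a candidate until adding it to 1 no longer changes the result:
\begin{lstlisting}
eps = 1.0
while 1.0 + eps / 2 > 1.0:  # fl(1 + eps/2) still differs from 1
    eps = eps / 2
print(eps)  # 2.220446049250313e-16, i.e. 2**(-52)
\end{lstlisting}
Note that the loop returns $\beta^{1-t} = 2^{-52}$, the gap between $1$ and the next representable number; with rounding, values down to half this gap still satisfy $\texttt{fl}(1 + \varepsilon) > 1$.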
\subsection{IEEE standard}
IEEE 754 defines two floating-point formats:
\begin{descriptionlist}
\item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{\texttt{float32}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}
\hline
1 (sign) & 8 (exponent) & 23 (mantissa) \\
\hline
\end{tabular}
\end{center}
\item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{\texttt{float64}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}
\hline
1 (sign) & 11 (exponent) & 52 (mantissa) \\
\hline
\end{tabular}
\end{center}
\end{descriptionlist}
As the first digit of the mantissa is always 1, it does not need to be stored.
Moreover, special configurations are reserved to represent \texttt{Inf} and \texttt{NaN}.
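These formats can be inspected programmatically; for instance, with NumPy (\texttt{numpy.finfo} exposes the stored mantissa bits, the machine epsilon, and the exponent range):
\begin{lstlisting}
import numpy as np

for dtype in (np.float32, np.float64):
    info = np.finfo(dtype)
    # stored mantissa bits (t-1), machine epsilon, exponent range
    print(dtype.__name__, info.nmant, info.eps, info.minexp, info.maxexp)
\end{lstlisting}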
\subsection{Floating-point arithmetic}
Let:
\begin{itemize}
\item $+: \mathbb{R} \times \mathbb{R} \rightarrow \mathbb{R}$ be a real numbers operation.
\item $\oplus: \mathcal{F} \times \mathcal{F} \rightarrow \mathcal{F}$ be the corresponding operation in a floating-point system.
\end{itemize}
%
To compute $x \oplus y$, a machine:
\begin{enumerate}
\item Calculates $x + y$ in a high precision register
(still approximated, but more precise than the floating-point system used to store the result)
\item Stores the result as $\texttt{fl}(x + y)$
\end{enumerate}
A floating-point operation causes a small rounding error:
\[
\left\vert \frac{(x \oplus y) - (x + y)}{x+y} \right\vert < \varepsilon_{\text{mach}}
\]
%
However, some operations may be subject to the \textbf{cancellation} problem which causes information loss.
\marginnote{Cancellation}
\begin{example}
Given $x = 1$ and $y = 1 \cdot 10^{-17}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, L, U)$.
It is assumed that $U$ and $L$ are sufficient for this example.
\begin{equation*}
\begin{split}
z & = \texttt{fl}(x) + \texttt{fl}(y) \\
& = 0.1 \cdot 10^1 + 0.1 \cdot 10^{-16} \\
& = (0.1 + 0.\overbrace{0\dots0}^{\mathclap{17\text{ zeros}}}1) \cdot 10^1 \\
& = 0.1\overbrace{0\dots0}^{\mathclap{16\text{ zeros}}}1 \cdot 10^1
\end{split}
\end{equation*}
Then, we have that $\texttt{fl}(z) = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}} \cdot 10^1 = 1 = x$.
\end{example}
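The same cancellation can be reproduced in IEEE double precision, where $10^{-17}$ falls below half the gap between $1$ and the next representable number (a Python sketch):
\begin{lstlisting}
x, y = 1.0, 1e-17
print(x + y == x)   # True: y is lost when the sum is rounded
print((x + y) - x)  # 0.0 instead of 1e-17
\end{lstlisting}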

@@ -0,0 +1,342 @@
\chapter{Gradient methods}
\section{Minimum of a function}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
\begin{descriptionlist}
\item[Stationary point] \marginnote{Stationary point}
$\vec{x}^*$ is a stationary point of $f$ iff:
\[ \nabla f(\vec{x}^*) = \nullvec \]
\item[Local minimum] \marginnote{Local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff:
\[ \exists \varepsilon > 0 \text{ s.t. }
f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
\item[Strict local minimum] \marginnote{Strict local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff:
\[ \exists \varepsilon > 0 \text{ s.t. }
f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
\item[Global minimum] \marginnote{Global minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff:
\[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
\item[Strict global minimum] \marginnote{Strict global minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a strict global minimum of $f$ iff:
\[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
\end{descriptionlist}
Note that maximization can be reduced to minimization: $\max f(\vec{x}) = -\min \{ -f(\vec{x}) \}$.
\subsection{Optimality conditions}
\begin{description}
\item[First-order condition] \marginnote{First-order condition}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
\[ \text{If } \vec{x}^* \text{ local minimum of } f \Rightarrow \nabla f(\vec{x}^*) = \nullvec \]
\item[Second-order condition] \marginnote{Second-order condition}
Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and twice differentiable.
\[
\text{If } \nabla f(\vec{x}^*) = \nullvec \text{ and } \nabla^2 f(\vec{x}^*) \text{ positive definite} \Rightarrow
\vec{x}^* \text{ strict local minimum of } f
\]
\end{description}
As the second-order condition requires computing the Hessian matrix, which is expensive, in practice only the first-order condition is checked.
\section{Descent methods}
\marginnote{Descent methods}
Descent methods are iterative methods that have the property:
\[ f(\vec{x}_k) < f(\vec{x}_{k-1}) \]
The iteration is defined as:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
where $\vec{p}_{k-1} \in \mathbb{R}^N$ is the search direction and \marginnote{Search direction\\Step length}
$\alpha_{k-1} \in \mathbb{R}$ is the step length.
Note: descent methods usually converge to a local minimum.
\begin{figure}
\centering
\includegraphics[width=0.5\linewidth]{img/_gradient_contour.pdf}
\caption{Descent method steps in $\mathbb{R}^2$ (i.e. moving across contour lines)}
\end{figure}
\subsection{Choice of the search direction}
\begin{description}
\item[Descent direction] \marginnote{Descent direction}
$\vec{p} \in \mathbb{R}^N$ is a descent direction of $f$ in $\vec{x}$ if:
\[ \exists \bar{\alpha} > 0, \forall \alpha \in [0, \bar{\alpha}]: f(\vec{x} + \alpha \vec{p}) < f(\vec{x}) \]
\end{description}
\begin{theorem}
Let $\vec{p} \in \mathbb{R}^N$, $\vec{p} \neq \nullvec$.
\[ \text{If } \vec{p}^T \nabla f(\vec{x}) < 0 \Rightarrow \vec{p} \text{ descent direction of } f \text{ in } \vec{x} \]
\end{theorem}
\begin{theorem}
For all $\vec{x}$, $\vec{p} = -\nabla f(\vec{x})$ is a descent direction of $f$ in $\vec{x}$.
\end{theorem}
\begin{proof}
\[
\begin{split}
\vec{p}^T \nabla f(\vec{x}) < 0 &\iff -(\nabla f(\vec{x}))^T \nabla f(\vec{x}) < 0 \\
&\iff - \Vert \nabla f(\vec{x}) \Vert_2^2 < 0
\end{split}
\]
This holds as the norm is always positive.
\end{proof}
\begin{description}
\item[Gradient-like methods] \marginnote{Gradient-like methods}
Gradient-like methods are descent methods that use $-\nabla f$ as search direction.
\end{description}
\subsection{Choice of the step length}
\begin{description}
\item[Constant]
In machine learning, it is common to set a constant value for the step (learning rate),
but it can be proved that this does not guarantee convergence.
\item[Backtracking procedure] \marginnote{Backtracking procedure}
$\alpha_k$ is chosen such that it satisfies the Armijo (sufficient decrease) condition, the first of the Wolfe conditions\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
\begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
def backtracking($\tau$, $c_1$):
    $\alpha_k$ = 1  # Initial guess
    # Shrink the step (with $\tau > 1$) until moving along $-\nabla f$
    # yields a sufficient decrease of $f$
    while $f(\vec{x}_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ $-$ $c_1 \alpha_k \nabla f(\vec{x}_k)^T \nabla f(\vec{x}_k)$:
        $\alpha_k$ = $\alpha_k$ / $\tau$
    return $\alpha_k$
\end{lstlisting}
It can be proved that, by using the backtracking procedure, gradient methods converge to a local minimum.
\end{description}
\subsection{Stopping condition}
\marginnote{Stopping condition}
We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, when $\nabla f(\vec{x}_k) \approx \nullvec$.
We can verify this by checking the norm of the gradient against a tolerance $\tau$:
\begin{descriptionlist}
\item[Absolute condition] $\Vert \nabla f(x_k) \Vert_2 < \tau$
\item[Relative condition] $\frac{\Vert \nabla f(x_k) \Vert_2}{\Vert \nabla f(x_0) \Vert_2} < \tau$
\end{descriptionlist}
A generic gradient-like method can then be defined as:
\begin{lstlisting}[mathescape=true]
def gradientMethod($f$, $\vec{x}_0$):
    $k$ = 0
    while not stoppingCondition($f$, $\vec{x}_k$, $\vec{x}_0$):
        $\vec{p}_k$ = $-\nabla f(\vec{x}_k)$  # Search direction
        $\alpha_k$ = backtracking($\dots$)    # Step length
        $\vec{x}_{k+1}$ = $\vec{x}_k$ + $\alpha_k \vec{p}_k$
        $k$ = $k$ + 1
    return $\vec{x}_k$
\end{lstlisting}
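A concrete NumPy version of the two procedures above may look as follows (a sketch: the quadratic objective and all constants are illustrative):
\begin{lstlisting}
import numpy as np

def backtracking(f, grad_f, x, tau=2.0, c1=1e-4):
    # Shrink alpha until the Armijo sufficient-decrease condition holds
    alpha, g = 1.0, grad_f(x)
    while f(x - alpha * g) > f(x) - c1 * alpha * np.dot(g, g):
        alpha = alpha / tau
    return alpha

def gradient_method(f, grad_f, x0, tol=1e-6, max_iter=1000):
    x = x0
    for _ in range(max_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tol:  # absolute stopping condition
            break
        alpha = backtracking(f, grad_f, x)
        x = x - alpha * g            # step along -grad f
    return x

# Example: minimize f(x) = ||x||^2, minimum at the origin
f = lambda x: np.dot(x, x)
grad_f = lambda x: 2 * x
print(gradient_method(f, grad_f, np.array([3.0, -4.0])))
\end{lstlisting}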
\subsection{Problems}
\begin{description}
\item[Choice of the initialization point] \marginnote{Initialization point}
The starting point of an iterative method is a user-defined parameter.
For simple problems, it is usually chosen randomly in $[-1, +1]$.
For complex problems, the choice of the initialization point is critical as
it may cause numerical instabilities or bad results.
Heuristics can be used to select an adequate starting point.
\item[Flat regions and local optima] \marginnote{Flat regions and local optima}
Flat regions slow down the learning speed,
while a local optimum causes the method to converge to a poor solution.
\begin{figure}[ht]
\centering
\includegraphics[width=0.9\textwidth]{img/_descent_local_flat.pdf}
\caption{Flat regions and local minima}
\end{figure}
\item[Differential curvature]
Different magnitudes of the partial derivatives may cause the problem of
vanishing and exploding gradient. \marginnote{Vanishing gradient\\Exploding gradient}
This causes the learning process to require more iterations to adjust the direction.
In practice, as the gradient of complex functions is only an instantaneous direction of best decrease and
does not represent the direction to the minimum in the long term,
many updates are required for a gradient method to converge.
A method to mitigate this issue is to use feature normalization techniques.
\item[Non-differentiable objective function]
If the objective function has a small number of non-differentiable points,
the gradient descent method can be applied with minor modifications.
If many points are non-differentiable, the gradients will not be informative enough
to determine a decrease direction.
\item[Difficult topologies]
\marginnote{Cliff}
A cliff in the objective function causes problems when evaluating the gradient at the edge.
With a small step size, there is a slowdown in convergence.
With a large step size, there is an overshoot that may cause the algorithm to diverge.
\marginnote{Valley}
A valley in the objective function causes a gradient method to bounce between its sides,
reaching a point where no significant progress can be made.
\begin{figure}[ht]
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.30\linewidth]{img/cliff.png}
\caption{Cliff region}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.30\linewidth]{img/valley.png}
\caption{Ping pong tournament in a valley}
\end{subfigure}
\end{figure}
\end{description}
\section{Convex functions}
\begin{description}
\item[Convex set] \marginnote{Convex set}
Informally, a set is convex if, for any two points of the set,
the points lying on the segment connecting them are also part of the set.
\begin{figure}[ht]
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.25\linewidth]{img/convex_set.png}
\caption{Convex set}
\end{subfigure}%
\begin{subfigure}{.5\textwidth}
\centering
\includegraphics[width=.25\linewidth]{img/non_convex_set.png}
\caption{Non-convex set}
\end{subfigure}
\end{figure}
\item[Convex function] \marginnote{Convex function}
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
$f$ is convex if:
\[
\forall \vec{x}_1, \vec{x}_2 \in \Omega, \forall t \in [0, 1]:
f(t\vec{x}_1 + (1-t)\vec{x}_2) \leq t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
\]
In other words, the segment connecting two points of the function lies above the graph.
\begin{figure}[ht]
\centering
\includegraphics[width=0.55\textwidth]{img/convex_function.png}
\caption{Convex function}
\end{figure}
\item[Strictly convex function] \marginnote{Strictly convex function}
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
$f$ is strictly convex if:
\[
\forall \vec{x}_1, \vec{x}_2 \in \Omega \text{ with } \vec{x}_1 \neq \vec{x}_2, \forall t \in (0, 1):
f(t\vec{x}_1 + (1-t)\vec{x}_2) < t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
\]
\end{description}
\subsection{Properties}
% \marginnote{Convex properties}
\begin{itemize}
\item $\text{if } f \text{ convex} \Rightarrow \text{any local minimum of } f \text{ is also global}$
\item $\text{if } f \text{ strictly convex} \Rightarrow \text{the global minimum of } f \text{ is unique}$
\item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{any stationary point of } f \text{ is a global minimum}$
\end{itemize}
\subsection{Quadratic functions}
\marginnote{Quadratic function}
A quadratic function has the form:
\[ f(\vec{x}) = \frac{1}{2}\vec{x}^T\matr{A}\vec{x} - \vec{x}^T\vec{b} + c \]
where $\matr{A} \in \mathbb{R}^{n \times n}$, $\vec{b} \in \mathbb{R}^n$ and $c \in \mathbb{R}$.
\begin{theorem}
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive semidefinite,
then $f$ is convex.
\end{theorem}
\begin{theorem}
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive definite,
then $f$ is strictly convex.
\end{theorem}
\begin{theorem}
\marginnote{Least squares quadratic function}
The least squares problem $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is a quadratic function.
\end{theorem}
\begin{proof}
\[
\begin{split}
(\matr{A}\vec{x} - \vec{b})^T(\matr{A}\vec{x} - \vec{b}) &= (\vec{x}^T\matr{A}^T - \vec{b}^T)(\matr{A}\vec{x} - \vec{b}) \\
&= \vec{x}^T\matr{A}^T\matr{A}\vec{x} - \vec{b}^T\matr{A}\vec{x} - \vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \\
\end{split}
\]
As $\vec{b}^T\matr{A}\vec{x} = \vec{x}^T\matr{A}^T\vec{b}$, we have:
\[ \vec{x}^T\matr{A}^T\matr{A}\vec{x} - 2\vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \]
Let $\matr{B} = \matr{A}^T\matr{A}$, $\vec{q} = \matr{A}^T\vec{b}$ and $c = \vec{b}^T\vec{b}$,
we have the quadratic form:
\[ \vec{x}^T\matr{B}\vec{x} - 2\vec{x}^T\vec{q} + c \]
$\matr{B}$ is symmetric positive semidefinite (i.e. $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is convex).
Moreover, when $\matr{A}$ is full-rank, $\matr{B}$ is symmetric positive definite (i.e. strictly convex).
\end{proof}
\section{Gradient descent with momentum}
\marginnote{Momentum}
The momentum is an additional term to keep track of previous iterations:
\[
\Delta \vec{x}_k = \vec{x}_k - \vec{x}_{k-1} = \gamma \Delta \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1})
\]
where $\gamma \in [0, 1]$. An iteration is therefore defined as:
\[
\vec{x}_k = \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1}) + \gamma \Delta\vec{x}_{k-1}
\]
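In code, the only change w.r.t. plain gradient descent is keeping track of the previous increment (a sketch; $\alpha$ and $\gamma$ are illustrative constants):
\begin{lstlisting}
import numpy as np

def gradient_descent_momentum(grad_f, x0, alpha=0.1, gamma=0.9, n_iter=100):
    x, delta = x0, np.zeros_like(x0)
    for _ in range(n_iter):
        delta = gamma * delta - alpha * grad_f(x)  # momentum update
        x = x + delta
    return x
\end{lstlisting}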
\section{Stochastic gradient descent (SGD)}
\marginnote{Stochastic gradient descent}
SGD is a variant of gradient descent that replaces the exact gradient with a cheaper stochastic approximation.
Given $N$ data points, the loss can be defined as the sum of the individual losses:
\[ L(\vec{x}) = \sum_{n=1}^{N} L_n(\vec{x}) \]
where $\vec{x}$ is the vector of parameters.
The corresponding gradient can be computed as:
\[ \nabla L(\vec{x}) = \sum_{n=1}^{N} \nabla L_n(\vec{x}) \]
\marginnote{Mini-batch}
SGD reduces the amount of computation by approximating the gradient using only a subset (mini-batch) $B \subseteq \{1, \dots, N\}$ of the terms:
\[ \nabla L(\vec{x}) \approx \sum_{i \in B} \nabla L_i(\vec{x}) \]
\begin{theorem}
Under some assumptions and with an appropriate decrease in learning rate,
SGD is guaranteed to converge to a local minimum.
\end{theorem}
Different sizes of the mini-batch result in different behavior:
\begin{descriptionlist}
\item[Large mini-batches] accurate estimates of the gradient.
\item[Small mini-batches] faster computation.
\end{descriptionlist}
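A minimal sketch of mini-batch SGD (the interface of \texttt{grad\_Li} and all hyperparameters are illustrative):
\begin{lstlisting}
import numpy as np

def sgd(grad_Li, x0, N, alpha=0.01, batch_size=32, n_epochs=10):
    # grad_Li(x, i) returns the gradient of the i-th individual loss
    x = x0
    for _ in range(n_epochs):
        idx = np.random.permutation(N)  # reshuffle the dataset
        for start in range(0, N, batch_size):
            B = idx[start:start + batch_size]  # mini-batch indices
            x = x - alpha * sum(grad_Li(x, i) for i in B)
    return x
\end{lstlisting}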

@@ -0,0 +1,344 @@
\chapter{Linear algebra}
\section{Vector space}
A \textbf{vector space} over $\mathbb{R}$ is a nonempty set $V$, whose elements are called vectors, with two operations:
\marginnote{Vector space}
\begin{center}
\begin{tabular}{l c}
Addition & $+ : V \times V \rightarrow V$ \\
Scalar multiplication & $\cdot : \mathbb{R} \times V \rightarrow V$
\end{tabular}
\end{center}
A vector space has the following properties:
\begin{enumerate}
\item Addition is commutative and associative
\item A null vector exists: $\exists \nullvec \in V$ s.t. $\forall \vec{u} \in V: \nullvec + \vec{u} = \vec{u} + \nullvec = \vec{u}$
\item An identity element for scalar multiplication exists: $\forall \vec{u} \in V: 1\vec{u} = \vec{u}$
\item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$.\\
$\vec{a}$ is denoted as $-\vec{u}$.
\item Distributive properties:
\[ \forall \alpha \in \mathbb{R}, \forall \vec{u}, \vec{w} \in V: \alpha(\vec{u} + \vec{w}) = \alpha \vec{u} + \alpha \vec{w} \]
\[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha + \beta)\vec{u} = \alpha \vec{u} + \beta \vec{u} \]
\item Associative property:
\[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha \beta)\vec{u} = \alpha (\beta \vec{u}) \]
\end{enumerate}
%
A subset $U \subseteq V$ of a vector space $V$ is a \textbf{subspace} iff $U$ is a vector space.
\marginnote{Subspace}
\subsection{Basis}
\marginnote{Basis}
Let $V$ be a vector space of dimension $n$.
A basis $\beta = \{ \vec{v}_1, \dots, \vec{v}_n \}$ of $V$ is a set of $n$ linearly independent vectors of $V$.\\
Each element of $V$ can be represented as a linear combination of the vectors in the basis $\beta$:
\[ \forall \vec{w} \in V: \vec{w} = \lambda_1\vec{v}_1 + \dots + \lambda_n\vec{v}_n \text{ where } \lambda_i \in \mathbb{R} \]
%
The canonical basis of a vector space is a basis where each vector represents a dimension $i$ \marginnote{Canonical basis}
(i.e. 1 in position $i$ and 0 in all other positions).
\begin{example}
The canonical basis $\beta$ of $\mathbb{R}^3$ is $\beta = \{ (1, 0, 0), (0, 1, 0), (0, 0, 1) \}$
\end{example}
\subsection{Dot product}
The dot product of two vectors $\vec{x}, \vec{y} \in \mathbb{R}^n$ is defined as: \marginnote{Dot product}
\begin{equation*}
\left\langle \vec{x}, \vec{y} \right\rangle =
\vec{x}^T \vec{y} = \sum_{i=1}^{n} x_i \cdot y_i
\end{equation*}
\section{Matrix}
This is a {\tiny(very formal definition of)} matrix: \marginnote{Matrix}
\begin{equation*}
\matr{A} =
\begin{pmatrix}
a_{11} & a_{12} & \dots & a_{1n} \\
a_{21} & a_{22} & \dots & a_{2n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m1} & a_{m2} & \dots & a_{mn}
\end{pmatrix}
\end{equation*}
\subsection{Invertible matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is invertible (non-singular) if: \marginnote{Non-singular matrix}
\begin{equation*}
\exists \matr{B} \in \mathbb{R}^{n \times n}: \matr{AB} = \matr{BA} = \matr{I}
\end{equation*}
where $\matr{I}$ is the identity matrix. $\matr{B}$ is denoted as $\matr{A}^{-1}$.
\subsection{Kernel}
The null space (kernel) of a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ is a subspace such that: \marginnote{Kernel}
\begin{equation*}
\text{Ker}(\matr{A}) = \{ \vec{x} \in \mathbb{R}^n : \matr{A}\vec{x} = \nullvec \}
\end{equation*}
%
\begin{theorem} \label{th:kernel_invertible}
A square matrix $\matr{A}$ with $\text{\normalfont Ker}(\matr{A}) = \{\nullvec\}$ is non-singular.
\end{theorem}
\subsection{Similar matrices} \marginnote{Similar matrices}
Two matrices $\matr{A}$ and $\matr{D}$ are \textbf{similar} if there exists an invertible matrix $\matr{P}$ such that:
\[ \matr{D} = \matr{P}^{-1} \matr{A} \matr{P} \]
\section{Norms}
\subsection{Vector norms}
The norm of a vector is a function: \marginnote{Vector norm}
\begin{equation*}
\Vert \cdot \Vert: \mathbb{R}^n \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\vec{x}, \vec{y} \in \mathbb{R}^n$:
\begin{itemize}
\item $\Vert \vec{x} \Vert \geq 0$
\item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = \nullvec$
\item $\Vert \lambda \vec{x} \Vert = \vert \lambda \vert \cdot \Vert \vec{x} \Vert$
\item $\Vert \vec{x} + \vec{y} \Vert \leq \Vert \vec{x} \Vert + \Vert \vec{y} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
\item[2-norm] $\Vert \vec{x} \Vert_2 = \sqrt{ \sum_{i=1}^{n} x_i^2 }$
\item[1-norm] $\Vert \vec{x} \Vert_1 = \sum_{i=1}^{n} \vert x_i \vert$
\item[$\infty$-norm] $\Vert \vec{x} \Vert_{\infty} = \max_{1 \leq i \leq n} \vert x_i \vert$
\end{descriptionlist}
%
In general, the different norms of a vector have comparable magnitudes.
However, as the following example shows, comparing two vectors may give different outcomes depending on the chosen norm.
\begin{example}
Let $\vec{x} = (1, 1000)$ and $\vec{y} = (999, 1000)$. Their norms are:
\begin{center}
\begin{tabular}{l l}
$\Vert \vec{x} \Vert_{2} = \sqrt{1000001}$ & $\Vert \vec{y} \Vert_{2} = \sqrt{1998001}$ \\
$\Vert \vec{x} \Vert_{\infty} = 1000$ & $\Vert \vec{y} \Vert_{\infty} = 1000$ \\
\end{tabular}
\end{center}
\end{example}
\subsection{Matrix norms}
The norm of a matrix is a function: \marginnote{Matrix norm}
\begin{equation*}
\Vert \cdot \Vert: \mathbb{R}^{m \times n} \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\matr{A}, \matr{B} \in \mathbb{R}^{m \times n}$:
\begin{itemize}
\item $\Vert \matr{A} \Vert \geq 0$
\item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \matr{0}$
\item $\Vert \lambda \matr{A} \Vert = \vert \lambda \vert \cdot \Vert \matr{A} \Vert$
\item $\Vert \matr{A} + \matr{B} \Vert \leq \Vert \matr{A} \Vert + \Vert \matr{B} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
\item[2-norm]
$\Vert \matr{A} \Vert_2 = \sqrt{ \rho(\matr{A}^T\matr{A}) }$,\\
where $\rho(\matr{X})$ is the largest absolute value of the eigenvalues of $\matr{X}$ (spectral radius).
\item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$ (i.e. max sum of the columns in absolute value)
\item[Frobenius norm] $\Vert \matr{A} \Vert_F = \sqrt{ \sum_{i=1}^{m} \sum_{j=1}^{n} a_{i,j}^2 }$
\end{descriptionlist}
\section{Symmetric, positive definite matrices}
\begin{description}
\item[Symmetric matrix] \marginnote{Symmetric matrix}
A square matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is symmetric $\iff \matr{A} = \matr{A}^T$
\item[Positive semidefinite matrix] \marginnote{Positive semidefinite matrix}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive semidefinite iff
\begin{equation*}
\forall \vec{x} \in \mathbb{R}^n : \vec{x}^T \matr{A} \vec{x} \geq 0
\end{equation*}
\item[Positive definite matrix] \marginnote{Positive definite matrix}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive definite iff
\begin{equation*}
\forall \vec{x} \in \mathbb{R}^n \smallsetminus \{0\}: \vec{x}^T \matr{A} \vec{x} > 0
\end{equation*}
%
It has the following properties:
\begin{enumerate}
\item The null space of $\matr{A}$ contains only the null vector: $\text{Ker}(\matr{A}) = \{ \nullvec \}$,
which implies that $\matr{A}$ is non-singular (\Cref{th:kernel_invertible}).
\item The diagonal elements of $\matr{A}$ are all positive.
\end{enumerate}
\end{description}
\section{Orthogonality}
\begin{description}
\item[Angle between vectors] \marginnote{Angle between vectors}
The angle $\omega$ between two vectors $\vec{x}$ and $\vec{y}$ can be obtained from:
\begin{equation*}
\cos\omega = \frac{\left\langle \vec{x}, \vec{y} \right\rangle }{\Vert \vec{x} \Vert_2 \cdot \Vert \vec{y} \Vert_2}
\end{equation*}
\item[Orthogonal vectors] \marginnote{Orthogonal vectors}
Two vectors $\vec{x}$ and $\vec{y}$ are orthogonal ($\vec{x} \perp \vec{y}$) when:
\[ \left\langle \vec{x}, \vec{y} \right\rangle = 0 \]
\item[Orthonormal vectors] \marginnote{Orthonormal vectors}
Two vectors $\vec{x}$ and $\vec{y}$ are orthonormal when:
\[ \vec{x} \perp \vec{y} \text{ and } \Vert \vec{x} \Vert = \Vert \vec{y} \Vert=1 \]
\begin{theorem}
The canonical basis of a vector space is orthonormal.
\end{theorem}
\item[Orthogonal matrix] \marginnote{Orthogonal matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is orthogonal if its columns are \underline{orthonormal} vectors.
It has the following properties:
\begin{enumerate}
\item $\matr{A}\matr{A}^T = \matr{I} = \matr{A}^T\matr{A}$, which implies $\matr{A}^{-1} = \matr{A}^T$.
\item The length of a vector is unchanged when mapped through an orthogonal matrix:
\[ \Vert \matr{A}\vec{x} \Vert^2 = \Vert \vec{x} \Vert^2 \]
\item The angle between two vectors is unchanged when both are mapped through an orthogonal matrix:
\[
\cos\omega = \frac{(\matr{A}\vec{x})^T(\matr{A}\vec{y})}{\Vert \matr{A}\vec{x} \Vert \cdot \Vert \matr{A}\vec{y} \Vert} =
\frac{\vec{x}^T\vec{y}}{\Vert \vec{x} \Vert \cdot \Vert \vec{y} \Vert}
\]
\end{enumerate}
Note: an orthogonal matrix represents a rotation (possibly combined with a reflection).
\item[Orthogonal basis] \marginnote{Orthogonal basis}
Let $V$ be an $n$-dimensional vector space and $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ a basis of $V$.
$\beta$ is an orthogonal basis if:
\[ \vec{b}_i \perp \vec{b}_j \text{ for } i \neq j \text{ (i.e.} \left\langle \vec{b}_i, \vec{b}_j \right\rangle = 0 \text{)} \]
\item[Orthonormal basis] \marginnote{Orthonormal basis}
Let $V$ be an $n$-dimensional vector space and $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ an orthogonal basis of $V$.
$\beta$ is an orthonormal basis if:
\[ \Vert \vec{b}_i \Vert_2 = 1 \text{ (or} \left\langle \vec{b}_i, \vec{b}_i \right\rangle = 1 \text{)} \]
\item[Orthogonal complement] \marginnote{Orthogonal complement}
Let $V$ be an $n$-dimensional vector space and $U \subseteq V$ an $m$-dimensional subspace.
The orthogonal complement $U^\perp$ of $U$ is a $(n-m)$-dimensional subspace of $V$ such that it
contains all the vectors orthogonal to every vector in $U$:
\[ \forall \vec{w} \in V: \vec{w} \in U^\perp \iff (\forall \vec{u} \in U: \vec{w} \perp \vec{u}) \]
%
Note that $U \cap U^\perp = \{ \nullvec \}$ and
it is possible to represent all vectors in $V$ as a linear combination of both the basis of $U$ and $U^\perp$.
When $U^\perp$ is one-dimensional (i.e. $m = n-1$), the vector $\vec{w} \in U^\perp$ s.t. $\Vert \vec{w} \Vert = 1$ is the \textbf{normal vector} of $U$. \marginnote{Normal vector}
%
\begin{figure}[ht]
\centering
\includegraphics[width=0.4\textwidth]{img/_orthogonal_complement.pdf}
\caption{Orthogonal complement of a subspace $U \subseteq \mathbb{R}^3$}
\end{figure}
\end{description}
\section{Projections}
Projections are methods to map high-dimensional data into a lower-dimensional space
while minimizing the compression loss.\\
\marginnote{Orthogonal projection}
Let $V$ be a vector space and $U \subseteq V$ a subspace of $V$.
A linear mapping $\pi: V \rightarrow U$ is a (orthogonal) projection if:
\[ \pi^2 = \pi \circ \pi = \pi \]
In other words, applying $\pi$ multiple times gives the same result (i.e. idempotency).\\
$\pi$ can be expressed as a transformation matrix $\matr{P}_\pi$ such that:
\[ \matr{P}_\pi^2 = \matr{P}_\pi \]
\subsection{Projection onto general subspaces} \marginnote{Projection onto subspace basis}
To project a vector $\vec{x} \in \mathbb{R}^n$ into a lower-dimensional subspace $U \subseteq \mathbb{R}^n$,
it is possible to use the basis of $U$.\\
%
Let $m = \text{dim}(U)$ be the dimension of $U$ and
$\matr{B} = (\vec{b}_1, \dots, \vec{b}_m) \in \mathbb{R}^{n \times m}$ an ordered basis of $U$.
A projection $\pi_U(\vec{x})$ represents $\vec{x}$ as a linear combination of the basis:
\[ \pi_U(\vec{x}) = \sum_{i=1}^{m} \lambda_i \vec{b}_i = \matr{B}\vec{\uplambda} \]
where $\vec{\uplambda} = (\lambda_1, \dots, \lambda_m)^T \in \mathbb{R}^{m}$ contains the new coordinates of $\vec{x}$.
It is found by minimizing the distance between $\pi_U(\vec{x})$ and $\vec{x}$,
which leads to the normal equations $\matr{B}^T\matr{B}\vec{\uplambda} = \matr{B}^T\vec{x}$.
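A NumPy sketch of this construction, projecting an illustrative vector onto a one-dimensional subspace of $\mathbb{R}^3$:
\begin{lstlisting}
import numpy as np

B = np.array([[1.0], [1.0], [0.0]])  # basis of a 1-D subspace U of R^3
x = np.array([2.0, 3.0, 4.0])

lam = np.linalg.solve(B.T @ B, B.T @ x)  # normal equations
print(B @ lam)  # projection of x onto U: [2.5, 2.5, 0.]
\end{lstlisting}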
\section{Eigenvectors and eigenvalues}
Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
$\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if: \marginnote{Eigenvector}
\[ \matr{A}\vec{x} = \lambda\vec{x} \]
It is equivalent to say that:
\begin{itemize}
\item $\lambda$ is an eigenvalue of $\matr{A} \in \mathbb{R}^{n \times n}$
\item $\exists \vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ s.t. $\matr{A}\vec{x} = \lambda\vec{x}$ \\
Equivalently the system $(\matr{A} - \lambda \matr{I}_n)\vec{x} = \nullvec$ is non-trivial ($\vec{x} \neq \nullvec$).
\item $\text{rank}(\matr{A} - \lambda \matr{I}_n) < n$
\item $\det(\matr{A} - \lambda \matr{I}_n) = 0$ (i.e. $(\matr{A} - \lambda \matr{I}_n)$ is singular {\footnotesize(i.e. not invertible)})
\end{itemize}
Note that eigenvectors are not unique.
Given an eigenvector $\vec{x}$ of $\matr{A}$ with eigenvalue $\lambda$,
we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is an eigenvector of $\matr{A}$:
\[ \matr{A}(c\vec{x}) = c(\matr{A}\vec{x}) = c\lambda\vec{x} = \lambda(c\vec{x}) \]
\begin{theorem} \marginnote{Eigenvalues and positive definiteness}
$\matr{A} \in \mathbb{R}^{n \times n}$ is symmetric positive definite $\iff$
its eigenvalues are all positive.
\end{theorem}
\begin{description}
\item[Eigenspace] \marginnote{Eigenspace}
Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalue $\lambda$.
This set is a subspace of $\mathbb{R}^n$.
\item[Eigenspectrum] \marginnote{Eigenspectrum}
Set of all eigenvalues of $\matr{A} \in \mathbb{R}^{n \times n}$.
\end{description}
\begin{description}
\item[Geometric multiplicity] \marginnote{Geometric multiplicity}
Given an eigenvalue $\lambda$ of a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
The geometric multiplicity of $\lambda$ is the number of linearly independent eigenvectors associated to $\lambda$.
\end{description}
\begin{theorem} \marginnote{Linearly independent eigenvectors}
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
If its $n$ eigenvectors $\vec{x}_1, \dots, \vec{x}_n$ are associated to distinct eigenvalues,
then $\vec{x}_1, \dots, \vec{x}_n$ are linearly independent (i.e. they form a basis of $\mathbb{R}^n$).
\begin{descriptionlist}
\item[Defective matrix] \marginnote{Defective matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is defective if it has fewer than $n$ linearly independent eigenvectors.
\end{descriptionlist}
\end{theorem}
\begin{theorem}[Spectral theorem] \label{th:spectral_theorem} \marginnote{Spectral theorem}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
Its eigenvectors form an orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
\end{theorem}
\subsection{Diagonalizability}
\marginnote{Diagonalizable matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is diagonalizable if it is similar to a diagonal matrix $\matr{D} \in \mathbb{R}^{n \times n}$:
\[ \exists \matr{P} \in \mathbb{R}^{n \times n} \text{ s.t. } \matr{P} \text{ invertible and } \matr{D} = \matr{P}^{-1}\matr{A}\matr{P} \]
\begin{theorem}
Similar matrices have the same eigenvalues.
\end{theorem}
\begin{theorem} \marginnote{Symmetric matrix diagonalizability}
A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is always diagonalizable.
\end{theorem}
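Both results can be checked numerically (a sketch using \texttt{numpy.linalg.eigh}, NumPy's eigensolver for symmetric matrices; the matrix is illustrative):
\begin{lstlisting}
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])  # symmetric
w, P = np.linalg.eigh(A)    # eigenvalues, orthonormal eigenvectors
print(np.allclose(P.T @ A @ P, np.diag(w)))  # True: D = P^{-1} A P
print(np.allclose(P.T @ P, np.eye(2)))       # True: P is orthogonal
\end{lstlisting}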

@@ -0,0 +1,242 @@
\chapter{Linear systems}
A linear system:
\begin{equation*}
\begin{cases}
a_{1,1}x_1 + a_{1,2}x_2 + \dots + a_{1,n}x_n = b_1\\
a_{2,1}x_1 + a_{2,2}x_2 + \dots + a_{2,n}x_n = b_2\\
\hspace*{7em} \vdots \\
a_{m,1}x_1 + a_{m,2}x_2 + \dots + a_{m,n}x_n = b_m\\
\end{cases}
\end{equation*}
can be represented as:
\[ \matr{A}\vec{x} = \vec{b} \]
where:
\[
\matr{A} =
\begin{pmatrix}
a_{1,1} & a_{1, 2} & \hdots & a_{1,n} \\
a_{2,1} & a_{2, 2} & \hdots & a_{2,n} \\
\vdots & \vdots & \ddots & \vdots \\
a_{m,1} & a_{m, 2} & \hdots & a_{m,n}
\end{pmatrix} \in \mathbb{R}^{m \times n}
\hspace*{2em}
%
\vec{x} =
\begin{pmatrix}
x_1 \\
x_2 \\
\vdots \\
x_n
\end{pmatrix} \in \mathbb{R}^n
\hspace*{2em}
%
\vec{b} =
\begin{pmatrix}
b_1 \\
b_2 \\
\vdots \\
b_m
\end{pmatrix} \in \mathbb{R}^m
\]
\section{Square linear systems}
\marginnote{Square linear system}
A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
has a unique solution iff any of the following (equivalent) conditions holds:
\begin{enumerate}
\item $\matr{A}$ is non-singular (invertible)
\item $\text{rank}(\matr{A}) = n$ (full rank)
\item $\matr{A}\vec{x} = \nullvec$ only admits the solution $\vec{x} = \nullvec$
\end{enumerate}
The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
\[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.
Therefore, numerical methods are usually more suited.
The two main families of methods are:
\begin{itemize}
\item Direct methods.
\item Iterative methods.
\end{itemize}
\section{Direct methods}
\marginnote{Direct methods}
Direct methods compute the solution of a linear system in a finite number of steps.
Compared to iterative methods, they are more precise but more expensive.
The most common approach consists in factorizing the matrix $\matr{A}$.
\subsection{Gaussian factorization}
\marginnote{Gaussian factorization\\(LU decomposition)}
Given a square linear system $\matr{A}\vec{x} = \vec{b}$,
the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
\begin{itemize}
\item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix.
\item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix.
\end{itemize}
%
The system can be decomposed into:
\[
\begin{split}
\matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
& \iff
\begin{cases}
\matr{L}\vec{y} = \vec{b} \\
\vec{y} = \matr{U}\vec{x}
\end{cases}
\end{split}
\]
To find the solution, it is sufficient to solve in order:
\begin{enumerate}
\item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
\item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
\end{enumerate}
The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
$O(\frac{n^3}{3})$ is the time complexity of the LU factorization.
$O(n^2)$ is the complexity to directly solve a system with a triangular matrix (forward or backward substitutions).
\subsection{Gaussian factorization with pivoting}
\marginnote{Gaussian factorization with pivoting}
During the computation of $\matr{A} = \matr{L}\matr{U}$
(using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
a division by 0 may occur.
A method to prevent this problem, and to reduce the algorithmic error (e.g. overflows), is to change the order of the rows of $\matr{A}$ before decomposing it.
This is achieved by using a permutation matrix $\matr{P}$, which is obtained as a permutation of the identity matrix.
The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
The system can be decomposed into:
\[
\begin{split}
\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
& \iff
\begin{cases}
\matr{L}\vec{y} = \matr{P}\vec{b} \\
\vec{y} = \matr{U}\vec{x}
\end{cases}
\end{split}
\]
An alternative formulation (which is what \texttt{SciPy} uses)
is defined as:
\[\matr{A} = \matr{P}\matr{L}\matr{U} \iff \matr{P}^T\matr{A} = \matr{L}\matr{U} \]
It must be noted that $\matr{P}$ is orthogonal, so $\matr{P}^T = \matr{P}^{-1}$.
The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can be found as above.
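With SciPy, the whole procedure reads as follows (a sketch with an illustrative system; \texttt{scipy.linalg.lu} returns the $\matr{A} = \matr{P}\matr{L}\matr{U}$ form):
\begin{lstlisting}
import numpy as np
from scipy.linalg import lu, solve_triangular

A = np.array([[2.0, 1.0], [4.0, 3.0]])
b = np.array([3.0, 7.0])

P, L, U = lu(A)                               # A = P L U
y = solve_triangular(L, P.T @ b, lower=True)  # forward substitution
x = solve_triangular(U, y)                    # backward substitution
print(x)  # [1. 1.]
\end{lstlisting}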
\subsection{Cholesky factorization}
Given a symmetric positive definite matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
It is possible to decompose $\matr{A}$ as:
\[ \matr{A} = \matr{L}\matr{L}^T \]
where $\matr{L}$ is lower triangular.
A square system where $\matr{A}$ is symmetric positive definite can be solved as above using the Cholesky factorization.
This method has time complexity $O(\frac{n^3}{6})$.
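The SciPy counterpart uses \texttt{cho\_factor}/\texttt{cho\_solve} (a sketch; the symmetric positive definite matrix is illustrative):
\begin{lstlisting}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.array([[4.0, 2.0], [2.0, 3.0]])  # symmetric positive definite
b = np.array([6.0, 5.0])

c, low = cho_factor(A)         # A = L L^T
print(cho_solve((c, low), b))  # [1. 1.]
\end{lstlisting}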
\section{Iterative methods}
\marginnote{Iterative methods}
Iterative methods solve a linear system by computing a sequence that converges to the exact solution.
Compared to direct methods, they are less precise but computationally faster and more suited for large systems.
The overall idea is to build a sequence of vectors $\vec{x}_k$
that converges to the exact solution $\vec{x}^*$:
\[ \lim_{k \rightarrow \infty} \vec{x}_k = \vec{x}^* \]
Generally, the first vector $\vec{x}_0$ is given (or guessed). Subsequent vectors are computed w.r.t. the previous iteration
as $\vec{x}_k = g(\vec{x}_{k-1})$.
The two most common families of iterative methods are:
\begin{descriptionlist}
\item[Stationary methods] \marginnote{Stationary methods}
compute the sequence as:
\[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
where $\matr{B}$ is called iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system.
The time complexity per iteration is $O(n^2)$.
\item[Gradient-like methods] \marginnote{Gradient-like methods}
have the form:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
where $\alpha_{k-1} \in \mathbb{R}$ and the vector $\vec{p}_{k-1}$ is called direction.
\end{descriptionlist}
\subsection{Stopping criteria}
\marginnote{Stopping criteria}
One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
The most common approaches are:
\begin{descriptionlist}
\item[Residual based]
The algorithm is terminated when the current solution is close enough to the exact solution.
The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
Given a tolerance $\varepsilon$, the algorithm may stop when:
\begin{itemize}
\item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute)
\item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative)
\end{itemize}
\item[Update based]
The algorithm is terminated when the difference between iterations is very small.
Given a tolerance $\tau$, the algorithm stops when:
\[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
\end{descriptionlist}
Obviously, as the sequence is truncated, a truncation error is introduced when using iterative methods.
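As a concrete stationary method, the sketch below implements Jacobi iteration, where $\matr{B} = -\matr{D}^{-1}(\matr{A} - \matr{D})$ and $\vec{d} = \matr{D}^{-1}\vec{b}$ with $\matr{D}$ the diagonal of $\matr{A}$ (convergence is guaranteed only for suitable matrices, e.g. strictly diagonally dominant ones; the data is illustrative):
\begin{lstlisting}
import numpy as np

def jacobi(A, b, x0, eps=1e-8, max_iter=1000):
    D = np.diag(A)      # diagonal of A (as a vector)
    R = A - np.diag(D)  # off-diagonal part
    x = x0
    for _ in range(max_iter):
        x = (b - R @ x) / D  # one stationary iteration
        if np.linalg.norm(b - A @ x) <= eps * np.linalg.norm(b):
            break            # relative residual criterion
    return x

A = np.array([[4.0, 1.0], [2.0, 5.0]])  # strictly diagonally dominant
b = np.array([5.0, 7.0])
print(jacobi(A, b, np.zeros(2)))  # ~[1. 1.]
\end{lstlisting}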
\section{Condition number}
Inherent error causes inaccuracies during the resolution of a system.
This problem is independent of the algorithm and is estimated using exact arithmetic.
Given a system $\matr{A}\vec{x} = \vec{b}$, we perturb $\matr{A}$ and/or $\vec{b}$ and study the resulting inherent error.
For instance, if we perturb $\vec{b}$, we obtain the following system:
\[ \matr{A}\tilde{\vec{x}} = (\vec{b} + \Delta\vec{b}) \]
After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
By comparing $\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert}$ and $\frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}$,
we can quantify the error introduced by the perturbation.
It can be shown that:
\[
\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert} \leq
\Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \cdot \frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}
\]
\]
Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as: \marginnote{Condition number}
\[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \]
A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned}
(i.e. a small perturbation of the input causes a large change in the output).
Otherwise, it is \textbf{well-conditioned}. \marginnote{Well-conditioned}
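NumPy computes $K(\matr{A})$ directly via \texttt{np.linalg.cond} (2-norm by default); a classic ill-conditioned example is the Hilbert matrix:
\begin{lstlisting}
import numpy as np
from scipy.linalg import hilbert

print(np.linalg.cond(np.eye(4)))   # 1.0: well-conditioned
print(np.linalg.cond(hilbert(4)))  # ~1.6e4: ill-conditioned
\end{lstlisting}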
\section{Linear least squares problem}
A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
does not generally have a solution.
\marginnote{Linear least squares}
Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
This problem is usually formulated as:
\[
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
\begin{descriptionlist}
\item[$\text{rank}(\matr{A}) = n$]
The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
\marginnote{Normal equation}
It is found by solving the normal equation:
\[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
$\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization.
\item[$\text{rank}(\matr{A}) < n$]
The system admits infinitely many solutions.
Of all the solutions $S$, we are interested in the one with minimum norm:
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
\end{descriptionlist}
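For the full-rank case, the sketch below solves the normal equation via Cholesky and cross-checks it against \texttt{numpy.linalg.lstsq}, which also handles the rank-deficient (minimum-norm) case; the data is illustrative:
\begin{lstlisting}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0]])  # m = 3 > n = 2, full rank
b = np.array([1.0, 2.0, 2.0])

x = cho_solve(cho_factor(A.T @ A), A.T @ b)  # normal equation
x_ref, *_ = np.linalg.lstsq(A, b, rcond=None)
print(np.allclose(x, x_ref))  # True
\end{lstlisting}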

@@ -0,0 +1,306 @@
\chapter{Machine learning}
\section{Models}
\begin{description}
\item[Function model] \marginnote{Function model}
The model (predictor) is a deterministic function:
\[ f: \mathbb{R}^D \rightarrow \mathbb{R} \]
In this course, only linear functions are considered:
\[ f_\vec{\uptheta}(\vec{x}) = \uptheta_0 + \uptheta_1 x_1 + \dots + \uptheta_D x_D = \vec{\uptheta}^T \vec{x} \]
where $\vec{x} = \begin{pmatrix} 1, x_1, \dots, x_D \end{pmatrix}$ is the input vector and
$\vec{\uptheta} = \begin{pmatrix} \uptheta_0, \dots, \uptheta_D \end{pmatrix}$ is the parameter vector.
\item[Probabilistic model] \marginnote{Probabilistic model}
The model is a multivariate probabilistic distribution that
is able to quantify uncertainty in noisy data.
\end{description}
\section{Learning}
\subsection{Empirical risk minimization}
\marginnote{Empirical risk minimization}
Used for function models.
The parameters of the predictor are obtained by solving an optimization problem that minimizes the distance
between the predictions and the ground truth.
Let $(\vec{x}_n, y_n)$ be a dataset of $N$ elements
where $\vec{x}_n \in \mathbb{R}^D$ are the examples and $y_n \in \mathbb{R}$ are the labels.
We want to estimate a predictor $f_\vec{\uptheta}(\vec{x}) = \vec{\uptheta}^T \vec{x}$ with parameters $\vec{\uptheta}$
such that, with the ideal parameters $\vec{\uptheta}^*$, it fits the data well:
\[ f_{\vec{\uptheta}^*}(\vec{x}_n) \approx y_n \]
We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n)$.
\begin{description}
\item[Loss function] \marginnote{Loss function}
A loss function $\ell(y_n, \hat{y}_n)$ indicates how a predictor fits the data.
An assumption commonly made in machine learning is that
the dataset $(\vec{x}_n, y_n)$ is independent and identically distributed.
Therefore, the empirical mean is a good estimate of the population mean.
\item[Empirical risk] \marginnote{Empirical risk}
Given the example matrix $\matr{X} = \begin{pmatrix} \vec{x}_1, \dots, \vec{x}_N \end{pmatrix} \in \mathbb{R}^{N \times D}$
and the label vector $\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix} \in \mathbb{R}^N$.
The empirical risk is given by the average loss:
\[ \textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}, \vec{y}) = \frac{1}{N} \sum_{n=1}^{N} \ell(y_n, \hat{y}_n) \]
\begin{description}
\item[Least-squares loss] \marginnote{Least-squares loss}
The least-squares loss is defined as:
\[ \ell(y_n, \hat{y}_n) = (y_n - \hat{y}_n)^2 \]
Therefore, the minimization task is:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - f_\vec{\uptheta}(\vec{x}_n))^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - \vec{\uptheta}^T\vec{x}_n)^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\]
\end{description}
\item[Expected risk] \marginnote{Expected risk}
The expected risk is defined as:
\[ \textbf{R}_\text{true}(f_\vec{\uptheta}) = \mathbb{E}_{\vec{x}, y}[\ell(y, f_\vec{\uptheta}(\vec{x}))] \]
where the parameters $\vec{\uptheta}$ are fixed and the expectation is taken over the true data distribution (in practice, it is approximated on a test set).
\item[Overfitting] \marginnote{Overfitting}
\sloppy
A predictor $f_\vec{\uptheta}$ is overfitting when $\textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}_\text{train}, \vec{y}_\text{train})$
underestimates $\textbf{R}_\text{true}(f_\vec{\uptheta})$ (i.e. the loss on the training set is low, but the loss on the test set is high).
\item[Regularization] \marginnote{Regularization}
Method that adds a penalty term to the loss in order to
find a compromise between the accuracy and the complexity of the solution:
\[ \bar{\ell}(y_n, \hat{y}_n) = \ell(y_n, \hat{y}_n) + \lambda \mathcal{R}(\vec{\uptheta}) \]
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the regularizer (penalty term).
\begin{description}
\item[Regularized least squares] \marginnote{Regularized least squares}
A simple regularization term for the least squares problem is $\Vert \vec{\uptheta} \Vert^2$.
The problem becomes (a NumPy sketch follows this list):
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D}
\{ \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
\end{description}
\end{description}
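A minimal NumPy sketch of regularized least squares on synthetic data. Note that, because of the $\frac{1}{N}$ factor in the empirical risk, the associated normal equation is $(\matr{X}^T\matr{X} + N\lambda\matr{I})\vec{\uptheta} = \matr{X}^T\vec{y}$:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 50, 3
X = rng.normal(size=(N, D))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.1 * rng.normal(size=N)

lam = 0.1
# Minimizer of (1/N) ||y - X theta||^2 + lam ||theta||^2
theta = np.linalg.solve(X.T @ X + N * lam * np.eye(D), X.T @ y)
\end{verbatim}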
\subsection{Maximum likelihood estimation (MLE)}
% \marginnote{Maximum likelihood estimation (MLE)}
Used for probabilistic models.
The parameters are chosen as those under which the observed labels are most likely given the inputs.
\begin{description}
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
\sloppy
Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$,
the negative log-likelihood of $\bm{x}$ is:
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
Note that:
\begin{itemize}
\item The minus is added as we are converting the problem of maximizing the likelihood to a minimization problem.
\item The logarithm is useful for numerical stability.
\end{itemize}
$\mathcal{L}_{\bm{x}}(\vec{\uptheta})$ indicates how likely it is to observe $\bm{x}$ with
$\vec{\uptheta}$ as the parameters of the predictor.
Given a dataset $(\bm{x}_n, y_n)$ of $N$ independent and identically distributed (i.i.d.) elements,
optimizing the likelihood allows us to find the parameters that most likely generated the dataset.
As the samples are independent, we have that:
\[ p_\vec{\uptheta}(\vec{y} \vert \matr{X}) = \prod_{n=1}^{N} p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
where $\matr{X} = \begin{pmatrix} \bm{x}_1, \dots, \bm{x}_N \end{pmatrix}$ and
$\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix}$.
Moreover, as the samples are identically distributed,
each factor $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same form.
By applying the logarithm, we have that the negative log-likelihood of an i.i.d. dataset is defined as:
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
and to find good parameters $\vec{\uptheta}$, we solve the problem:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n)
\]
\begin{description}
\item[Gaussian likelihood] \marginnote{Gaussian likelihood}
Consider a linear model $\bm{x}^T\vec{\uptheta}$ as predictor and
assume that the likelihood has a Gaussian distribution:
\[ p_\vec{\uptheta}(y_n \,\vert\, \bm{x}_n) = \mathcal{N}(y_n \,\vert\, \bm{x}_n^T\vec{\uptheta}, \sigma^2) \]
where the Gaussian distribution has mean $\bm{x}_n^T\vec{\uptheta}$ (i.e. $f_\vec{\uptheta}(\bm{x}_n)$)
and variance $\sigma^2$ for the $n$-th data point.
The negative log-likelihood is:
\[
\begin{split}
\mathcal{L}(\vec{\uptheta}) &= -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \\
&= -\sum_{n=1}^{N} \log \mathcal{N}(y_n \vert \bm{x}_n^T\vec{\uptheta}, \sigma^2) \\
&= -\sum_{n=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) \right) \\
&= -\sum_{n=1}^{N} \log\exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}} \\
&= \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}
\end{split}
\]
The minimization problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) &=
\min_{\vec{\uptheta} \in \mathbb{R}^D}
\overbrace{\frac{1}{2\sigma^2}}^{\mathclap{\text{constant}}}
\sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 -
\overbrace{\sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}}^{\mathclap{\text{constant}}} \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\end{split}
\]
which corresponds to the least squares problem.
\end{description}
\begin{figure}[ht]
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
\caption{When the parameters are good, the label will be near the mean (i.e. predictor)}
\end{subfigure}
\hspace*{1em}
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_bad.png}
\caption{When the parameters are bad, the label will be far from the mean}
\end{subfigure}
\caption{Geometric interpretation of the Gaussian likelihood}
\end{figure}
\end{description}
\subsection{Maximum a posteriori estimation (MAP)}
\marginnote{Maximum a posteriori (MAP)}
Maximum a posteriori estimation maximizes the posterior, i.e. the reverse conditional with respect to MLE:
\[
\max_{\vec{\uptheta} \in \mathbb{R}^D} p(\vec{\uptheta} \vert \matr{X}, \vec{y}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{\uptheta} \vert \matr{X}, \vec{y})
\]
In other words, it maximizes the probability of a set of parameters $\vec{\uptheta}$ given the observation of the dataset $(\matr{X}, \vec{y})$.
By applying Bayes' theorem, the problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D}
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
\end{split}
\]
\begin{description}
\item[Gaussian posteriori] \marginnote{Gaussian posteriori}
By assuming that the conditional probability of the dataset follows a Gaussian distribution (as in MLE),
the problem becomes:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \} =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 -\log p(\vec{\uptheta}) \}
\]
Moreover, assuming that $\vec{\uptheta} \sim \mathcal{N}(\nullvec, \sigma^2\matr{I})$, we have that, up to an additive constant:
\[ -\log p(\vec{\uptheta}) = \frac{1}{2\sigma^2} \Vert \vec{\uptheta} \Vert^2 \]
Therefore, absorbing the multiplicative constants into $\lambda$, the problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
MAP can thus be seen as a regularized version of MLE, where the prior acts as the regularizer.
\end{description}
\section{Linear regression}
\marginnote{Linear regression}
Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is Gaussian noise,
we want to estimate the function $f$.
\begin{description}
\item[Model]
We use as the predictor:
\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
Because of the noise, we use a probabilistic model with likelihood:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
\item[Parameter estimation]
To estimate $\vec{\uptheta}$, we can use MLE:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} -\log p_\vec{\uptheta}(\vec{y} \vert \matr{X}) \]
\end{description}
\subsection{Maximum likelihood estimation with features}
\marginnote{MLE with features}
Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
Therefore, it is possible to apply any transformation to the inputs of the predictor $f$ such that:
\[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ is a transformation and
$\vec{\uptheta} \in \mathbb{R}^K$ are the parameters.
Given a dataset of $N$ entries $\vec{x}_n \in \mathbb{R}^D$ with labels $y_n \in \mathbb{R}$
and a transformation function $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$,
the transformed features can be expressed through a feature matrix $\matr{\Phi} \in \mathbb{R}^{N \times K}$:
\[
\matr{\Phi} =
\begin{pmatrix}
(\phi(\vec{x}_1))^T \\ \vdots \\ (\phi(\vec{x}_N))^T
\end{pmatrix}
=
\begin{pmatrix}
\phi_0(\vec{x}_1) & \cdots & \phi_{K-1}(\vec{x}_1) \\
\vdots & \ddots & \vdots \\
\phi_0(\vec{x}_N) & \cdots & \phi_{K-1}(\vec{x}_N) \\
\end{pmatrix}
\]
The negative log-likelihood can be defined as:
\[
-\log p_\vec{\uptheta}(\vec{y} \,\vert\, \matr{X}) =
\frac{1}{2\sigma^2} (\vec{y} - \matr{\Phi}\vec{\uptheta})^T (\vec{y} - \matr{\Phi}\vec{\uptheta}) + \text{constant}
\]
As the problem is convex and $\matr{\Phi}$ is (usually) full rank, it can be solved directly using the normal equation:
\[
\matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \matr{\Phi}^T \vec{y} \iff
\vec{\uptheta} = (\matr{\Phi}^T \matr{\Phi})^{-1} \matr{\Phi}^T \vec{y}
\]
Alternatively, the negative log-likelihood can also be minimized with a gradient method (a NumPy sketch of the direct solution follows the RMSE definition below).
\begin{description}
\item[Root mean square error (RMSE)] \marginnote{Root mean square error (RMSE)}
RMSE is computed as:
\[
\sqrt{ \frac{1}{N} \Vert \vec{y} - \matr{\Phi}\vec{\uptheta} \Vert^2 } =
\sqrt{ \frac{1}{N} \sum_{n=1}^{N}(y_n - (\phi(\vec{x}_n))^T\vec{\uptheta})^2 }
\]
Unlike MSE, RMSE is expressed on the same scale as the labels,
which makes errors easier to interpret and to compare across datasets of different sizes.
By comparing the RMSE of the train and test sets, it is possible to check if a model is overfitting.
\end{description}
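As a concrete sketch (with illustrative data and an illustrative feature map), the fit and its RMSE can be computed with NumPy:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(-1.0, 1.0, 30)
y = np.sin(np.pi * x) + 0.1 * rng.normal(size=x.size)

# Feature matrix Phi with phi(x) = (1, x, sin(pi x))
Phi = np.column_stack([np.ones_like(x), x, np.sin(np.pi * x)])
# Least squares fit (the solution of the normal equation).
theta, *_ = np.linalg.lstsq(Phi, y, rcond=None)

rmse = np.sqrt(np.mean((y - Phi @ theta) ** 2))
\end{verbatim}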
\begin{description}
\item[Polynomial regression] \marginnote{Polynomial regression}
The transformation function $\phi: \mathbb{R} \rightarrow \mathbb{R}^K$ is defined as:
\[
\phi(x) =
\begin{pmatrix}
\phi_0(x) \\ \phi_1(x) \\ \phi_2(x) \\ \vdots \\ \phi_{K-1}(x)
\end{pmatrix}
=
\begin{pmatrix}
1 \\ x \\ x^2 \\ \vdots \\ x^{K-1}
\end{pmatrix}
\]
The predictor is then defined as:
\[
\begin{split}
f(x) &= (\phi(x))^T \vec{\uptheta} \\
&= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i = \sum_{i=0}^{K-1} x^i \vartheta_i
\end{split}
\]
\end{description}

View File

@ -0,0 +1,226 @@
\chapter{Matrix decomposition}
\section{Eigendecomposition}
\marginnote{Eigendecomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
If the eigenvectors of $\matr{A}$ form a basis of $\mathbb{R}^n$,
then $\matr{A} \in \mathbb{R}^{n \times n}$ can be decomposed into:
\[ \matr{A} = \matr{P}\matr{D}\matr{P}^{-1} \]
where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and
$\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.
Note that a symmetric matrix can always be decomposed in this form (\Cref{th:spectral_theorem}).
\section{Singular value decomposition}
\marginnote{Singular value decomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r \in [0, \min\{m, n\}]$.
The singular value decomposition (SVD) of $\matr{A}$ is always possible and has form:
\[
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
\]
\[
=
\begin{pmatrix}
\begin{pmatrix} \\ \vec{u}_1 \\ \\ \end{pmatrix} &
\dots &
\begin{pmatrix} \\ \vec{u}_m \\ \\ \end{pmatrix}
\end{pmatrix}
\begin{pmatrix}
\sigma_1 & 0 & 0 \\
0 & \ddots & 0 \\
0 & 0 & \sigma_{\min\{m, n\}} \\
\end{pmatrix}
\begin{pmatrix}
\begin{pmatrix} & \vec{v}_1 & \end{pmatrix} \\
\vdots \\
\begin{pmatrix} & \vec{v}_n & \end{pmatrix} \\
\end{pmatrix}
\]
where:
\begin{itemize}
\item
$\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix whose columns $\vec{u}_i$ are called left-singular vectors.
\item
$\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix whose columns $\vec{v}_i$ are called right-singular vectors.
\item
$\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a matrix with $\matr{\Sigma}_{i,j} = 0$ for $i \neq j$ (i.e. it would be diagonal if it were square) and
the singular values $\sigma_i, i = 1 \dots \min\{m, n\}$ on the diagonal.
By convention $\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_r \geq 0$.
Note that singular values $\sigma_j = 0$ for $(r + 1) \leq j \leq \min\{m, n\}$
(i.e. singular values at indexes after $\text{rank}(\matr{A})$ are always 0).
\end{itemize}
\marginnote{Singular value equation}
We can also represent SVD as a \textbf{singular value equation}, which resembles the eigenvalue equation:
\[ \matr{A}\vec{v}_i = \sigma_i\vec{u}_i \text{ for } i = 1, \dots, r \]
This is derived from:
\[
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}\matr{V}^T\matr{V}
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}
\]
\subsection{Singular values and eigenvalues}
\marginnote{Eigendecomposition of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$}
Given $\matr{A} \in \mathbb{R}^{m \times n}$, we can obtain the eigenvalues and eigenvectors
of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$ through SVD.
For $\matr{A}^T\matr{A}$, we can compute:
\[
\begin{split}
\matr{A}^T\matr{A} & = (\matr{U}\matr{\Sigma}\matr{V}^T)^T(\matr{U}\matr{\Sigma}\matr{V}^T) \text{ using } (\matr{A}\matr{B})^T = \matr{B}^T\matr{A}^T \\
& = (\matr{V}\matr{\Sigma}^T\matr{U}^T)(\matr{U}\matr{\Sigma}\matr{V}^T) \\
& = \matr{V}\matr{\Sigma}^T\matr{\Sigma}\matr{V}^T \\
& = \matr{V}\matr{\Sigma}^2\matr{V}^T
\end{split}
\]
As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
\begin{itemize}
\item The diagonal entries of $\matr{\Sigma}^2$ (i.e. the squared singular values of $\matr{A}$) are the eigenvalues of $\matr{A}^T\matr{A}$.
\item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
\end{itemize}
The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
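A quick numerical check with NumPy (the matrix is illustrative):
\begin{verbatim}
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 3.0], [0.0, 2.0]])
U, s, Vt = np.linalg.svd(A)

# Eigenvalues of A^T A are the squared singular values of A
# (np.linalg.eigh returns them in ascending order).
evals, evecs = np.linalg.eigh(A.T @ A)
assert np.allclose(np.sort(s ** 2), evals)
\end{verbatim}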
\subsection{Singular values and 2-norm}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
we have that $\matr{A}^T\matr{A} = \matr{A}^2 = \matr{A}\matr{A}^T$ (as $\matr{A}^T = \matr{A}$).
The eigenvalues of $\matr{A}^2$ are $\lambda_1^2, \dots,\lambda_n^2$, where $\lambda_i$ are eigenvalues of $\matr{A}$.
Alternatively, the eigenvalues of $\matr{A}^2$ are the squared singular values of $\matr{A}$: $\lambda_i^2 = \sigma_i^2$.
Moreover, the eigenvalues of $\matr{A}^{-1}$ are $\frac{1}{\lambda_1}, \dots, \frac{1}{\lambda_n}$.
\marginnote{2-norm using SVD}
We can compute the 2-norm as:
\[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
\[
\Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} =
\sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} =
\sqrt{\max \left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2} \right\}} = \frac{1}{\sigma_r}
\]
Furthermore, we can compute the condition number of $\matr{A}$ as:
\[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} \]
\subsection{Application: Matrix approximation}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ and its SVD decomposition $\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T$,
we can construct a rank-1 matrix (dyad) $\matr{A}_i \in \mathbb{R}^{m \times n}$ as: \marginnote{Dyad}
\[ \matr{A}_i = \vec{u}_i \vec{v}_i^T \]
where $\vec{u}_i \in \mathbb{R}^m$ is the $i$-th column of $\matr{U}$ and
$\vec{v}_i \in \mathbb{R}^n$ is the $i$-th column of $\matr{V}$.
Then, we can compose $\matr{A}$ as a sum of dyads:
\[ \matr{A} = \sum_{i=1}^{r} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{r} \sigma_i \matr{A}_i \]
\marginnote{Rank-$k$ approximation}
By considering only the first $k < r$ singular values, we can obtain a rank-$k$ approximation of $\matr{A}$:
\[ \hat{\matr{A}}(k) = \sum_{i=1}^{k} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{k} \sigma_i \matr{A}_i \]
\begin{theorem}[Eckart-Young]
Given $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r$.
For any $k \leq r$ (this theorem is interesting for $k < r$), the rank-$k$ approximation is:
\[
\hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2
\]
\end{theorem}
In other words, among all matrices of rank $k$, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$ in 2-norm.
Moreover, the error of the rank-$k$ approximation is:
\[
\Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 =
\left\Vert \sum_{i=1}^{r} \sigma_i \matr{A}_i - \sum_{j=1}^{k} \sigma_j \matr{A}_j \right\Vert_2 =
\left\Vert \sum_{i=k+1}^{r} \sigma_i \matr{A}_i \right\Vert_2 =
\sigma_{k+1}
\]
\subsubsection{Image compression}
Each dyad requires storing $1 + m + n$ numbers (for $\sigma_i$, $\vec{u}_i$ and $\vec{v}_i$, respectively).
A rank-$k$ approximation therefore requires storing $k(1 + m + n)$ numbers.
Therefore, the compression factor is given by: \marginnote{Compression factor}
\[
c_k = 1 - \frac{k(1 + m + n)}{mn}
\]
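A minimal NumPy sketch of the rank-$k$ approximation, with a random matrix standing in for the image:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
img = rng.normal(size=(64, 96))                 # stand-in for an image
U, s, Vt = np.linalg.svd(img, full_matrices=False)

k = 10
approx = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]  # rank-k approximation
m, n = img.shape
c_k = 1 - k * (1 + m + n) / (m * n)             # compression factor

# Eckart-Young: the 2-norm error is the (k+1)-th singular value.
assert np.isclose(np.linalg.norm(img - approx, 2), s[k])
\end{verbatim}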
\begin{figure}[h]
\centering
\includegraphics[width=0.60\textwidth]{img/_rank_k_approx.pdf}
\caption{Approximation of an image}
\end{figure}
\subsection{Application: Linear least squares problem} \label{sec:lls}
Given a least squares problem:
\[
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
When $\text{rank}(\matr{A}) < n$, the system admits infinite solutions.
Of all the solutions $S$, we are interested in the one with minimum norm:
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
This problem can be solved using SVD:
\[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
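A direct implementation of this formula (on an illustrative rank-deficient system):
\begin{verbatim}
import numpy as np

A = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])  # rank 1 < n
b = np.array([1.0, 2.0, 3.0])

U, s, Vt = np.linalg.svd(A)
r = int(np.sum(s > 1e-12))                 # numerical rank
x_star = sum((U[:, i] @ b) / s[i] * Vt[i] for i in range(r))

# Same result as the pseudoinverse (minimum-norm) solution.
assert np.allclose(x_star, np.linalg.pinv(A) @ b)
\end{verbatim}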
\subsection{Application: Polynomial interpolation}
\marginnote{Polynomial interpolation}
Given a set of $m$ data points $(x_i, y_i), i=1, \dots, m$,
we want to find a polynomial of degree $n$ ($m > n$) that approximates them.
In other words, we want to find a function:
\[ f(x) = c_0 + c_1 x + c_2 x^2 + \dots + c_n x^n \]
that minimizes the residual vector $\vec{r} = (r_1, \dots, r_m)$,
where $r_i = \vert y_i - f(x_i) \vert$.
We can formulate this as a linear system:
\[
\vec{r} = \vec{y} - \matr{A}\vec{c} =
\begin{pmatrix}
y_1 \\
\vdots \\
y_m
\end{pmatrix}
-
\begin{pmatrix}
1 & x_1 & x_1^2 & \dots & x_1^n \\
\vdots & \vdots & \vdots & \ddots & \vdots \\
1 & x_m & x_m^2 & \dots & x_m^n
\end{pmatrix}
\begin{pmatrix}
c_0 \\
\vdots \\
c_n
\end{pmatrix}
\]
that can be solved as a linear least squares problem:
\[ \min_{\vec{c} \in \mathbb{R}^{n+1}} \Vert \vec{y} - \matr{A}\vec{c} \Vert_2^2 \]
\begin{figure}[h]
\centering
\includegraphics[width=0.40\textwidth]{img/linear_regression.png}
\caption{Interpolation using a polynomial of degree 1}
\end{figure}
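For polynomials, this least squares fit is also available directly. A sketch of the degree-1 case shown in the figure (the data is made up):
\begin{verbatim}
import numpy as np

x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.1, 0.9, 2.1, 2.9, 4.2])

# Degree-1 least squares fit; coefficients from highest to lowest power.
c1, c0 = np.polyfit(x, y, deg=1)
\end{verbatim}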
\section{Eigendecomposition vs SVD}
\begin{center}
\begin{tabular}{m{16em} | m{16em}}
\hline
\multicolumn{1}{c|}{\textbf{Eigendecomposition}} & \multicolumn{1}{c}{\textbf{SVD}} \\
\multicolumn{1}{c|}{$\matr{A} = \matr{P}\matr{D}\matr{P}^{-1}$} & \multicolumn{1}{c}{$\matr{A}=\matr{U}\matr{\Sigma}\matr{V}^T$} \\
\hline
Only defined for square matrices $\matr{A} \in \mathbb{R}^{n \times n}$ with eigenvectors that form a basis of $\mathbb{R}^n$
& Always exists \\
\hline
$\matr{P}$ is not necessarily orthogonal & $\matr{U}$ and $\matr{V}$ are orthogonal \\
\hline
The elements on the diagonal of $\matr{D}$ may be in $\mathbb{C}$
& The elements on the diagonal of $\matr{\Sigma}$ are all non-negative reals \\
\hline
\multicolumn{2}{c}{For symmetric positive semidefinite matrices, eigendecomposition and SVD coincide} \\
\hline
\end{tabular}
\end{center}

View File

@ -0,0 +1,522 @@
\chapter{Probability and statistics}
\begin{description}
\item[Probability]
Model of a process where the underlying uncertainty is captured by random variables.
\item[Statistics]
Determines the underlying process that explains an observation.
\end{description}
\section{Probability}
\begin{description}
\item[State space] \marginnote{State space}
Set $\Omega$ of all the possible results of an experiment.
\begin{example}
A coin is tossed two times.
$\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$
\end{example}
\item[Event] \marginnote{Event}
Set of possible results (i.e. $A$ is an event if $A \subseteq \Omega$).
\item[Probability] \marginnote{Probability}
Let $\mathcal{E}$ be the set of all the possible events (i.e. the power set of $\Omega$).
Probability is a function that assigns a value to each event:
\[ \prob{\cdot}: \mathcal{E} \rightarrow [0, 1] \]
\begin{example}
Let $\Omega$ be as above.
Given an event $A = \{ (\text{T}, \text{H}), (\text{H}, \text{T}) \}$,
its probability is: $\prob{A} = \frac{2}{4} = \frac{1}{2}$
\end{example}
\item[Conditional probability] \marginnote{Conditional probability}
Probability of an event $B$, knowing that another event $A$ happened:
\[ \prob{B \vert A} = \frac{\prob{A \cap B}}{\prob{A}} \text{, with } \prob{A} \neq 0 \]
\begin{example}
A coin is tossed three times.
Given the events $A = \{ \text{at least two tails} \}$ and $B = \{ \text{at least one heads and one tails} \}$,
we have that:
\begin{minipage}{\linewidth}
\centering
\small
$\Omega = \{
(\text{T}, \text{T}, \text{T}), (\text{T}, \text{T}, \text{H}), (\text{T}, \text{H}, \text{T})
(\text{T}, \text{H}, \text{H}), (\text{H}, \text{T}, \text{T}), (\text{H}, \text{T}, \text{H})
(\text{H}, \text{H}, \text{T}), (\text{H}, \text{H}, \text{H})
\}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{A} = \frac{4}{8} = \frac{1}{2}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{B} = \frac{6}{8} = \frac{3}{4}$
\end{minipage}
\begin{minipage}{.325\linewidth}
\centering
$\prob{A \cap B} = \frac{3}{8}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{A \vert B} = \frac{3/8}{3/4} = \frac{1}{2}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{B \vert A} = \frac{3/8}{1/2} = \frac{3}{4}$
\end{minipage}
\end{example}
\item[Independent events] \marginnote{Independent events}
Two events $A$ and $B$ are independent if:
\[ \prob{A \cap B} = \prob{A}\prob{B} \]
It follows that:
\begin{minipage}{.48\linewidth}
\centering
$\prob{A \vert B} = \prob{A}$
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
$\prob{B \vert A} = \prob{B}$
\end{minipage}
In general, $n$ events $A_1, \dots, A_n$ are (mutually) independent if the product rule holds for every subset of them; in particular:
\[ \prob{A_1 \cap \dots \cap A_n} = \prod_{i=1}^{n} \prob{A_i} \]
\end{description}
\section{Random variables}
\begin{description}
\item[Random variable (RV)] \marginnote{Random variable}
A random variable $X$ is a function:
\[ X: \Omega \rightarrow \mathbb{R} \]
\item[Target space/Support] \marginnote{Target space}
Given a random variable $X$,
the target space (or support) $\mathcal{T}_X$ of $X$ is the set of all its possible values:
\[ \mathcal{T}_X = \{ x \mid x = X(\omega), \forall \omega \in \Omega \} \]
\end{description}
\subsection{Discrete random variables}
\begin{description}
\item[Discrete random variable] \marginnote{Discrete random variable}
A random variable $X$ is discrete if its target space $\mathcal{T}_X$ is finite or countably infinite.
\begin{example}
A coin is tossed twice.
Given the random variable $X(\omega) = \{ \text{number of heads} \}$.
We have that $\mathcal{T}_X = \{ 0, 1, 2 \}$, therefore $X$ is discrete.
\end{example}
\begin{example}
Roll a die until 6 comes out.
Given the random variable $Y(\omega) = \{ \text{number of rolls until 6 comes out} \}$.
We have that $\mathcal{T}_Y = \{ 1, 2, \dots \} = \mathbb{N} \smallsetminus \{0\}$,
therefore $Y$ is discrete as $\mathcal{T}_Y$ is a countable set.
\end{example}
\item[Probability mass function (PMF)] \marginnote{Probability mass function (PMF)}
Given a discrete random variable $X$, its probability mass function is a function $p_X: \mathcal{T}_X \rightarrow [0, 1]$ such that:
\[ p_X(x) = \prob{X = x}, \forall x \in \mathcal{T}_X \]
A PMF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\sum_{x \in \mathcal{T}_X} p_X(x) = 1$
\item Let $A \subseteq \Omega$, $\prob{X = x \in A} = \sum_{x \in A} p_X(x)$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PMF $p_X$.
\begin{example}
Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
Given a random variable $X = \{ \text{number of heads} \}$ with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
Its PMF is:
\[
\begin{split}
p_X(0) &= \prob{X = 0} = \frac{1}{4} \\
p_X(1) &= \prob{X = 1} = \frac{2}{4} \\
p_X(2) &= \prob{X = 2} = \frac{1}{4}
\end{split}
\]
\end{example}
\end{description}
\subsection{Continuous random variables}
\begin{description}
\item[Continuous random variable] \marginnote{Continuous random variable}
A random variable $X$ is continuous if its target space $\mathcal{T}_X$ is uncountably infinite (e.g. an interval of $\mathbb{R}$).
Usually, $\mathcal{T}_X$ is an interval or a union of intervals.
\begin{example}
Given a random variable $Z = \{ \text{Time before the arrival of a client} \}$.
$Z$ is continuous as $\mathcal{T}_Z = [a, b] \subseteq [0, +\infty[$ is an uncountable set.
\end{example}
\item[Probability density function (PDF)] \marginnote{Probability density function (PDF)}
Given a continuous random variable $X$,
its probability density function is a function $p_X: \mathcal{T}_X \rightarrow \mathbb{R}$ such that:
\[ \prob{X \in A} = \int_{A} p_X(x) \,dx \]
\[ \prob{a \leq X \leq b} = \int_{a}^{b} p_X(x) \,dx \]
Note that $\prob{X = a} = \prob{a \leq X \leq a} = \int_{a}^{a} p_X(x) \,dx = 0$.
A PDF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\int_{x \in \mathcal{T}_X} p_X(x) \,dx = 1$
\item $\prob{X \in A} = \int_{A} p_X(x) \,dx$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PDF $p_X$.
\end{description}
\section{Discrete joint distribution}
\begin{description}
\item[Univariate distribution] \marginnote{Univariate distribution}
Distribution with one random variable.
\item[Multivariate distribution] \marginnote{Multivariate distribution}
Distribution with multiple random variables.
\item[Joint probability] \marginnote{Joint probability}
Let $X$ and $Y$ be random variables respectively with target space $\mathcal{T}_X$ and $\mathcal{T}_Y$.
The joint probability of $X$ and $Y$ has target space $\mathcal{T}_{XY} = \mathcal{T}_X \times \mathcal{T}_Y$
and its PMF is:
\[ p_{XY}(x_i, y_j) = \prob{X = x_i \cap Y = y_j} \]
$p_X(x)$ and $p_Y(y)$ are the \textbf{marginal probabilities}. \marginnote{Marginal probability}
\begin{example}
Let $X$ and $Y$ be random variables respectively with five and three possible states.
\begin{center}
\includegraphics[width=0.4\textwidth]{img/_joint_probability_example.pdf}
\end{center}
We denote with:
\begin{itemize}
\item $N$ the number of events.
\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p_{XY}(x_i, y_j) = \frac{n_{ij}}{N}$).
\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column.
\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row.
\end{itemize}
The marginal probabilities are:\\
\begin{minipage}{.48\linewidth}
\centering
\[ p_X(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
\[ p_Y(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
\end{minipage}
The conditional probabilities can be computed as:
\[ \prob{Y = y_j \vert X = x_i} = \frac{p_{XY}(x_i, y_j)}{p_X(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
\[ \prob{X = x_i \vert Y = y_j} = \frac{p_{XY}(x_i, y_j)}{p_Y(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
\end{example}
\end{description}
\section{Rules of probability}
\subsection{Sum rule}
\marginnote{Sum rule\\Marginalization property}
Given two random variables $X$ and $Y$, the sum rule states that:
\[
p_X(\bm{x}) =
\begin{cases}
\sum_{\bm{y} \in \mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) & \text{if } Y \text{ is discrete} \\
\int_{\mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } Y \text{ is continuous}
\end{cases}
\]
The sum rule relates the joint distribution and the marginal distribution.
More generally, the sum rule can be applied to any subset of the random variables of a joint distribution.
Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$,
the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\smallsetminus i} \]
\subsection{Product rule}
\marginnote{Product rule}
\[ p(\bm{x}, \bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) = p(\bm{x} \vert \bm{y}) p(\bm{y}) \]
\section{Bayes' theorem}
\begin{theorem}
\marginnote{Bayes' theorem}
Given two random variables $X$ and $Y$:
\[
\overbrace{p(\bm{x} \vert \bm{y})}^{\mathclap{\text{posterior}}} =
\frac
{ \overbrace{p(\bm{y} \vert \bm{x})}^{\mathclap{\text{likelihood }}} \overbrace{p(\bm{x})}^{\mathclap{\text{ prior}}} }
{\underbrace{p(\bm{y})}_{\mathclap{\text{evidence}}}}
\]
where:
\begin{descriptionlist}
\item[Prior] \marginnote{Prior}
is the prior knowledge of the unobserved data $\bm{x}$.
\item[Likelihood] \marginnote{Likelihood}
describes the relation between $\bm{x}$ and $\bm{y}$.
\item[Posterior] \marginnote{Posterior}
represents the quantity of interest (i.e. knowledge on $\bm{x}$ after observing $\bm{y}$).
\item[Evidence/Marginal likelihood] \marginnote{Evidence/Marginal likelihood}
normalizes the posterior. It is defined independently from $\bm{x}$ (i.e. is constant) as:
\[ p(\bm{y}) = \int p(\bm{y} \vert \bm{x}) p(\bm{x}) \,d\bm{x} \]
\end{descriptionlist}
\end{theorem}
\begin{proof}
This is a direct consequence of the product rule:
\[
p(\bm{x} \vert \bm{y}) p(\bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) \iff
p(\bm{x} \vert \bm{y}) = \frac{p(\bm{y} \vert \bm{x}) p(\bm{x})}{p(\bm{y})}
\]
\end{proof}
Note: sometimes, instead of the full posterior, the maximum is considered (with loss of information):
\[ \max_x p(x \vert y) = \max_x \frac{p(y \vert x) p(x)}{\underbrace{p(y)}_{\mathclap{\text{constant}}}} = \max_x p(y \vert x) p(x) \]
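A small numeric illustration of Bayes' theorem with made-up discrete distributions:
\begin{verbatim}
# p(x): prior over the hidden state x.
prior = {"rain": 0.3, "sun": 0.7}
# p(y = "wet grass" | x): likelihood of the observation.
likelihood = {"rain": 0.9, "sun": 0.2}

evidence = sum(likelihood[x] * prior[x] for x in prior)   # p(y)
posterior = {x: likelihood[x] * prior[x] / evidence for x in prior}
# posterior["rain"] = 0.27 / 0.41 ~ 0.66
\end{verbatim}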
\section{Statistics}
\begin{description}
\item[Statistic] \marginnote{Statistic}
A statistic of a random variable is a deterministic function defined on it.
\end{description}
\subsection{Mean}
\begin{description}
\item[Expected value (univariate)] \marginnote{Expected value (univariate)}
Given a function $g$ of a random variable $X \sim p(x)$,
its expected value is:
\[
\mathbb{E}_X[g(x)] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} g(x)p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} g(x)p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
\item[Expected value (multivariate)] \marginnote{Expected value (multivariate)}
A multivariate random variable $X$ can be seen as
a vector of univariate random variables $\begin{pmatrix} X_1, \dots, X_D \end{pmatrix}^T$.
Its expected value can be computed element-wise as:
\[
\mathbb{E}_X[g(\bm{x})] =
\begin{pmatrix} \mathbb{E}_{X_1}[g(x_1)] \\ \vdots \\ \mathbb{E}_{X_D}[g(x_D)] \end{pmatrix} \in \mathbb{R}^D
\]
\item[Mean] \marginnote{Mean}
Given a random variable $X \sim p(x)$,
the mean of $X$ is its expected value with $g$ defined as the identity:
\[
\mathbb{E}_X[x] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} x \cdot p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} x \cdot p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
\end{description}
\subsection{Variance}
\begin{description}
\item[Covariance (univariate)] \marginnote{Covariance (univariate)}
Given two univariate random variables $X$ and $Y$, their covariance is:
\[ \text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[(x - \mathbb{E}_X[x])(y - \mathbb{E}_Y[y])] \]
\begin{lemma}
$\text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[xy] - \mathbb{E}_{X}[x]\mathbb{E}_{Y}[y]$
\end{lemma}
\item[Variance (univariate)] \marginnote{Variance (univariate)}
The variance of a univariate random variable is given by:
\[ \mathbb{V}_X[x] = \text{Cov}_X[x, x] \]
Its square root is the standard deviation $\sigma(x)$.
\item[Covariance (multivariate)] \marginnote{Covariance (multivariate)}
Given two multivariate random variables
$X$ and $Y$ with states $\bm{x} \in \mathbb{R}^D$ and $\bm{y} \in \mathbb{R}^E$,
their covariance is:
\[
\text{Cov}_{XY}[\bm{x}, \bm{y}] = \text{Cov}_{XY}[\bm{y}, \bm{x}]^T =
\mathbb{E}_{XY}[\bm{xy}^T] - \mathbb{E}_{X}[\bm{x}]\mathbb{E}_{Y}[\bm{y}]^T \in \mathbb{R}^{D \times E}
\]
\item[Variance (multivariate)] \marginnote{Variance (multivariate)}
Given a multivariate random variable $X$ with
states $\bm{x} \in \mathbb{R}^D$ and mean vector $\bm{\mu} \in \mathbb{R}^D$.
Its variance is given by:
\[
\begin{split}
\mathbb{V}_X[\bm{x}] &= \text{Cov}_X[\bm{x}, \bm{x}] \\
&= \mathbb{E}_X[\bm{xx}^T] - \mathbb{E}_X[\bm{x}]\mathbb{E}_X[\bm{x}]^T \\
&=
\begin{pmatrix}
\text{Cov}[x_1, x_1] & \text{Cov}[x_1, x_2] & \cdots & \text{Cov}[x_1, x_D] \\
\text{Cov}[x_2, x_1] & \text{Cov}[x_2, x_2] & \cdots & \text{Cov}[x_2, x_D] \\
\vdots & \vdots & \ddots & \vdots \\
\text{Cov}[x_D, x_1] & \text{Cov}[x_D, x_2] & \cdots & \text{Cov}[x_D, x_D] \\
\end{pmatrix} \in \mathbb{R}^{D \times D}
\end{split}
\]
This matrix is called covariance matrix and is symmetric positive semidefinite.
\item[Correlation] \marginnote{Correlation}
Given two random variables $X$ and $Y$, their correlation is:
\[ \text{corr}[x, y] = \frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}} \in [-1, 1] \]
\begin{itemize}
\item When $\text{corr}[x, y] \rightarrow +1$, $x$ and $y$ are expected to grow together.
\item When $\text{corr}[x, y] \rightarrow -1$, $x$ grows when $y$ decreases and vice versa.
\item When $\text{corr}[x, y] \rightarrow 0$, $x$ and $y$ are not correlated.
\end{itemize}
\end{description}
\subsection{Empirical mean and variance}
In practice, it is not always possible to compute statistics on the whole population.
Empirical estimates are instead computed on a finite sample of the population,
modeled as $N$ i.i.d. random variables $X_1, \dots, X_N$.
\begin{description}
\item[Empirical mean] \marginnote{Empirical mean}
\[ \bar{x} = \frac{1}{N} \sum_{n=1}^{N}x_n \]
\item[Empirical variance] \marginnote{Empirical variance}
\[ \sigma^2 = \frac{1}{N} \sum_{n=1}^{N}(x_n - \bar{x})^2 \]
\end{description}
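Both statistics are immediate to compute; a NumPy sketch (note that NumPy's default variance is also the $\frac{1}{N}$ version):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
samples = rng.normal(loc=2.0, scale=3.0, size=10_000)

x_bar = samples.mean()                    # empirical mean, ~2
sigma2 = np.mean((samples - x_bar) ** 2)  # empirical variance, ~9
assert np.isclose(sigma2, samples.var())  # np default uses 1/N too
\end{verbatim}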
\section{Random variables properties}
\subsection{Manipulations}
\begin{itemize}
\item $\mathbb{E}[\bm{x} + \bm{y}] = \mathbb{E}[\bm{x}] + \mathbb{E}[\bm{y}]$
\marginnote{Manipulations of random variables}
\item $\mathbb{E}[\bm{x} - \bm{y}] = \mathbb{E}[\bm{x}] - \mathbb{E}[\bm{y}]$
\item $\mathbb{V}[\bm{x} + \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] + \text{Cov}[\bm{x}, \bm{y}] + \text{Cov}[\bm{y}, \bm{x}]$
\item $\mathbb{V}[\bm{x} - \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] - \text{Cov}[\bm{x}, \bm{y}] - \text{Cov}[\bm{y}, \bm{x}]$
\end{itemize}
\subsection{Statistical independence}
\marginnote{Statistical independence}
Two random variables $X$ and $Y$ are statistically independent iff:
\[ p(\bm{x}, \bm{y}) = p(\bm{x})p(\bm{y}) \]
\begin{theorem}
If $X$ and $Y$ are statistically independent, then:
\begin{itemize}
\item $p(\bm{x} \vert \bm{y}) = p(\bm{x})$ and $p(\bm{y} \vert \bm{x}) = p(\bm{y})$
\item $\mathbb{V}_{XY}[\bm{x} + \bm{y}] = \mathbb{V}_X[\bm{x}] + \mathbb{V}_Y[\bm{y}]$
\item $\text{Cov}_{XY}[\bm{x}, \bm{y}] = \nullvec$
\end{itemize}
\end{theorem}
\subsection{Conditional independence}
\marginnote{Conditional independence}
Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
\[ p(\bm{x}, \bm{y} \vert \bm{z}) = p(\bm{x} \vert \bm{z}) p(\bm{y} \vert \bm{z}) \, \forall \bm{z} \in \mathcal{T}_Z \]
\subsection{Inner product}
\marginnote{Inner product of random variables}
Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
\[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
The covariance matrix is symmetric positive definite.
Moreover, we have that:
\begin{itemize}
\item $\Vert X \Vert = \sqrt{\langle X, X \rangle} = \sqrt{\text{Cov}[x, x]} = \sqrt{\mathbb{V}[x]} = \sigma[x]$
\item
$\cos\theta = \frac{\langle X, Y \rangle}{\Vert X \Vert \cdot \Vert Y \Vert} =
\frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}}$, where $\theta$ is the angle between $X$ and $Y$.
\item $X \perp Y \iff \langle X, Y \rangle = 0 \iff \text{Cov}[x, y] = 0 \iff X \text{ and } Y \text{ uncorrelated}$
\end{itemize}
\section{Common distributions}
\subsection{Discrete random variables}
\begin{descriptionlist}
\item[Uniform distribution] \marginnote{Uniform distribution}
Given a discrete random variable $X$ with $\vert \mathcal{T}_X \vert = N$,
$X$ has a uniform distribution if:
\[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
\item[Poisson distribution] \marginnote{Poisson distribution}
Given a discrete random variable $X$ with mean $\lambda > 0$,
$X$ has a Poisson distribution if:
\[ p_X(x) = e^{-\lambda} \frac{\lambda^x}{x!}, \forall x \in \mathcal{T}_X \]
A Poisson distribution has $\mathbb{E}[x] = \lambda$ and $\mathbb{V}[x] = \lambda$.
\end{descriptionlist}
\subsection{Continuous random variables}
\begin{descriptionlist}
\item[Continuous uniform distribution] \marginnote{Continuous uniform distribution}
Given a continuous random variable $X$ with $\mathcal{T}_X = [a, b]$,
$X$ has a continuous uniform distribution if:
\[ p_X(x) = \frac{1}{b-a}, \forall x \in \mathcal{T}_X \]
\item[Normal distribution] \marginnote{Normal distribution}
Given a continuous random variable $X$ and the parameters $\mu$ (mean) and $\sigma^2$ (variance).
$X$ has a normal distribution if:
\[ p_X(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{\frac{-(x-\mu)^2}{2\sigma^2}} , \forall x \in \mathcal{T}_X\]
In the multivariate case, it is defined as:
\[
p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}, \matr{\Sigma}) =
(2\pi)^{-\frac{D}{2}} \vert \matr{\Sigma} \vert^{-\frac{1}{2}} e^{(-\frac{1}{2}(\bm{x} - \bm{\mu})^T\matr{\Sigma}^{-1}(\bm{x}-\bm{\mu}))}
\in \mathbb{R}
\]
where $\bm{\mu}$ is the mean vector and $\matr{\Sigma}$ the covariance matrix.
\begin{description}
\item[Standard normal distribution] \marginnote{Standard normal distribution}
Normal distribution with $\mu = 0$ and $\sigma = 1$ (univariate) or
$\bm{\mu} = \nullvec$ and $\matr{\Sigma} = \matr{I}$ (multivariate).
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.40\textwidth]{img/normal_distribution.png}
\caption{Normal distributions and standard normal distribution}
\end{figure}
\begin{theorem}[Linearity]
\marginnote{Gaussian sum and linear transformations}
Given $X$ and $Y$ independent Gaussian random variables with
$p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}_x, \matr{\Sigma}_x)$ and
$p(\bm{y}) = \mathcal{N}(\bm{y} \vert \bm{\mu}_y, \matr{\Sigma}_y)$.
It holds that (a Monte Carlo check is sketched after this list):
\[ p(a\bm{x} + b\bm{y}) = \mathcal{N}(a\bm{\mu}_x + b\bm{\mu}_y, a^2\matr{\Sigma}_x + b^2\matr{\Sigma}_y) \]
\end{theorem}
\end{descriptionlist}
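A Monte Carlo check of the linearity theorem (the parameters are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
a, b = 2.0, -1.0
x = rng.normal(loc=1.0, scale=2.0, size=100_000)  # mu=1, sigma^2=4
y = rng.normal(loc=3.0, scale=1.0, size=100_000)  # mu=3, sigma^2=1

z = a * x + b * y
# Theory: mean = 2*1 - 1*3 = -1, variance = 4*4 + 1*1 = 17
print(z.mean(), z.var())  # ~ -1.0, ~ 17.0
\end{verbatim}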

View File

@ -0,0 +1,360 @@
\chapter{Vector calculus}
\section{Gradient of real-valued multivariate functions}
\begin{description}
\item[Gradient] \marginnote{Gradient}
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
the gradient is a row vector containing the partial derivatives of $f$:
\[
\nabla f(\vec{x}) =
\begin{pmatrix}
\frac{\partial f(\vec{x})}{\partial x_1} & \frac{\partial f(\vec{x})}{\partial x_2} & \dots & \frac{\partial f(\vec{x})}{\partial x_n}
\end{pmatrix}
\in \mathbb{R}^{1 \times n}
\]
\item[Hessian] \marginnote{Hessian matrix}
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
the Hessian matrix $\matr{H} \in \mathbb{R}^{n \times n}$ contains the second derivatives of $f$:
\[
\matr{H} =
\begin{pmatrix}
\frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \dots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\
\frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \dots & \vdots \\
\vdots & \vdots & \ddots & \vdots \\
\frac{\partial^2 f}{\partial x_n \partial x_1} & \dots & \dots & \frac{\partial^2 f}{\partial x_n^2}
\end{pmatrix}
\]
In other words, $H_{i,j} = \frac{\partial^2 f}{\partial x_i \partial x_j}$.
Moreover, $\matr{H}$ is symmetric when the second derivatives are continuous
(a finite-difference check of the gradient is sketched after this list).
\end{description}
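A finite-difference check of an analytic gradient (the function is illustrative):
\begin{verbatim}
import numpy as np

def f(x):       # f(x1, x2) = x1^2 x2 + sin(x2)
    return x[0] ** 2 * x[1] + np.sin(x[1])

def grad_f(x):  # analytic gradient
    return np.array([2 * x[0] * x[1], x[0] ** 2 + np.cos(x[1])])

x0 = np.array([1.0, 0.5])
h = 1e-6        # central differences along each coordinate
fd = np.array([(f(x0 + h * e) - f(x0 - h * e)) / (2 * h)
               for e in np.eye(2)])
assert np.allclose(fd, grad_f(x0), atol=1e-5)
\end{verbatim}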
\subsection{Partial differentiation rules}
\begin{description}
\item[Product rule] \marginnote{Product rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x})g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} g(\vec{x}) + f(\vec{x}) \frac{\partial g}{\partial \vec{x}}
\]
\item[Sum rule] \marginnote{Sum rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x}) + g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} + \frac{\partial g}{\partial \vec{x}}
\]
\item[Chain rule] \marginnote{Chain rule}
Let $f: \mathbb{R}^n \rightarrow \mathbb{R}$ and $\vec{g}$ a vector of $n$ functions $g_i: \mathbb{R}^m \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f \circ \vec{g})(\vec{x}) =
\frac{\partial}{\partial \vec{x}} \Big( f(\vec{g}(\vec{x})) \Big) =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial \vec{x}}
\]
For instance, consider a $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables
$g_1(t), g_2(t): \mathbb{R} \rightarrow \mathbb{R}$ that are functions of $t$.
The gradient of $f$ with respect to $t$ is:
\[
\frac{\text{d}f}{\text{d}t} =
% \frac{\partial f}{\partial (g_1, g_2)} \frac{\partial (g_1, g_2)}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
= \frac{\partial f}{\partial g_1} \frac{\partial g_1}{\partial t} + \frac{\partial f}{\partial g_2} \frac{\partial g_2}{\partial t}
\]
In other words, the first matrix represents the gradient of $f$ w.r.t. its variables and
the second matrix contains in the $i$-th row the gradient of $g_i$.
Therefore, if $g_i$ are in turn multivariate functions $g_1(s, t), g_2(s, t): \mathbb{R}^2 \rightarrow \mathbb{R}$,
the chain rule can be applied as follows:
\[
\frac{\text{d}f}{\text{d}(s, t)} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial s} & \frac{\partial g_1}{\partial t} \\
\frac{\partial g_2}{\partial s} & \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\begin{example}
Let $f(x_1, x_2) = x_1^2 + 2x_2$, where $x_1 = \sin(t)$ and $x_2 = \cos(t)$.
\[
\begin{split}
\frac{\text{d}f}{\text{d}t} & =
\frac{\partial f}{\partial x_1}\frac{\partial x_1}{\partial t} + \frac{\partial f}{\partial x_2}\frac{\partial x_2}{\partial t} \\
& = (2x_1)(\cos(t)) + (2)(-\sin(t)) \\
& = 2\sin(t)\cos(t) - 2\sin(t)
\end{split}
\]
\end{example}
\begin{example}
Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ \vec{g})(t) = f(\vec{g}(t))$ where:
\[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(g_1, g_2) = \exp(g_1 g_2^2) \]
\[
\vec{g}: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as }
\vec{g}(t) = \begin{pmatrix} g_1 \\ g_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
\]
The gradient of $h$ with respect to $t$ can be computed as:
\[
\frac{\text{d} h}{\text{d} t} =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\[
=
\begin{pmatrix} \exp(g_1 g_2^2)g_2^2 & 2\exp(g_1 g_2^2)g_1 g_2 \end{pmatrix}
\begin{pmatrix} \cos(t) + (-t\sin(t)) \\ \sin(t) + t\cos(t) \end{pmatrix}
\]
\end{example}
\begin{example}[Gradient of a least squares loss] \marginnote{Least squares loss gradient}
Given a linear model defined on $\vec{\uptheta}$:
\[ \vec{y} = \matr{\Phi}\vec{\uptheta} \]
with $\vec{\uptheta} \in \mathbb{R}^D$, $\matr{\Phi} \in \mathbb{R}^{N \times D}$ and $\vec{y} \in \mathbb{R}^N$.
We can define the least squares loss function as:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 \]
\[ \vec{e}(\vec{\uptheta}) = \vec{y} - \matr{\Phi}\vec{\uptheta} \]
It must be noted that:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 = \vec{e}^T\vec{e} = \sum_{i=1}^{N} \vec{e}_i^2 \]
To compute the gradient of $L$ with respect to $\vec{\uptheta}$, we can use the chain rule:
\[
\begin{split}
\nabla L(\vec{\uptheta}) &= \frac{\partial L}{\partial \vec{e}} \frac{\partial \vec{e}}{\partial \vec{\uptheta}}
= (2\vec{e}^T) (-\matr{\Phi}) \\
& = -2(\vec{y}^T - \vec{\uptheta}^T \matr{\Phi}^T)\matr{\Phi} \\
& = -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi})
\end{split}
\]
Note that if we enforce $\nabla L(\vec{\uptheta}) = \nullvec$, we obtain the normal equation of \Cref{sec:lls}:
\[
\begin{split}
\nabla L = 0 &\iff -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi}) = \nullvec \\
&\iff \vec{y}^T \matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi} = \nullvec \\
&\iff \matr{\Phi}^T \vec{y} - \matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \nullvec
\end{split}
\]
\end{example}
\end{description}
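Using the gradient derived above, plain gradient descent reaches the normal-equation solution. A minimal sketch with synthetic data and a hand-picked step size:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 100, 3
Phi = rng.normal(size=(N, D))
y = Phi @ np.array([1.0, -1.0, 2.0]) + 0.05 * rng.normal(size=N)

theta = np.zeros(D)
lr = 1e-3
for _ in range(5_000):
    grad = -2 * (Phi.T @ y - Phi.T @ Phi @ theta)  # gradient of L
    theta -= lr * grad

# Matches the solution of Phi^T Phi theta = Phi^T y.
assert np.allclose(theta, np.linalg.solve(Phi.T @ Phi, Phi.T @ y),
                   atol=1e-5)
\end{verbatim}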
\section{Gradient of vector-valued multivariate functions}
\begin{description}
\item[Vector-valued function]
Function $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$ with $n \geq 1$ and $m > 1$.
Given $\vec{x} \in \mathbb{R}^n$, the output can be represented as:
\[
\vec{f}(\vec{x}) =
\begin{pmatrix}
f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})
\end{pmatrix} \in \mathbb{R}^m
\]
where $f_i: \mathbb{R}^n \rightarrow \mathbb{R}$.
\item[Jacobian] \marginnote{Jacobian matrix}
Given $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$, the Jacobian matrix $\matr{J} \in \mathbb{R}^{m \times n}$
contains the first-order derivatives of $\vec{f}$:
\[
\matr{J} = \nabla\vec{f}(\vec{x}) =
\begin{pmatrix}
\frac{\partial \vec{f}(\vec{x})}{\partial x_1} & \dots & \frac{\partial \vec{f}(\vec{x})}{\partial x_n}
\end{pmatrix} =
\begin{pmatrix}
\frac{\partial f_1(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_1(\vec{x})}{\partial x_n} \\
\vdots & \ddots & \vdots \\
\frac{\partial f_m(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_m(\vec{x})}{\partial x_n} \\
\end{pmatrix}
\]
In other words, $J_{i,j} = \frac{\partial f_i}{\partial x_j}$.
Note that the Jacobian matrix is a generalization of the gradient in the real-valued case.
\end{description}
\section{Backpropagation}
\marginnote{Backpropagation}
Backpropagation is used to tune the parameters of a neural network.
A neural network can be seen as a composition of many functions:
\[ \vec{y} = (\vec{f}_K \circ \vec{f}_{K-1} \circ \dots \circ \vec{f}_1)(\vec{x}) = \vec{f}_K(\vec{f}_{K-1}(\cdots \vec{f}_1(\vec{x}) \cdots)) \]
Each $\vec{f}_i$ takes as input the output of the previous layer $\vec{x}_{i-1}$ and has the form:
\[ \vec{f}_i(\vec{x}_{i-1}) = \sigma_i(\matr{A}_{i-1}\vec{x}_{i-1} + \vec{b}_{i-1}) \]
where $\sigma_i$ is an activation function\footnote{\url{https://en.wikipedia.org/wiki/Activation_function}} (a function to add nonlinearity),
while $\matr{A}_{i-1}$ (linear mapping) and $\vec{b}_{i-1}$ (biases) are the parameters of $\vec{f}_i$.
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_forward_pass.pdf}
\caption{Forward pass}
\end{figure}
We can more compactly denote a neural network with input $\vec{x}$ and $K$ layers as:
\[
\begin{split}
\vec{f}_0 &= \vec{x} \\
\vec{f}_i &= \sigma_i(\matr{A}_{i-1} \vec{f}_{i-1} + \vec{b}_{i-1}) \text{ } i=1, \dots, K
\end{split}
\]
Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimize the squared loss:
\[ L(\vec{\uptheta}) = \Vert \vec{y} - \vec{f}_K(\vec{\uptheta}, \vec{x}) \Vert^2 \]
where $\vec{\uptheta} = \{ \matr{A}_{0}, \vec{b}_{0}, \dots, \matr{A}_{K-1}, \vec{b}_{K-1} \}$ are the parameters of each layer.
This can be done by using the chain rule to compute the partial derivatives of $L$ with respect to the parameters $\vec{\uptheta}_j = \{ \matr{A}_j, \vec{b}_j \}$:
\[
\begin{split}
\frac{\partial L}{\partial \vec{\uptheta}_{K-1}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{\uptheta}_{K-1}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-2}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \frac{\partial \vec{f}_{K-1}}{\partial \vec{\uptheta}_{K-2}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-3}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{K-1}}{\partial \vec{f}_{K-2}} \frac{\partial \vec{f}_{K-2}}{\partial \vec{\uptheta}_{K-3}}}^{\mathclap{\text{New}}} \\
\vdots \\
\frac{\partial L}{\partial \vec{\uptheta}_{i}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \dots}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{i+2}}{\partial \vec{f}_{i+1}} \frac{\partial \vec{f}_{i+1}}{\partial \vec{\uptheta}_{i}}}^{\mathclap{\text{New}}}
\end{split}
\]
\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_backward_pass.pdf}
\caption{Backward pass}
\end{figure}
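A minimal backward pass for a two-layer network with a sigmoid activation (shapes and data are illustrative):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

rng = np.random.default_rng(0)
x, y = rng.normal(size=3), rng.normal(size=2)
A0, b0 = rng.normal(size=(4, 3)), np.zeros(4)
A1, b1 = rng.normal(size=(2, 4)), np.zeros(2)

# Forward pass (K = 2).
h = sigmoid(A0 @ x + b0)
y_hat = A1 @ h + b1
loss = np.sum((y - y_hat) ** 2)

# Backward pass: reuse the factors shared across layers.
dL_dyhat = -2 * (y - y_hat)
dL_dA1, dL_db1 = np.outer(dL_dyhat, h), dL_dyhat
dL_dh = A1.T @ dL_dyhat
dL_dz0 = dL_dh * h * (1 - h)      # sigmoid'(z) = s(z)(1 - s(z))
dL_dA0, dL_db0 = np.outer(dL_dz0, x), dL_dz0
\end{verbatim}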
\section{Automatic differentiation}
It is recommended to read the example at the end of this section first.\\
\marginnote{Automatic differentiation}
Automatic differentiation makes it possible to numerically compute
the gradient of complex functions by combining elementary functions, intermediate variables and the chain rule through a computation graph.
When the gradient has many components, it also allows computing it more efficiently.
Let $f$ be a function,
$x_1, \dots, x_d$ the input variables of $f$,
$x_{d+1}, \dots, x_{D-1}$ the intermediate variables and
$x_D$ the output variable.
The computation graph can be expressed as:
\[
\forall i \in \{ d+1, \dots, D \}: x_i = g_i(x_{\text{Pa}(x_i)})
\]
where $g_i$ are elementary functions and $x_{\text{Pa}(x_i)}$ are the parent nodes of $x_i$ in the graph.
In other words, each intermediate variable is expressed as an elementary function of its preceding nodes.
The derivatives of $f$ can then be computed step-by-step going backward as:
\[ \frac{\partial f}{\partial x_D} = 1 \text{, as by definition } f = x_D \]
\[
\frac{\partial f}{\partial x_i} = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial x_c}{\partial x_i}
= \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial g_c}{\partial x_i}
\]
where $\text{Pa}(x_c)$ is the set of parent nodes of $x_c$ in the graph.
In other words, to compute the partial derivative of $f$ w.r.t. $x_i$,
we apply the chain rule by computing
the partial derivative of $f$ w.r.t. the variables following $x_i$ in the graph (as the computation goes backward).
Automatic differentiation is applicable to any function that can be expressed as a computation graph
whose elementary functions are differentiable.
Note that backpropagation is a special case of automatic differentiation.
\begin{example}
Given the function:
\[ f(x) = \sqrt{x^2 + \exp(x^2)} + \cos(x^2 + \exp(x^2)) \]
and the elementary functions $\{ (\cdot)^2, \exp(\cdot), +, \sqrt{\cdot}, \cos(\cdot) \}$,
$f$ can be decomposed in the following intermediate variables:\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
a &= x^2 \\
b &= \exp(a) \\
c &= a + b \\
d &= \sqrt{c} \\
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
e &= \cos(c) \\
f &= d + e \\
\end{split}
\]
\end{minipage}\\
Which corresponds to the following computation graph:
\begin{center}
\includegraphics[width=0.75\textwidth]{img/auto_diff.png}
\end{center}
We can then compute the derivatives of the intermediate variables w.r.t. their inputs (i.e. inbound edges):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial a}{\partial x} &= 2x \\
\frac{\partial b}{\partial a} &= \exp(a) \\
\frac{\partial c}{\partial a} &= 1 \\
\frac{\partial c}{\partial b} &= 1
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial d}{\partial c} &= \frac{1}{2\sqrt{c}} \\
\frac{\partial e}{\partial c} &= -\sin(c) \\
\frac{\partial f}{\partial d} &= 1 \\
\frac{\partial f}{\partial e} &= 1
\end{split}
\]
\end{minipage}\\
Finally, we can compute $\frac{\partial f}{\partial x}$ by going backward from the output ($f$) to the input ($x$):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial d} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial e} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial c} &=
\frac{\partial f}{\partial d}\frac{\partial d}{\partial c} + \frac{\partial f}{\partial e}\frac{\partial e}{\partial c} \\
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c}\frac{\partial c}{\partial b} \\
\frac{\partial f}{\partial a} &=
\frac{\partial f}{\partial b}\frac{\partial b}{\partial a} + \frac{\partial f}{\partial c}\frac{\partial c}{\partial a} \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a}\frac{\partial a}{\partial x}
\end{split}
\]
\end{minipage}\\
In other words, to compute the partial derivative of $f$ w.r.t. a variable $x_i$,
all variables that follow $x_i$ in the graph are considered.
Now, by substituting we obtain:
\[
\begin{split}
\frac{\partial f}{\partial c} &= 1 \cdot \frac{1}{2\sqrt{c}} + 1 \cdot (-\sin(c)) \\
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial a} &= \frac{\partial f}{\partial b} \cdot \exp(a) + \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a} \cdot 2x
\end{split}
\]
\end{example}
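The example above translates directly into code. A sketch of the forward and backward passes, checked against central differences:
\begin{verbatim}
import numpy as np

def f_and_grad(x):
    # Forward pass: the intermediate variables of the graph.
    a = x ** 2
    b = np.exp(a)
    c = a + b
    d = np.sqrt(c)
    e = np.cos(c)
    f = d + e
    # Backward pass: sum the contributions of each node's children.
    df_dd, df_de = 1.0, 1.0
    df_dc = df_dd / (2 * np.sqrt(c)) + df_de * (-np.sin(c))
    df_db = df_dc * 1.0
    df_da = df_db * np.exp(a) + df_dc * 1.0
    return f, df_da * 2 * x

val, grad = f_and_grad(1.5)
h = 1e-6
fd = (f_and_grad(1.5 + h)[0] - f_and_grad(1.5 - h)[0]) / (2 * h)
assert np.isclose(grad, fd, atol=1e-4)
\end{verbatim}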

View File

@ -0,0 +1,21 @@
\documentclass[11pt]{ainotes}
\title{Statistical and Mathematical Methods for Artificial Intelligence}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
\begin{document}
\makenotesfront
\input{sections/_finite_numbers.tex}
\input{sections/_linear_algebra.tex}
\input{sections/_linear_systems.tex}
\input{sections/_matrix_decomp.tex}
\input{sections/_vector_calculus.tex}
\input{sections/_gradient_methods.tex}
\input{sections/_probability.tex}
\input{sections/_machine_learning.tex}
\eoc
\end{document}