Moved SMM into year1
@@ -0,0 +1 @@
../../ainotes.cls
@@ -0,0 +1,88 @@
<mxfile host="app.diagrams.net" modified="2023-09-22T09:37:27.395Z" agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0" etag="3qzh6VvLSaXopiRghqnY" version="21.7.0" type="device">
  <diagram name="Pagina-1" id="mETDQKEhh33VIil_YAIY">
    <mxGraphModel dx="819" dy="401" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
      <root>
        <mxCell id="0" />
        <mxCell id="1" parent="0" />
        <mxCell id="AFoxFzemWGuV3oYDkwgm-1" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
          <mxGeometry x="200" y="300" width="150" height="150" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-2" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
          <mxGeometry x="280" y="340" width="10" height="10" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-3" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
          <mxGeometry x="280" y="400" width="10" height="10" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-4" value="U&nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
          <mxGeometry x="240" y="330" width="40" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-5" value="<div align="right"><font style="font-size: 15px;" face="Times New Roman">&nbsp;U</font><font style="font-size: 15px;" face="Times New Roman">+ΔU&nbsp; <br></font></div>" style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
          <mxGeometry x="230" y="390" width="50" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-6" value="" style="ellipse;whiteSpace=wrap;html=1;" parent="1" vertex="1">
          <mxGeometry x="420" y="300" width="150" height="150" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-7" value="" style="ellipse;fillStyle=auto;fillColor=#99CCFF;" parent="1" vertex="1">
          <mxGeometry x="480" y="340" width="10" height="10" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-8" value="" style="ellipse;fillColor=#99CCFF;" parent="1" vertex="1">
          <mxGeometry x="480" y="400" width="10" height="10" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-9" value="&nbsp;V" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
          <mxGeometry x="490" y="330" width="50" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-10" value="<div align="left"><font style="font-size: 15px;" face="Times New Roman">&nbsp;V</font><font style="font-size: 15px;" face="Times New Roman">+ΔV&nbsp; <br></font></div>" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
          <mxGeometry x="490" y="390" width="50" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-11" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-2" target="AFoxFzemWGuV3oYDkwgm-7" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
            <mxPoint x="410" y="420" as="sourcePoint" />
            <mxPoint x="460" y="370" as="targetPoint" />
            <Array as="points">
              <mxPoint x="390" y="310" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-12" value="" style="endArrow=classic;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;curved=1;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
            <mxPoint x="300" y="355" as="sourcePoint" />
            <mxPoint x="530" y="355" as="targetPoint" />
            <Array as="points">
              <mxPoint x="390" y="360" />
            </Array>
          </mxGeometry>
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-13" value="<font face="Times New Roman" size="1"><i><font style="font-size: 15px;">f</font></i></font>" style="text;html=1;strokeColor=none;fillColor=none;align=center;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
          <mxGeometry x="330" y="290" width="110" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-15" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-7" target="AFoxFzemWGuV3oYDkwgm-8" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
            <mxPoint x="450" y="380" as="sourcePoint" />
            <mxPoint x="500" y="330" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-16" value="" style="endArrow=none;dashed=1;html=1;dashPattern=1 3;strokeWidth=2;rounded=0;entryX=0.5;entryY=1;entryDx=0;entryDy=0;exitX=0.5;exitY=0;exitDx=0;exitDy=0;" parent="1" source="AFoxFzemWGuV3oYDkwgm-3" target="AFoxFzemWGuV3oYDkwgm-2" edge="1">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
            <mxPoint x="270" y="410" as="sourcePoint" />
            <mxPoint x="320" y="360" as="targetPoint" />
          </mxGeometry>
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-17" value="<font style="font-size: 15px;" face="Times New Roman">Δ</font>U&nbsp; " style="text;html=1;strokeColor=none;fillColor=none;align=right;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontFamily=Times New Roman;fontSize=15;" parent="1" vertex="1">
          <mxGeometry x="240" y="360" width="40" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-18" value="<div align="left"><font style="font-size: 15px;" face="Times New Roman">&nbsp;</font><font style="font-size: 15px;" face="Times New Roman">ΔV&nbsp; <br></font></div>" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
          <mxGeometry x="490" y="360" width="30" height="30" as="geometry" />
        </mxCell>
        <mxCell id="AFoxFzemWGuV3oYDkwgm-19" value="<div align="left"><font style="font-size: 15px;" face="Times New Roman">Inherent error </font></div>" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;" parent="1" vertex="1">
          <mxGeometry x="580" y="355" width="90" height="40" as="geometry" />
        </mxCell>
        <mxCell id="x--qwbr77Wqyja1BnvlK-2" value="" style="endArrow=classic;html=1;rounded=0;entryX=1;entryY=0.5;entryDx=0;entryDy=0;exitX=0;exitY=0.5;exitDx=0;exitDy=0;strokeWidth=2;" edge="1" parent="1" source="AFoxFzemWGuV3oYDkwgm-19" target="AFoxFzemWGuV3oYDkwgm-18">
          <mxGeometry width="50" height="50" relative="1" as="geometry">
            <mxPoint x="600" y="375" as="sourcePoint" />
            <mxPoint x="450" y="370" as="targetPoint" />
          </mxGeometry>
        </mxCell>
      </root>
    </mxGraphModel>
  </diagram>
</mxfile>
@@ -0,0 +1,11 @@
{
    "name": "Statistical and Mathematical Methods for Artificial Intelligence",
    "year": 1,
    "semester": 1,
    "pdfs": [
        {
            "name": null,
            "path": "smm.pdf"
        }
    ]
}
@@ -0,0 +1,208 @@
\chapter{Finite numbers}


\section{Sources of error}

\begin{description}
    \item[Measurement error] \marginnote{Measurement error}
    Due to the limited precision of the measuring instrument.

    \item[Arithmetic error] \marginnote{Arithmetic error}
    Propagation of rounding errors in each step of an algorithm.

    \item[Truncation error] \marginnote{Truncation error}
    Due to approximating an infinite procedure with a finite number of iterations.

    \item[Inherent error] \marginnote{Inherent error}
    Caused by the finite representation of the data (floating-point).
    \begin{figure}[h]
        \centering
        \includegraphics[width=0.6\textwidth]{img/_inherent_error.pdf}
        \caption{Inherent error visualization}
    \end{figure}
\end{description}


\section{Error measurement}

Let $x$ be a value and $\hat{x}$ its approximation. Then:
\begin{descriptionlist}
    \item[Absolute error]
    \[
        E_{a} = \hat{x} - x
        \marginnote{Absolute error}
    \]
    Note that, without the context of the magnitude of $x$, the absolute error is not very informative.
    \item[Relative error]
    \[
        E_{r} = \frac{\hat{x} - x}{x}
        \marginnote{Relative error}
    \]
\end{descriptionlist}
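A minimal Python sketch of the two error measures (the values are arbitrary illustrative choices):
\begin{lstlisting}[language=Python]
# Absolute and relative error of an approximation x_hat of x.
x = 1 / 3        # exact value
x_hat = 0.333    # approximation

abs_err = x_hat - x          # E_a
rel_err = (x_hat - x) / x    # E_r

print(abs_err)   # ~ -3.3e-04: hard to judge without knowing the magnitude of x
print(rel_err)   # ~ -1.0e-03: about three correct significant digits
\end{lstlisting}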


\section{Representation in base \texorpdfstring{$\beta$}{B}}

Let $\beta \in \mathbb{N}_{> 1}$ be the base.
Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
\begin{equation}
    \label{eq:finnum_b_representation}
    x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + d_3\beta^{-3} + \dots)\beta^p
\end{equation}
where:
\begin{itemize}
    \item $0 \leq d_i \leq \beta-1$
    \item $d_1 \neq 0$
    \item there is no index $i$ such that all the digits $d_j$ with $j \geq i$ are equal to $\beta-1$
\end{itemize}
%
\Cref{eq:finnum_b_representation} can be represented using the normalized scientific notation as: \marginnote{Normalized scientific notation}
\[
    x = \pm (0.d_1d_2\dots) \beta^p
\]
where $0.d_1d_2\dots$ is the \textbf{mantissa} and $\beta^p$ the \textbf{exponent}. \marginnote{Mantissa\\Exponent}


\section{Floating-point}
A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the parameters: \marginnote{Floating-point}
\begin{itemize}
    \item $\beta$: base
    \item $t$: precision (number of digits in the mantissa)
    \item $[L, U]$: range of the exponent
\end{itemize}

Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
\begin{eqnarray}
    x = \pm (0.d_1d_2 \dots d_t) \beta^p & L \leq p \leq U
\end{eqnarray}
We denote with $\texttt{fl}(x)$ the representation of $x \in \mathbb{R}$ in a given floating-point system.

\begin{example}
    In $\mathcal{F}(10, 5, -3, 3)$, $x=12.\bar{3}$ is represented as:
    \begin{equation*}
        \texttt{fl}(x) = + 0.12333 \cdot 10^2
    \end{equation*}
\end{example}


\subsection{Numbers distribution}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the total amount of representable numbers is:
\begin{equation*}
    2(\beta-1) \beta^{t-1} (U-L+1)+1
\end{equation*}
%
Representable numbers are sparser towards the upper bound of the exponent and denser towards the lower bound.
It must be noted that there is an underflow area around 0.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth]{img/floatingpoint_range.png}
    \caption{Floating-point numbers in $\mathcal{F}(2, 3, -1, 2)$}
\end{figure}


\subsection{Number representation}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
\begin{descriptionlist}
    \item[Exact representation]
    if $p \in [L, U]$ and $d_i=0$ for $i>t$.

    \item[Approximation] \marginnote{Truncation\\Rounding}
    if $p \in [L, U]$ but $d_i$ may not be 0 for $i>t$.
    In this case, the representation is obtained by truncating or rounding the value.

    \item[Underflow] \marginnote{Underflow}
    if $p < L$. In this case, the value is approximated to 0.

    \item[Overflow] \marginnote{Overflow}
    if $p > U$. In this case, an exception is usually raised.
\end{descriptionlist}


\subsection{Machine precision}
Machine precision $\varepsilon_{\text{mach}}$ determines the accuracy of a floating-point system. \marginnote{Machine precision}
Depending on the approximation approach, machine precision can be computed as:
\begin{descriptionlist}
    \item[Truncation] $\varepsilon_{\text{mach}} = \beta^{1-t}$
    \item[Rounding] $\varepsilon_{\text{mach}} = \frac{1}{2}\beta^{1-t}$
\end{descriptionlist}
Therefore, rounding results in more accurate representations.

$\varepsilon_{\text{mach}}$ measures the relative spacing of the representable numbers: with truncation, it is the distance between 1 and the next representable number (\Cref{fig:finnum_eps}).
\begin{figure}[h]
    \centering
    \includegraphics[width=0.2\textwidth]{img/machine_eps.png}
    \caption{Visualization of $\varepsilon_{\text{mach}}$ in $\mathcal{F}(2, 3, -1, 2)$}
    \label{fig:finnum_eps}
\end{figure}
%
Alternatively, $\varepsilon_{\text{mach}}$ can be defined as the smallest representable number such that:
\begin{equation*}
    \texttt{fl}(1 + \varepsilon_{\text{mach}}) > 1.
\end{equation*}
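As a quick sketch (assuming \texttt{NumPy} is available), the machine precision of \texttt{float64} ($\beta = 2$, $t = 53$) can be estimated directly from this last definition:
\begin{lstlisting}[language=Python]
import numpy as np

# Smallest power of two eps such that fl(1 + eps) > 1 in float64.
eps = 1.0
while 1.0 + eps / 2 > 1.0:
    eps = eps / 2

print(eps)                       # 2.220446049250313e-16 = 2**(-52)
print(np.finfo(np.float64).eps)  # NumPy reports the same value
\end{lstlisting}
Note that NumPy reports the spacing $\beta^{1-t} = 2^{-52}$ between 1 and the next representable number, which is twice the rounding value $\frac{1}{2}\beta^{1-t}$ given above.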


\subsection{IEEE standard}
IEEE 754 defines two floating-point formats:
\begin{descriptionlist}
    \item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -126, 127)$. \marginnote{\texttt{float32}}
    \begin{center}
        \small
        \begin{tabular}{|c|c|c|}
            \hline
            1 (sign) & 8 (exponent) & 23 (mantissa) \\
            \hline
        \end{tabular}
    \end{center}

    \item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1022, 1023)$. \marginnote{\texttt{float64}}
    \begin{center}
        \small
        \begin{tabular}{|c|c|c|}
            \hline
            1 (sign) & 11 (exponent) & 52 (mantissa) \\
            \hline
        \end{tabular}
    \end{center}
\end{descriptionlist}
As the first digit of the mantissa is always 1, it does not need to be stored.
Moreover, special configurations of the exponent are reserved to represent \texttt{Inf} and \texttt{NaN}.


\subsection{Floating-point arithmetic}
Let:
\begin{itemize}
    \item $+: \mathbb{R} \times \mathbb{R} \rightarrow \mathbb{R}$ be an operation on real numbers.
    \item $\oplus: \mathcal{F} \times \mathcal{F} \rightarrow \mathcal{F}$ be the corresponding operation in a floating-point system.
\end{itemize}
%
To compute $x \oplus y$, a machine:
\begin{enumerate}
    \item Calculates $x + y$ in a higher precision register
    (still approximated, but more precise than the floating-point system used to store the result)
    \item Stores the result as $\texttt{fl}(x + y)$
\end{enumerate}

A floating-point operation causes a small rounding error:
\[
    \left\vert \frac{(x \oplus y) - (x + y)}{x+y} \right\vert < \varepsilon_{\text{mach}}
\]
%
However, some operations may be subject to the \textbf{cancellation} problem, which causes the loss of significant digits.
\marginnote{Cancellation}
\begin{example}
    Given $x = 1$ and $y = 1 \cdot 10^{-17}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, L, U)$.
    It is assumed that $L$ and $U$ are sufficient for this example.
    \begin{equation*}
        \begin{split}
            z & = \texttt{fl}(x) + \texttt{fl}(y) \\
            & = 0.1 \cdot 10^1 + 0.1 \cdot 10^{-16} \\
            & = (0.1 + 0.\overbrace{0\dots0}^{\mathclap{16\text{ zeros}}}1) \cdot 10^1 \\
            & = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}}1 \cdot 10^1
        \end{split}
    \end{equation*}
    Then, we have that $\texttt{fl}(z) = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}} \cdot 10^1 = 1 = x$: the contribution of $y$ is completely lost.
\end{example}
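This behaviour can be checked directly in \texttt{float64} arithmetic (a minimal sketch; the second part also shows how subtracting nearly equal numbers exposes the rounding error committed in a previous operation):
\begin{lstlisting}[language=Python]
# Absorption: the small addend is lost entirely.
x, y = 1.0, 1e-17
print(x + y == x)        # True: fl(1 + 1e-17) = 1

# Subtracting nearly equal numbers exposes the previous rounding error.
delta = 2e-16
print((x + delta) - x)   # 2.220446049250313e-16, not 2e-16
\end{lstlisting}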
@@ -0,0 +1,342 @@
\chapter{Gradient methods}


\section{Minimum of a function}

Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
\begin{descriptionlist}
    \item[Stationary point] \marginnote{Stationary point}
    $\vec{x}^*$ is a stationary point of $f$ iff:
    \[ \nabla f(\vec{x}^*) = \nullvec \]

    \item[Local minimum] \marginnote{Local minimum}
    $\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff:
    \[ \exists \varepsilon > 0 \text{ s.t. }
       f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]

    \item[Strict local minimum] \marginnote{Strict local minimum}
    $\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff:
    \[ \exists \varepsilon > 0 \text{ s.t. }
       f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: 0 < \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]

    \item[Global minimum] \marginnote{Global minimum}
    $\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff:
    \[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]

    \item[Strict global minimum] \marginnote{Strict global minimum}
    $\vec{x}^* \in \mathbb{R}^N$ is a strict global minimum of $f$ iff:
    \[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \smallsetminus \{ \vec{x}^* \} \]
\end{descriptionlist}

Note that $\max \{ f(\vec{x}) \} = -\min \{ -f(\vec{x}) \}$, so a maximization problem can always be recast as a minimization problem.


\subsection{Optimality conditions}

\begin{description}
    \item[First-order condition] \marginnote{First-order condition}
    Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$.
    \[ \text{If } \vec{x}^* \text{ local minimum of } f \Rightarrow \nabla f(\vec{x}^*) = \nullvec \]

    \item[Second-order condition] \marginnote{Second-order condition}
    Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and twice differentiable.
    \[
        \text{If } \nabla f(\vec{x}^*) = \nullvec \text{ and } \nabla^2 f(\vec{x}^*) \text{ positive definite} \Rightarrow
        \vec{x}^* \text{ strict local minimum of } f
    \]
\end{description}

As the second-order condition requires computing the Hessian matrix, which is expensive, in practice only the first-order condition is checked.


\section{Descent methods}

\marginnote{Descent methods}
Descent methods are iterative methods that have the property:
\[ f(\vec{x}_k) < f(\vec{x}_{k-1}) \]

The iteration is defined as:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
where $\vec{p}_{k-1} \in \mathbb{R}^N$ is the search direction and \marginnote{Search direction\\Step length}
$\alpha_{k-1} \in \mathbb{R}$ is the step length.

Note: descent methods usually converge to a local minimum.

\begin{figure}
    \centering
    \includegraphics[width=0.5\linewidth]{img/_gradient_contour.pdf}
    \caption{Descent method steps in $\mathbb{R}^2$ (i.e. moving across contour lines)}
\end{figure}


\subsection{Choice of the search direction}

\begin{description}
    \item[Descent direction] \marginnote{Descent direction}
    $\vec{p} \in \mathbb{R}^N$ is a descent direction of $f$ in $\vec{x}$ if:
    \[ \exists \bar{\alpha} > 0, \forall \alpha \in (0, \bar{\alpha}]: f(\vec{x} + \alpha \vec{p}) < f(\vec{x}) \]
\end{description}

\begin{theorem}
    Let $\vec{p} \in \mathbb{R}^N$, $\vec{p} \neq \nullvec$.
    \[ \text{If } \vec{p}^T \nabla f(\vec{x}) < 0 \Rightarrow \vec{p} \text{ descent direction of } f \text{ in } \vec{x} \]
\end{theorem}

\begin{theorem}
    For all $\vec{x}$ with $\nabla f(\vec{x}) \neq \nullvec$, $\vec{p} = -\nabla f(\vec{x})$ is a descent direction of $f$ in $\vec{x}$.
\end{theorem}
\begin{proof}
    \[
        \begin{split}
            \vec{p}^T \nabla f(\vec{x}) < 0 &\iff -(\nabla f(\vec{x}))^T \nabla f(\vec{x}) < 0 \\
            &\iff - \Vert \nabla f(\vec{x}) \Vert_2^2 < 0
        \end{split}
    \]
    This holds as the norm of a nonzero vector is strictly positive.
\end{proof}

\begin{description}
    \item[Gradient-like methods] \marginnote{Gradient-like methods}
    Gradient-like methods are descent methods that use $-\nabla f$ as search direction.
\end{description}


\subsection{Choice of the step length}
\begin{description}
    \item[Constant]
    In machine learning, it is common to set a constant value for the step length (the learning rate),
    but it can be proved that this does not guarantee convergence.

    \item[Backtracking procedure] \marginnote{Backtracking procedure}
    $\alpha_k$ is chosen such that it respects the sufficient decrease (Armijo) condition, the first of the Wolfe conditions\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
    \begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
def backtracking($\tau$, $c_1$):
    $\alpha_k$ = 1  # Initial guess
    while $f(\vec{x}_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ - $c_1 \alpha_k \Vert \nabla f(\vec{x}_k) \Vert_2^2$:
        $\alpha_k$ = $\alpha_k$ / $\tau$
    return $\alpha_k$
    \end{lstlisting}
    It can be proved that, by using the backtracking procedure, gradient methods converge to a local minimum.
\end{description}


\subsection{Stopping condition}
\marginnote{Stopping condition}
We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, when $\nabla f(\vec{x}_k) \approx \nullvec$.
We can verify this by checking the norm of the gradient against a tolerance $\tau$:
\begin{descriptionlist}
    \item[Absolute condition] $\Vert \nabla f(\vec{x}_k) \Vert_2 < \tau$
    \item[Relative condition] $\frac{\Vert \nabla f(\vec{x}_k) \Vert_2}{\Vert \nabla f(\vec{x}_0) \Vert_2} < \tau$
\end{descriptionlist}

A generic gradient-like method (a runnable sketch is given after the listing) can then be defined as:
\begin{lstlisting}[mathescape=true]
def gradientMethod($f$, $\vec{x}_0$):
    $k$ = 0
    while not stoppingCondition($f$, $\vec{x}_k$, $\vec{x}_0$):
        $\vec{p}_k$ = $-\nabla f(\vec{x}_k)$
        $\alpha_k$ = backtracking($\dots$)
        $\vec{x}_{k+1}$ = $\vec{x}_k$ + $\alpha_k \vec{p}_k$
        $k$ = $k$ + 1
    return $\vec{x}_k$
\end{lstlisting}
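The following is a minimal NumPy sketch of the pseudocode above; the quadratic test function, the tolerance and the iteration cap are illustrative choices and not part of the original notes:
\begin{lstlisting}[language=Python]
import numpy as np

def backtracking(f, grad, x, tau=2.0, c1=1e-4):
    """Halve the step until the sufficient decrease (Armijo) condition holds."""
    alpha, g = 1.0, grad(x)
    while f(x - alpha * g) > f(x) - c1 * alpha * np.dot(g, g):
        alpha = alpha / tau
    return alpha

def gradient_method(f, grad, x0, tol=1e-6, max_iter=1000):
    """Gradient descent with backtracking and a relative stopping condition."""
    x = np.asarray(x0, dtype=float)
    g0_norm = np.linalg.norm(grad(x))
    for _ in range(max_iter):
        g = grad(x)
        if np.linalg.norm(g) / g0_norm < tol:   # relative stopping condition
            break
        x = x - backtracking(f, grad, x) * g    # step along -grad f
    return x

# Strictly convex quadratic: f(x) = 1/2 x^T A x - x^T b.
A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, 1.0])
f = lambda x: 0.5 * x @ A @ x - x @ b
grad = lambda x: A @ x - b

print(gradient_method(f, grad, np.zeros(2)))  # close to the minimizer...
print(np.linalg.solve(A, b))                  # ...which is A^{-1} b
\end{lstlisting}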


\subsection{Problems}

\begin{description}
    \item[Choice of the initialization point] \marginnote{Initialization point}
    The starting point of an iterative method is a user-defined parameter.
    For simple problems, it is usually chosen randomly in $[-1, +1]$.

    For complex problems, the choice of the initialization point is critical as
    it may cause numerical instabilities or poor results.
    Heuristics can be used to select an adequate starting point.

    \item[Flat regions and local optima] \marginnote{Flat regions and local optima}
    Flat regions slow down convergence,
    while a local optimum may cause the method to converge to a poor solution.
    \begin{figure}[ht]
        \centering
        \includegraphics[width=0.9\textwidth]{img/_descent_local_flat.pdf}
        \caption{Flat regions and local minima}
    \end{figure}

    \item[Differential curvature]
    Very different magnitudes of the partial derivatives may cause the problems of
    vanishing and exploding gradients. \marginnote{Vanishing gradient\\Exploding gradient}
    This causes the learning process to require more iterations to adjust the direction.

    In practice, as the gradient of a complex function is only an instantaneous direction of steepest decrease and
    does not represent the direction towards the minimum in the long term,
    many updates are required for a gradient method to converge.

    A way to mitigate this issue is to use feature normalization techniques.

    \item[Non-differentiable objective function]
    If the objective function has a small number of non-differentiable points,
    the gradient descent method can be applied with minor modifications.

    If many points are non-differentiable, the gradients will not be informative enough
    to determine a descent direction.

    \item[Difficult topologies]
    \marginnote{Cliff}
    A cliff in the objective function causes problems when evaluating the gradient at its edge.
    With a small step size, there is a slowdown in convergence.
    With a large step size, there is an overshoot that may cause the algorithm to diverge.
    % a slowdown when evaluating
    % the gradient at the edge using a small step size and
    % an overshoot when the step is too large.

    \marginnote{Valley}
    A valley in the objective function causes a gradient method to bounce between the two sides,
    to the point where no significant progress can be made.

    \begin{figure}[ht]
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.30\linewidth]{img/cliff.png}
            \caption{Cliff region}
        \end{subfigure}%
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.30\linewidth]{img/valley.png}
            \caption{Ping pong tournament in a valley}
        \end{subfigure}
    \end{figure}
\end{description}


\section{Convex functions}

\begin{description}
    \item[Convex set] \marginnote{Convex set}
    Informally, a set is convex if, for any two points of the set,
    the points lying on the segment connecting them are also part of the set.

    \begin{figure}[ht]
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.25\linewidth]{img/convex_set.png}
            \caption{Convex set}
        \end{subfigure}%
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.25\linewidth]{img/non_convex_set.png}
            \caption{Non-convex set}
        \end{subfigure}
    \end{figure}

    \item[Convex function] \marginnote{Convex function}
    Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
    $f$ is convex if:
    \[
        \forall \vec{x}_1, \vec{x}_2 \in \Omega, \forall t \in [0, 1]:
        f(t\vec{x}_1 + (1-t)\vec{x}_2) \leq t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
    \]

    In other words, the segment connecting any two points of the graph lies above the graph itself.
    \begin{figure}[ht]
        \centering
        \includegraphics[width=0.55\textwidth]{img/convex_function.png}
        \caption{Convex function}
    \end{figure}

    \item[Strictly convex function] \marginnote{Strictly convex function}
    Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
    $f$ is strictly convex if:
    \[
        \forall \vec{x}_1 \neq \vec{x}_2 \in \Omega, \forall t \in (0, 1):
        f(t\vec{x}_1 + (1-t)\vec{x}_2) < t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
    \]
\end{description}


\subsection{Properties}
% \marginnote{Convex properties}
\begin{itemize}
    \item $\text{if } f \text{ convex} \Rightarrow \text{any local minimum of } f \text{ is also global}$
    \item $\text{if } f \text{ strictly convex} \Rightarrow \text{the global minimum of } f \text{ is unique}$
    \item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{any stationary point of } f \text{ is a global minimum}$
\end{itemize}


\subsection{Quadratic functions}
\marginnote{Quadratic function}
A quadratic function has the form:
\[ f(\vec{x}) = \frac{1}{2}\vec{x}^T\matr{A}\vec{x} - \vec{x}^T\vec{b} + c \]
where $\matr{A} \in \mathbb{R}^{n \times n}$, $\vec{b} \in \mathbb{R}^n$ and $c \in \mathbb{R}$.

\begin{theorem}
    If $f$ is a quadratic function with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive semidefinite,
    then $f$ is convex.
\end{theorem}

\begin{theorem}
    If $f$ is a quadratic function with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive definite,
    then $f$ is strictly convex.
\end{theorem}

\begin{theorem}
    \marginnote{Least squares quadratic function}
    The least squares problem $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is a quadratic function.
\end{theorem}
\begin{proof}
    \[
        \begin{split}
            (\matr{A}\vec{x} - \vec{b})^T(\matr{A}\vec{x} - \vec{b}) &= (\vec{x}^T\matr{A}^T - \vec{b}^T)(\matr{A}\vec{x} - \vec{b}) \\
            &= \vec{x}^T\matr{A}^T\matr{A}\vec{x} - \vec{b}^T\matr{A}\vec{x} - \vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b}
        \end{split}
    \]
    As $\vec{b}^T\matr{A}\vec{x} = \vec{x}^T\matr{A}^T\vec{b}$, we have:
    \[ \vec{x}^T\matr{A}^T\matr{A}\vec{x} - 2\vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \]

    Letting $\matr{B} = \matr{A}^T\matr{A}$, $\vec{q} = \matr{A}^T\vec{b}$ and $c = \vec{b}^T\vec{b}$,
    we obtain the quadratic function:
    \[ \vec{x}^T\matr{B}\vec{x} - 2\vec{x}^T\vec{q} + c \]

    $\matr{B}$ is symmetric positive semidefinite (i.e. $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is convex).
    Moreover, when $\matr{A}$ is full-rank, $\matr{B}$ is symmetric positive definite (i.e. the problem is strictly convex).
\end{proof}


\section{Gradient descent with momentum}
\marginnote{Momentum}
The momentum is an additional term that keeps track of the previous iterations:
\[
    \Delta \vec{x}_k = \vec{x}_k - \vec{x}_{k-1} = \gamma \Delta \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1})
\]
where $\gamma \in [0, 1]$. An iteration is therefore defined as:
\[
    \vec{x}_k = \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1}) + \gamma \Delta\vec{x}_{k-1}
\]
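A minimal NumPy sketch of this update rule (the fixed step length, the momentum coefficient and the quadratic objective are illustrative assumptions):
\begin{lstlisting}[language=Python]
import numpy as np

A = np.array([[3.0, 1.0], [1.0, 2.0]])
b = np.array([1.0, 1.0])
grad = lambda x: A @ x - b        # gradient of 1/2 x^T A x - x^T b

alpha, gamma = 0.1, 0.9
x = np.zeros(2)
delta = np.zeros(2)               # Delta x_0

for _ in range(500):
    delta = gamma * delta - alpha * grad(x)   # momentum update
    x = x + delta

print(x, np.linalg.solve(A, b))   # both close to the minimizer A^{-1} b
\end{lstlisting}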


\section{Stochastic gradient descent (SGD)}
\marginnote{Stochastic gradient descent}
SGD is a variant of gradient descent that replaces the exact gradient with a cheaper stochastic approximation.
Given $N$ data points, the loss can be defined as the sum of the individual losses:
\[ L(\vec{x}) = \sum_{n=1}^{N} L_n(\vec{x}) \]
where $\vec{x}$ is the vector of parameters.
The corresponding gradient can be computed as:
\[ \nabla L(\vec{x}) = \sum_{n=1}^{N} \nabla L_n(\vec{x}) \]

\marginnote{Mini-batch}
SGD reduces the amount of computation by approximating the gradient with the sum over a subset (mini-batch) $B$ of the terms $\nabla L_n$:
\[ \nabla L(\vec{x}) \approx \sum_{i \in B} \nabla L_i(\vec{x}) \]

\begin{theorem}
    Under some assumptions and with an appropriate decrease of the learning rate,
    SGD is guaranteed to converge to a local minimum.
\end{theorem}

Different sizes of the mini-batch result in different trade-offs:
\begin{descriptionlist}
    \item[Large mini-batches] more accurate estimates of the gradient.
    \item[Small mini-batches] faster computation per iteration.
\end{descriptionlist}
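A small NumPy sketch of SGD on a least squares loss $L(\vec{x}) = \sum_{n} \frac{1}{2}(\vec{a}_n^T\vec{x} - b_n)^2$; the synthetic data, the batch size and the learning rate schedule are illustrative assumptions:
\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
N, d = 1000, 5
A = rng.normal(size=(N, d))
x_true = rng.normal(size=d)
b = A @ x_true + 0.01 * rng.normal(size=N)       # noisy targets

def grad_batch(x, idx):
    """Gradient of sum_{n in idx} 0.5 * (a_n^T x - b_n)^2."""
    return A[idx].T @ (A[idx] @ x - b[idx])

x = np.zeros(d)
for k in range(1, 2001):
    idx = rng.choice(N, size=32, replace=False)  # mini-batch of 32 terms
    x = x - (0.01 / np.sqrt(k)) * grad_batch(x, idx)

print(np.linalg.norm(x_true))                    # initial distance from x_true
print(np.linalg.norm(x - x_true))                # much smaller after SGD
\end{lstlisting}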
@@ -0,0 +1,344 @@
\chapter{Linear algebra}


\section{Vector space}

A \textbf{vector space} over $\mathbb{R}$ is a nonempty set $V$, whose elements are called vectors, with two operations:
\marginnote{Vector space}
\begin{center}
    \begin{tabular}{l c}
        Addition & $+ : V \times V \rightarrow V$ \\
        Scalar multiplication & $\cdot : \mathbb{R} \times V \rightarrow V$
    \end{tabular}
\end{center}
A vector space has the following properties:
\begin{enumerate}
    \item Addition is commutative and associative
    \item A null vector exists: $\exists \nullvec \in V$ s.t. $\forall \vec{u} \in V: \nullvec + \vec{u} = \vec{u} + \nullvec = \vec{u}$
    \item An identity element for scalar multiplication exists: $\forall \vec{u} \in V: 1\vec{u} = \vec{u}$
    \item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$.\\
          $\vec{a}$ is denoted as $-\vec{u}$.
    \item Distributive properties:
          \[ \forall \alpha \in \mathbb{R}, \forall \vec{u}, \vec{w} \in V: \alpha(\vec{u} + \vec{w}) = \alpha \vec{u} + \alpha \vec{w} \]
          \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha + \beta)\vec{u} = \alpha \vec{u} + \beta \vec{u} \]
    \item Associative property:
          \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha \beta)\vec{u} = \alpha (\beta \vec{u}) \]
\end{enumerate}
%
A subset $U \subseteq V$ of a vector space $V$ is a \textbf{subspace} iff $U$ is itself a vector space.
\marginnote{Subspace}


\subsection{Basis}
\marginnote{Basis}
Let $V$ be a vector space of dimension $n$.
A basis $\beta = \{ \vec{v}_1, \dots, \vec{v}_n \}$ of $V$ is a set of $n$ linearly independent vectors of $V$.\\
Each element of $V$ can be represented as a linear combination of the vectors in the basis $\beta$:
\[ \forall \vec{w} \in V: \vec{w} = \lambda_1\vec{v}_1 + \dots + \lambda_n\vec{v}_n \text{ where } \lambda_i \in \mathbb{R} \]
%
The canonical basis of a vector space is a basis where each vector represents a dimension $i$ \marginnote{Canonical basis}
(i.e. it has 1 in position $i$ and 0 in all other positions).
\begin{example}
    The canonical basis $\beta$ of $\mathbb{R}^3$ is $\beta = \{ (1, 0, 0), (0, 1, 0), (0, 0, 1) \}$
\end{example}

\subsection{Dot product}
The dot product of two vectors $\vec{x}, \vec{y} \in \mathbb{R}^n$ is defined as: \marginnote{Dot product}
\begin{equation*}
    \left\langle \vec{x}, \vec{y} \right\rangle =
    \vec{x}^T \vec{y} = \sum_{i=1}^{n} x_i \cdot y_i
\end{equation*}


\section{Matrix}

This is a {\tiny(very formal definition of a)} matrix: \marginnote{Matrix}
\begin{equation*}
    \matr{A} =
    \begin{pmatrix}
        a_{11} & a_{12} & \dots & a_{1n} \\
        a_{21} & a_{22} & \dots & a_{2n} \\
        \vdots & \vdots & \ddots & \vdots \\
        a_{m1} & a_{m2} & \dots & a_{mn}
    \end{pmatrix}
\end{equation*}

\subsection{Invertible matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is invertible (non-singular) if: \marginnote{Non-singular matrix}
\begin{equation*}
    \exists \matr{B} \in \mathbb{R}^{n \times n}: \matr{AB} = \matr{BA} = \matr{I}
\end{equation*}
where $\matr{I}$ is the identity matrix. $\matr{B}$ is denoted as $\matr{A}^{-1}$.

\subsection{Kernel}
The null space (kernel) of a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ is the subspace: \marginnote{Kernel}
\begin{equation*}
    \text{Ker}(\matr{A}) = \{ \vec{x} \in \mathbb{R}^n : \matr{A}\vec{x} = \nullvec \}
\end{equation*}
%
\begin{theorem} \label{th:kernel_invertible}
    A square matrix $\matr{A}$ with $\text{\normalfont Ker}(\matr{A}) = \{\nullvec\}$ is non-singular.
\end{theorem}

\subsection{Similar matrices} \marginnote{Similar matrices}
Two matrices $\matr{A}$ and $\matr{D}$ are \textbf{similar} if there exists an invertible matrix $\matr{P}$ such that:
\[ \matr{D} = \matr{P}^{-1} \matr{A} \matr{P} \]


\section{Norms}

\subsection{Vector norms}
The norm of a vector is a function: \marginnote{Vector norm}
\begin{equation*}
    \Vert \cdot \Vert: \mathbb{R}^n \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\vec{x}, \vec{y} \in \mathbb{R}^n$:
\begin{itemize}
    \item $\Vert \vec{x} \Vert \geq 0$
    \item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = \nullvec$
    \item $\Vert \lambda \vec{x} \Vert = \vert \lambda \vert \cdot \Vert \vec{x} \Vert$
    \item $\Vert \vec{x} + \vec{y} \Vert \leq \Vert \vec{x} \Vert + \Vert \vec{y} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
    \item[2-norm] $\Vert \vec{x} \Vert_2 = \sqrt{ \sum_{i=1}^{n} x_i^2 }$

    \item[1-norm] $\Vert \vec{x} \Vert_1 = \sum_{i=1}^{n} \vert x_i \vert$

    \item[$\infty$-norm] $\Vert \vec{x} \Vert_{\infty} = \max_{1 \leq i \leq n} \vert x_i \vert$
\end{descriptionlist}
%
In general, different norms of the same vector tend to have comparable magnitudes.
In some cases, however, comparing two vectors can give rather different outcomes depending on the chosen norm.
\begin{example}
    Let $\vec{x} = (1, 1000)$ and $\vec{y} = (999, 1000)$. Their norms are:
    \begin{center}
        \begin{tabular}{l l}
            $\Vert \vec{x} \Vert_{2} = \sqrt{1000001}$ & $\Vert \vec{y} \Vert_{2} = \sqrt{1998001}$ \\
            $\Vert \vec{x} \Vert_{\infty} = 1000$ & $\Vert \vec{y} \Vert_{\infty} = 1000$ \\
        \end{tabular}
    \end{center}
    In the $\infty$-norm the two vectors look identical, while their 2-norms are clearly different.
\end{example}


\subsection{Matrix norms}
The norm of a matrix is a function: \marginnote{Matrix norm}
\begin{equation*}
    \Vert \cdot \Vert: \mathbb{R}^{m \times n} \rightarrow \mathbb{R}
\end{equation*}
such that for each $\lambda \in \mathbb{R}$ and $\matr{A}, \matr{B} \in \mathbb{R}^{m \times n}$:
\begin{itemize}
    \item $\Vert \matr{A} \Vert \geq 0$
    \item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \matr{0}$
    \item $\Vert \lambda \matr{A} \Vert = \vert \lambda \vert \cdot \Vert \matr{A} \Vert$
    \item $\Vert \matr{A} + \matr{B} \Vert \leq \Vert \matr{A} \Vert + \Vert \matr{B} \Vert$
\end{itemize}
%
Common norms are:
\begin{descriptionlist}
    \item[2-norm]
    $\Vert \matr{A} \Vert_2 = \sqrt{ \rho(\matr{A}^T\matr{A}) }$,\\
    where $\rho(\matr{X})$ is the largest absolute value of the eigenvalues of $\matr{X}$ (spectral radius).

    \item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$ (i.e. the maximum column sum in absolute value)

    \item[Frobenius norm] $\Vert \matr{A} \Vert_F = \sqrt{ \sum_{i=1}^{m} \sum_{j=1}^{n} a_{i,j}^2 }$
\end{descriptionlist}
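These norms can be computed with \texttt{numpy.linalg.norm}; a quick sketch using the vectors of the example above and an arbitrary matrix:
\begin{lstlisting}[language=Python]
import numpy as np

x = np.array([1.0, 1000.0])
y = np.array([999.0, 1000.0])
print(np.linalg.norm(x, 2), np.linalg.norm(x, 1), np.linalg.norm(x, np.inf))
print(np.linalg.norm(y, 2), np.linalg.norm(y, 1), np.linalg.norm(y, np.inf))

A = np.array([[1.0, -2.0], [3.0, 4.0]])
print(np.linalg.norm(A, 2))      # spectral norm: sqrt(rho(A^T A))
print(np.linalg.norm(A, 1))      # maximum column sum of absolute values
print(np.linalg.norm(A, 'fro'))  # Frobenius norm
\end{lstlisting}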


\section{Symmetric, positive definite matrices}

\begin{description}
    \item[Symmetric matrix] \marginnote{Symmetric matrix}
    A square matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is symmetric $\iff \matr{A} = \matr{A}^T$

    \item[Positive semidefinite matrix] \marginnote{Positive semidefinite matrix}
    A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive semidefinite iff
    \begin{equation*}
        \forall \vec{x} \in \mathbb{R}^n \smallsetminus \{\nullvec\}: \vec{x}^T \matr{A} \vec{x} \geq 0
    \end{equation*}

    \item[Positive definite matrix] \marginnote{Positive definite matrix}
    A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive definite iff
    \begin{equation*}
        \forall \vec{x} \in \mathbb{R}^n \smallsetminus \{\nullvec\}: \vec{x}^T \matr{A} \vec{x} > 0
    \end{equation*}
    %
    It has the following properties:
    \begin{enumerate}
        \item The null space of $\matr{A}$ contains only the null vector: $\text{Ker}(\matr{A}) = \{ \nullvec \}$,
              which implies that $\matr{A}$ is non-singular (\Cref{th:kernel_invertible}).
        \item The diagonal elements of $\matr{A}$ are all positive.
    \end{enumerate}
\end{description}


\section{Orthogonality}
\begin{description}
    \item[Angle between vectors] \marginnote{Angle between vectors}
    The angle $\omega$ between two vectors $\vec{x}$ and $\vec{y}$ can be obtained from:
    \begin{equation*}
        \cos\omega = \frac{\left\langle \vec{x}, \vec{y} \right\rangle }{\Vert \vec{x} \Vert_2 \cdot \Vert \vec{y} \Vert_2}
    \end{equation*}

    \item[Orthogonal vectors] \marginnote{Orthogonal vectors}
    Two vectors $\vec{x}$ and $\vec{y}$ are orthogonal ($\vec{x} \perp \vec{y}$) when:
    \[ \left\langle \vec{x}, \vec{y} \right\rangle = 0 \]

    \item[Orthonormal vectors] \marginnote{Orthonormal vectors}
    Two vectors $\vec{x}$ and $\vec{y}$ are orthonormal when:
    \[ \vec{x} \perp \vec{y} \text{ and } \Vert \vec{x} \Vert = \Vert \vec{y} \Vert = 1 \]
    \begin{theorem}
        The canonical basis of a vector space is orthonormal.
    \end{theorem}

    \item[Orthogonal matrix] \marginnote{Orthogonal matrix}
    A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is orthogonal if its columns are \underline{orthonormal} vectors.
    It has the following properties:
    \begin{enumerate}
        \item $\matr{A}\matr{A}^T = \matr{I} = \matr{A}^T\matr{A}$, which implies $\matr{A}^{-1} = \matr{A}^T$.
        \item The length of a vector is unchanged when mapped through an orthogonal matrix:
              \[ \Vert \matr{A}\vec{x} \Vert^2 = \Vert \vec{x} \Vert^2 \]
        \item The angle between two vectors is unchanged when both are mapped through an orthogonal matrix:
              \[
                  \cos\omega = \frac{(\matr{A}\vec{x})^T(\matr{A}\vec{y})}{\Vert \matr{A}\vec{x} \Vert \cdot \Vert \matr{A}\vec{y} \Vert} =
                  \frac{\vec{x}^T\vec{y}}{\Vert \vec{x} \Vert \cdot \Vert \vec{y} \Vert}
              \]
    \end{enumerate}
    Note: an orthogonal matrix represents a rotation (possibly combined with a reflection).

    \item[Orthogonal basis] \marginnote{Orthogonal basis}
    Given an $n$-dimensional vector space $V$ and a basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$,
    $\beta$ is an orthogonal basis if:
    \[ \vec{b}_i \perp \vec{b}_j \text{ for } i \neq j \text{ (i.e.} \left\langle \vec{b}_i, \vec{b}_j \right\rangle = 0 \text{)} \]

    \item[Orthonormal basis] \marginnote{Orthonormal basis}
    Given an $n$-dimensional vector space $V$ and an orthogonal basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$,
    $\beta$ is an orthonormal basis if:
    \[ \Vert \vec{b}_i \Vert_2 = 1 \text{ (or} \left\langle \vec{b}_i, \vec{b}_i \right\rangle = 1 \text{)} \]

    \item[Orthogonal complement] \marginnote{Orthogonal complement}
    Given an $n$-dimensional vector space $V$ and an $m$-dimensional subspace $U \subseteq V$,
    the orthogonal complement $U^\perp$ of $U$ is the $(n-m)$-dimensional subspace of $V$ that
    contains all the vectors orthogonal to every vector in $U$:
    \[ \forall \vec{w} \in V: \vec{w} \in U^\perp \iff (\forall \vec{u} \in U: \vec{w} \perp \vec{u}) \]
    %
    Note that $U \cap U^\perp = \{ \nullvec \}$ and
    it is possible to represent every vector in $V$ as a linear combination of the basis vectors of $U$ and $U^\perp$.

    When $U$ is a hyperplane (i.e. $m = n-1$), a vector $\vec{w} \in U^\perp$ s.t. $\Vert \vec{w} \Vert = 1$ is called the \textbf{normal vector} of $U$. \marginnote{Normal vector}
    %
    \begin{figure}[ht]
        \centering
        \includegraphics[width=0.4\textwidth]{img/_orthogonal_complement.pdf}
        \caption{Orthogonal complement of a subspace $U \subseteq \mathbb{R}^3$}
    \end{figure}
\end{description}


\section{Projections}
Projections are methods to map high-dimensional data into a lower-dimensional space
while minimizing the compression loss.\\
\marginnote{Orthogonal projection}
Let $V$ be a vector space and $U \subseteq V$ a subspace of $V$.
A linear mapping $\pi: V \rightarrow U$ is an (orthogonal) projection if:
\[ \pi^2 = \pi \circ \pi = \pi \]
In other words, applying $\pi$ multiple times gives the same result (i.e. idempotency).\\
$\pi$ can be expressed as a transformation matrix $\matr{P}_\pi$ such that:
\[ \matr{P}_\pi^2 = \matr{P}_\pi \]

\subsection{Projection onto general subspaces} \marginnote{Projection onto subspace basis}
To project a vector $\vec{x} \in \mathbb{R}^n$ onto a lower-dimensional subspace $U \subseteq \mathbb{R}^n$,
it is possible to use a basis of $U$.\\
%
Let $m = \text{dim}(U)$ be the dimension of $U$ and
$\matr{B} = (\vec{b}_1, \dots, \vec{b}_m) \in \mathbb{R}^{n \times m}$ an ordered basis of $U$.
A projection $\pi_U(\vec{x})$ represents $\vec{x}$ as a linear combination of the basis:
\[ \pi_U(\vec{x}) = \sum_{i=1}^{m} \lambda_i \vec{b}_i = \matr{B}\vec{\uplambda} \]
where $\vec{\uplambda} = (\lambda_1, \dots, \lambda_m)^T \in \mathbb{R}^{m}$ are the new coordinates of $\vec{x}$,
found by minimizing the distance $\Vert \matr{B}\vec{\uplambda} - \vec{x} \Vert_2$ between $\pi_U(\vec{x})$ and $\vec{x}$.
This minimization leads to $\vec{\uplambda} = (\matr{B}^T\matr{B})^{-1}\matr{B}^T\vec{x}$ and therefore to the projection matrix $\matr{P}_\pi = \matr{B}(\matr{B}^T\matr{B})^{-1}\matr{B}^T$.
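A short NumPy sketch of this construction (the subspace basis and the vector are arbitrary examples):
\begin{lstlisting}[language=Python]
import numpy as np

# Basis of a 2-dimensional subspace U of R^3 (as columns of B).
B = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [0.0, 1.0]])
x = np.array([6.0, 0.0, 0.0])

lam = np.linalg.solve(B.T @ B, B.T @ x)  # coordinates of the projection
proj = B @ lam                           # pi_U(x)
P = B @ np.linalg.solve(B.T @ B, B.T)    # projection matrix P_pi

print(proj)
print(np.allclose(P @ P, P))             # idempotency: P^2 = P
print(np.allclose(B.T @ (x - proj), 0))  # the residual is orthogonal to U
\end{lstlisting}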


\section{Eigenvectors and eigenvalues}

Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
$\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if: \marginnote{Eigenvector}
\[ \matr{A}\vec{x} = \lambda\vec{x} \]

The following statements are equivalent:
\begin{itemize}
    \item $\lambda$ is an eigenvalue of $\matr{A} \in \mathbb{R}^{n \times n}$
    \item $\exists \vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ s.t. $\matr{A}\vec{x} = \lambda\vec{x}$. \\
          Equivalently, the system $(\matr{A} - \lambda \matr{I}_n)\vec{x} = \nullvec$ has a non-trivial solution ($\vec{x} \neq \nullvec$).
    \item $\text{rank}(\matr{A} - \lambda \matr{I}_n) < n$
    \item $\det(\matr{A} - \lambda \matr{I}_n) = 0$ (i.e. $(\matr{A} - \lambda \matr{I}_n)$ is singular {\footnotesize(i.e. not invertible)})
\end{itemize}

Note that eigenvectors are not unique.
Given an eigenvector $\vec{x}$ of $\matr{A}$ with eigenvalue $\lambda$,
we can prove that, $\forall c \in \mathbb{R} \smallsetminus \{0\}$, $c\vec{x}$ is also an eigenvector of $\matr{A}$:
\[ \matr{A}(c\vec{x}) = c(\matr{A}\vec{x}) = c\lambda\vec{x} = \lambda(c\vec{x}) \]

% \begin{theorem}
%     The eigenvalues of a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ are all in $\mathbb{R}$.
% \end{theorem}

\begin{theorem} \marginnote{Eigenvalues and positive definiteness}
    A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is positive definite $\iff$
    its eigenvalues are all positive.
\end{theorem}
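A quick numerical check of this characterization (the matrices are arbitrary examples; \texttt{numpy.linalg.eigvalsh} is used since it is designed for symmetric matrices and returns real eigenvalues):
\begin{lstlisting}[language=Python]
import numpy as np

def is_positive_definite(A):
    """Check positive definiteness of a symmetric matrix via its eigenvalues."""
    return bool(np.all(np.linalg.eigvalsh(A) > 0))

A = np.array([[2.0, -1.0], [-1.0, 2.0]])  # eigenvalues 1 and 3
B = np.array([[1.0, 2.0], [2.0, 1.0]])    # eigenvalues -1 and 3

print(np.linalg.eigvalsh(A), is_positive_definite(A))  # [1. 3.] True
print(np.linalg.eigvalsh(B), is_positive_definite(B))  # [-1. 3.] False
\end{lstlisting}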

\begin{description}
    \item[Eigenspace] \marginnote{Eigenspace}
    The set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalue $\lambda$, together with the null vector.
    This set is a subspace of $\mathbb{R}^n$.

    \item[Eigenspectrum] \marginnote{Eigenspectrum}
    The set of all the eigenvalues of $\matr{A} \in \mathbb{R}^{n \times n}$.
\end{description}


\begin{description}
    \item[Geometric multiplicity] \marginnote{Geometric multiplicity}
    Given an eigenvalue $\lambda$ of a matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
    the geometric multiplicity of $\lambda$ is the number of linearly independent eigenvectors associated to $\lambda$.
\end{description}


\begin{theorem} \marginnote{Linearly independent eigenvectors}
    Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
    if its eigenvectors $\vec{x}_1, \dots, \vec{x}_n$ are associated to $n$ distinct eigenvalues,
    then $\vec{x}_1, \dots, \vec{x}_n$ are linearly independent (i.e. they form a basis of $\mathbb{R}^n$).
\end{theorem}

\begin{descriptionlist}
    \item[Defective matrix] \marginnote{Defective matrix}
    A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is defective if it has less than $n$ linearly independent eigenvectors.
\end{descriptionlist}


\begin{theorem}[Spectral theorem] \label{th:spectral_theorem} \marginnote{Spectral theorem}
    Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
    its eigenvalues are all real and its eigenvectors can be chosen to form an orthonormal basis of $\mathbb{R}^n$.
\end{theorem}


\subsection{Diagonalizability}
\marginnote{Diagonalizable matrix}
A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is diagonalizable if it is similar to a diagonal matrix $\matr{D} \in \mathbb{R}^{n \times n}$:
\[ \exists \matr{P} \in \mathbb{R}^{n \times n} \text{ s.t. } \matr{P} \text{ invertible and } \matr{D} = \matr{P}^{-1}\matr{A}\matr{P} \]

\begin{theorem}
    Similar matrices have the same eigenvalues.
\end{theorem}

\begin{theorem} \marginnote{Symmetric matrix diagonalizability}
    A symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is always diagonalizable.
\end{theorem}
@@ -0,0 +1,242 @@
\chapter{Linear systems}

A linear system:
\begin{equation*}
    \begin{cases}
        a_{1,1}x_1 + a_{1,2}x_2 + \dots + a_{1,n}x_n = b_1\\
        a_{2,1}x_1 + a_{2,2}x_2 + \dots + a_{2,n}x_n = b_2\\
        \hspace*{7em} \vdots \\
        a_{m,1}x_1 + a_{m,2}x_2 + \dots + a_{m,n}x_n = b_m\\
    \end{cases}
\end{equation*}
can be represented as:
\[ \matr{A}\vec{x} = \vec{b} \]
where:
\[
    \matr{A} =
    \begin{pmatrix}
        a_{1,1} & a_{1,2} & \hdots & a_{1,n} \\
        a_{2,1} & a_{2,2} & \hdots & a_{2,n} \\
        \vdots & \vdots & \ddots & \vdots \\
        a_{m,1} & a_{m,2} & \hdots & a_{m,n}
    \end{pmatrix} \in \mathbb{R}^{m \times n}
    \hspace*{2em}
    %
    \vec{x} =
    \begin{pmatrix}
        x_1 \\
        x_2 \\
        \vdots \\
        x_n
    \end{pmatrix} \in \mathbb{R}^n
    \hspace*{2em}
    %
    \vec{b} =
    \begin{pmatrix}
        b_1 \\
        b_2 \\
        \vdots \\
        b_m
    \end{pmatrix} \in \mathbb{R}^m
\]


\section{Square linear systems}
\marginnote{Square linear system}
A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
has a unique solution iff one of the following (equivalent) conditions is satisfied:
\begin{enumerate}
    \item $\matr{A}$ is non-singular (invertible)
    \item $\text{rank}(\matr{A}) = n$ (full rank)
    \item $\matr{A}\vec{x} = \nullvec$ only admits the solution $\vec{x} = \nullvec$
\end{enumerate}

The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
\[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.
Therefore, numerical methods are usually better suited.
The two main families of methods are:
\begin{itemize}
    \item Direct methods.
    \item Iterative methods.
\end{itemize}


\section{Direct methods}
\marginnote{Direct methods}
Direct methods compute the solution of a linear system in a finite number of steps.
Compared to iterative methods, they are more precise but more expensive.

The most common approach consists of factorizing the matrix $\matr{A}$.

\subsection{Gaussian factorization}
\marginnote{Gaussian factorization\\(LU decomposition)}
Given a square linear system $\matr{A}\vec{x} = \vec{b}$,
the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
\begin{itemize}
    \item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix.
    \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix.
\end{itemize}
%
The system can be decomposed into:
\[
    \begin{split}
        \matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
        & \iff
        \begin{cases}
            \matr{L}\vec{y} = \vec{b} \\
            \vec{y} = \matr{U}\vec{x}
        \end{cases}
    \end{split}
\]
To find the solution, it is sufficient to solve in order:
\begin{enumerate}
    \item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
    \item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
\end{enumerate}

The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$:
$O(\frac{n^3}{3})$ is the time complexity of the LU factorization, while
$O(n^2)$ is the complexity of directly solving a system with a triangular matrix (forward or backward substitution).


\subsection{Gaussian factorization with pivoting}
\marginnote{Gaussian factorization with pivoting}
During the computation of $\matr{A} = \matr{L}\matr{U}$
(using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
a division by 0 may occur.
A method to prevent this problem (and to reduce the arithmetic error due to the growth of the entries) is to change the order of the rows of $\matr{A}$ before decomposing it.
This is achieved by using a permutation matrix $\matr{P}$, which is obtained as a permutation of the rows of the identity matrix.

The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
The system can be decomposed into:
\[
    \begin{split}
        \matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
        & \iff
        \begin{cases}
            \matr{L}\vec{y} = \matr{P}\vec{b} \\
            \vec{y} = \matr{U}\vec{x}
        \end{cases}
    \end{split}
\]

An alternative formulation (which is what \texttt{SciPy} uses)
is defined as:
\[ \matr{A} = \matr{P}\matr{L}\matr{U} \iff \matr{P}^T\matr{A} = \matr{L}\matr{U} \]
It must be noted that $\matr{P}$ is orthogonal, so $\matr{P}^T = \matr{P}^{-1}$.
The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can be found as above.
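A sketch of this procedure with SciPy (assuming \texttt{scipy} is available; the matrix and the right-hand side are arbitrary):
\begin{lstlisting}[language=Python]
import numpy as np
from scipy.linalg import lu, solve_triangular

A = np.array([[2.0, 1.0, 1.0],
              [4.0, 3.0, 3.0],
              [8.0, 7.0, 9.0]])
b = np.array([1.0, 2.0, 3.0])

P, L, U = lu(A)                                # SciPy convention: A = P L U
y = solve_triangular(L, P.T @ b, lower=True)   # forward substitution
x = solve_triangular(U, y, lower=False)        # backward substitution

print(x)
print(np.allclose(A @ x, b))                   # True
\end{lstlisting}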
|
||||
|
||||
|
||||
\subsection{Cholesky factorization}
|
||||
Given a symmetric positive definite matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
|
||||
It is possible to decompose $\matr{A}$ as:
|
||||
\[ \matr{A} = \matr{L}\matr{L}^T \]
|
||||
where $\matr{L}$ is lower triangular.
|
||||
|
||||
A square system where $\matr{A}$ is symmetric definite positive can be solved as above using the Cholesky factorization.
|
||||
This method has time complexity $O(\frac{n^3}{6})$.
|
||||
|
||||
|
||||
|
||||
|
||||
\section{Iterative methods}
|
||||
\marginnote{Iterative methods}
|
||||
Iterative methods solve a linear system by computing a sequence that converges to the exact solution.
|
||||
Compared to direct methods, they are less precise but computationally faster and more suited for large systems.
|
||||
|
||||
The overall idea is to build a sequence of vectors $\vec{x}_k$
|
||||
that converges to the exact solution $\vec{x}^*$:
|
||||
\[ \lim_{k \rightarrow \infty} \vec{x}_k = \vec{x}^* \]
|
||||
Generally, the first vector $\vec{x}_0$ is given (or guessed). Subsequent vectors are computed w.r.t. the previous iteration
|
||||
as $\vec{x}_k = g(\vec{x}_{k-1})$.
|
||||
|
||||
The two most common families of iterative methods are:
|
||||
\begin{descriptionlist}
|
||||
\item[Stationary methods] \marginnote{Stationary methods}
|
||||
compute the sequence as:
|
||||
\[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
|
||||
where $\matr{B}$ is called iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system.
|
||||
The time complexity per iteration is $O(n^2)$.
|
||||
|
||||
\item[Gradient-like methods] \marginnote{Gradient-like methods}
|
||||
have the form:
|
||||
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
|
||||
where $\alpha_{k-1} \in \mathbb{R}$ and the vector $\vec{p}_{k-1}$ is called direction.
|
||||
\end{descriptionlist}
|
||||
|
||||
\subsection{Stopping criteria}
|
||||
\marginnote{Stopping criteria}
|
||||
One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
|
||||
The most common approaches are:
|
||||
\begin{descriptionlist}
|
||||
\item[Residual based]
|
||||
The algorithm is terminated when the residual of the current solution is small enough.
|
||||
The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
|
||||
Given a tolerance $\varepsilon$, the algorithm may stop when:
|
||||
\begin{itemize}
|
||||
\item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute)
|
||||
\item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative)
|
||||
\end{itemize}
|
||||
|
||||
\item[Update based]
|
||||
The algorithm is terminated when the difference between iterations is very small.
|
||||
Given a tolerance $\tau$, the algorithm stops when:
|
||||
\[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
|
||||
\end{descriptionlist}
|
||||
Obviously, as the sequence is truncated, a truncation error is introduced when using iterative methods.
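A minimal sketch of a stationary method (the Jacobi iteration, assuming a diagonally dominant matrix so that the sequence converges) combined with a relative residual-based stopping criterion:
\begin{verbatim}
import numpy as np

def jacobi(A, b, x0, eps=1e-8, max_iter=1000):
    """Stationary method x_k = B x_{k-1} + d with residual-based stopping."""
    D = np.diag(np.diag(A))
    B = np.eye(len(b)) - np.linalg.solve(D, A)  # iteration matrix
    d = np.linalg.solve(D, b)
    x = x0
    for _ in range(max_iter):
        x = B @ x + d                           # O(n^2) per iteration
        if np.linalg.norm(b - A @ x) <= eps * np.linalg.norm(b):
            break                               # relative residual criterion
    return x

A = np.array([[4.0, 1.0],
              [2.0, 5.0]])
b = np.array([1.0, 2.0])
print(jacobi(A, b, np.zeros(2)))  # close to the exact solution of Ax = b
\end{verbatim}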
|
||||
|
||||
|
||||
|
||||
\section{Condition number}
|
||||
The inherent error causes inaccuracies when solving a system.
|
||||
This problem is independent of the algorithm and is studied assuming exact arithmetic.
|
||||
|
||||
Given a system $\matr{A}\vec{x} = \vec{b}$, we perturb $\matr{A}$ and/or $\vec{b}$ and study the resulting inherent error.
|
||||
For instance, if we perturb $\vec{b}$, we obtain the following system:
|
||||
\[ \matr{A}\tilde{\vec{x}} = (\vec{b} + \Delta\vec{b}) \]
|
||||
After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
|
||||
|
||||
By comparing the relative errors $\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert}$ and $\frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}$,
|
||||
we can compute the error introduced by the perturbation.
|
||||
It can be shown that:
|
||||
\[
|
||||
\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert} \leq
|
||||
\Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \cdot \frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}
|
||||
\]
|
||||
Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as: \marginnote{Condition number}
|
||||
\[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \]
|
||||
|
||||
A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned}
|
||||
(i.e. a small perturbation of the input causes a large change in the output).
|
||||
Otherwise, it is \textbf{well-conditioned}. \marginnote{Well-conditioned}
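A minimal sketch (matrices made up for illustration) of how the condition number detects ill-conditioning:
\begin{verbatim}
import numpy as np

A = np.array([[1.0, 1.0],
              [1.0, 1.0001]])     # nearly singular, hence ill-conditioned
print(np.linalg.cond(A))          # ~4e4 (2-norm condition number by default)

print(np.linalg.cond(np.eye(2)))  # 1.0: the identity is perfectly conditioned
\end{verbatim}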
|
||||
|
||||
|
||||
\section{Linear least squares problem}
|
||||
|
||||
A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
|
||||
does not generally have a solution.
|
||||
\marginnote{Linear least squares}
|
||||
Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
|
||||
\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
|
||||
In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
|
||||
This problem is usually formulated as:
|
||||
\[
|
||||
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
|
||||
\]
|
||||
It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
|
||||
\begin{descriptionlist}
|
||||
\item[$\text{rank}(\matr{A}) = n$]
|
||||
The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
|
||||
\marginnote{Normal equation}
|
||||
It is found by solving the normal equation:
|
||||
\[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
|
||||
$\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization (a sketch is given after this list).
|
||||
|
||||
\item[$\text{rank}(\matr{A}) < n$]
|
||||
The system admits infinitely many solutions.
|
||||
Of all the solutions $S$, we are interested in the one with minimum norm:
|
||||
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
|
||||
\end{descriptionlist}
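A minimal sketch of the full-rank case (made-up overdetermined system), solving the normal equation with the Cholesky factorization and checking the result against a library least squares solver:
\begin{verbatim}
import numpy as np
from scipy.linalg import cho_factor, cho_solve

A = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [1.0, 2.0]])        # m = 3 > n = 2, full rank
b = np.array([1.0, 2.0, 2.0])

# Normal equation A^T A x = A^T b, with A^T A symmetric positive definite
x = cho_solve(cho_factor(A.T @ A), A.T @ b)

print(np.allclose(x, np.linalg.lstsq(A, b, rcond=None)[0]))  # True
\end{verbatim}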
|
||||
@ -0,0 +1,306 @@
|
||||
\chapter{Machine learning}
|
||||
|
||||
|
||||
\section{Models}
|
||||
|
||||
\begin{description}
|
||||
\item[Function model] \marginnote{Function model}
|
||||
The model (predictor) is a deterministic function:
|
||||
\[ f: \mathbb{R}^D \rightarrow \mathbb{R} \]
|
||||
|
||||
In this course, only linear functions are considered:
|
||||
\[ f_\vec{\uptheta}(\vec{x}) = \uptheta_0 + \uptheta_1 x_1 + \dots + \uptheta_D x_D = \vec{\uptheta}^T \vec{x} \]
|
||||
where $\vec{x} = \begin{pmatrix} 1, x_1, \dots, x_D \end{pmatrix}$ is the input vector and
|
||||
$\vec{\uptheta} = \begin{pmatrix} \uptheta_0, \dots, \uptheta_D \end{pmatrix}$ is the parameter vector.
|
||||
|
||||
\item[Probabilistic model] \marginnote{Probabilistic model}
|
||||
The model is a (multivariate) probability distribution that
|
||||
is able to quantify uncertainty in noisy data.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Learning}
|
||||
|
||||
|
||||
\subsection{Empirical risk minimization}
|
||||
\marginnote{Empirical risk minimization}
|
||||
Used for function models.
|
||||
The parameters of the predictor are obtained by solving an optimization problem that aims to minimize the distance
|
||||
between the prediction and the ground truth.
|
||||
|
||||
Let $(\vec{x}_n, y_n)$ be a dataset of $N$ elements
|
||||
where $\vec{x}_n \in \mathbb{R}^D$ are the examples and $y_n \in \mathbb{R}$ are the labels.
|
||||
We want to estimate a predictor $f_\vec{\uptheta}(\vec{x}) = \vec{\uptheta}^T \vec{x}$ with parameters $\vec{\uptheta}$
|
||||
such that, with the ideal parameters $\vec{\uptheta}^*$, it fits the data well:
|
||||
\[ f_{\vec{\uptheta}^*}(\vec{x}_n) \approx y_n \]
|
||||
|
||||
We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n)$.
|
||||
|
||||
\begin{description}
|
||||
\item[Loss function] \marginnote{Loss function}
|
||||
A loss function $\ell(y_n, \hat{y}_n)$ indicates how well a predictor fits the data (a numerical sketch is given after this list).
|
||||
|
||||
An assumption commonly made in machine learning is that
|
||||
the dataset $(\vec{x}_n, y_n)$ is independent and identically distributed.
|
||||
Therefore, the empirical mean is a good estimate of the population mean.
|
||||
|
||||
\item[Empirical risk] \marginnote{Empirical risk}
|
||||
Given the example matrix $\matr{X} = \begin{pmatrix} \vec{x}_1, \dots, \vec{x}_N \end{pmatrix}^T \in \mathbb{R}^{N \times D}$ (one example per row)
|
||||
and the label vector $\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix} \in \mathbb{R}^N$.
|
||||
The empirical risk is given by the average loss:
|
||||
\[ \textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}, \vec{y}) = \frac{1}{N} \sum_{n=1}^{N} \ell(y_n, \hat{y}_n) \]
|
||||
|
||||
\begin{description}
|
||||
\item[Least-squares loss] \marginnote{Least-squares loss}
|
||||
The least-squares loss is defined as:
|
||||
\[ \ell(y_n, \hat{y}_n) = (y_n - \hat{y}_n)^2 \]
|
||||
|
||||
Therefore, the minimization task is:
|
||||
\[
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - f_\vec{\uptheta}(\vec{x}_n))^2 =
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - \vec{\uptheta}^T\vec{x}_n)^2 =
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
|
||||
\]
|
||||
\end{description}
|
||||
|
||||
\item[Expected risk] \marginnote{Expected risk}
|
||||
The expected risk is defined as:
|
||||
\[ \textbf{R}_\text{true}(f_\vec{\uptheta}) = \mathbb{E}_{\vec{x}, y}[\ell(y, f_\vec{\uptheta}(\vec{x}_\text{test}))] \]
|
||||
where the parameters $\vec{\uptheta}$ are fixed and the samples are taken from a test set.
|
||||
|
||||
\item[Overfitting] \marginnote{Overfitting}
|
||||
\sloppy
|
||||
A predictor $f_\vec{\uptheta}$ is overfitting when $\textbf{R}_\text{emp}(f, \matr{X}_\text{train}, \vec{y}_\text{train})$
|
||||
underestimates $\textbf{R}_\text{true}(f_\vec{\uptheta})$ (i.e. the loss on the training set is low, but on the test set is high).
|
||||
|
||||
\item[Regularization] \marginnote{Regularization}
|
||||
Method that introduces a penalty term to the loss that
|
||||
helps to find a compromise between the accuracy and the complexity of the solution:
|
||||
\[ \bar{\ell}(y_n, \hat{y}_n) = \ell(y_n, \hat{y}_n) + \lambda \mathcal{R}(\vec{\uptheta}) \]
|
||||
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the regularizer (penalty term).
|
||||
|
||||
\begin{description}
|
||||
\item[Regularized least squares] \marginnote{Regularized least squares}
|
||||
A simple regularization term for the least squares problem is $\Vert \vec{\uptheta} \Vert^2$.
|
||||
The problem becomes:
|
||||
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D}
|
||||
\{ \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
|
||||
\end{description}
|
||||
\end{description}
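A minimal numerical sketch of empirical risk minimization with the least-squares loss (the dataset is made up; the first column of $\matr{X}$ is the constant feature for $\uptheta_0$):
\begin{verbatim}
import numpy as np

def empirical_risk(theta, X, y):
    """R_emp: average least-squares loss over the dataset."""
    return np.mean((y - X @ theta) ** 2)

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, 50)
X = np.column_stack([np.ones_like(x), x])
y = 1.0 + 2.0 * x + 0.1 * rng.normal(size=50)

theta, *_ = np.linalg.lstsq(X, y, rcond=None)  # minimizer of the empirical risk

print(theta)                        # approximately (1, 2)
print(empirical_risk(theta, X, y))  # approximately the noise variance (0.01)
\end{verbatim}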
|
||||
|
||||
|
||||
\subsection{Maximum likelihood estimation (MLE)}
|
||||
% \marginnote{Maximum likelihood estimation (MLE)}
|
||||
Used for probabilistic models.
|
||||
The parameters are chosen as those under which the observed labels are most likely given the inputs.
|
||||
|
||||
\begin{description}
|
||||
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
|
||||
\sloppy
|
||||
Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$,
|
||||
the negative log-likelihood of $\bm{x}$ is:
|
||||
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
|
||||
Note that:
|
||||
\begin{itemize}
|
||||
\item The minus is added as we are converting the problem of maximizing the likelihood to a minimization problem.
|
||||
\item The logarithm turns products into sums and is useful for numerical stability.
|
||||
\end{itemize}
|
||||
$\mathcal{L}_{\bm{x}}(\vec{\uptheta})$ indicates how likely it is to observe $\bm{x}$ with
|
||||
$\vec{\uptheta}$ as the parameters of the predictor.
|
||||
|
||||
Given a dataset $(\bm{x}_n, y_n)$ of $N$ independent and identically distributed (i.i.d.) elements,
|
||||
maximizing the likelihood allows us to find the parameters under which the dataset is most likely.
|
||||
As the dataset is independent, we have that:
|
||||
\[ p_\vec{\uptheta}(\vec{y} \vert \matr{X}) = \prod_{n=1}^{N} p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
|
||||
where $\matr{X} = \begin{pmatrix} \bm{x}_1, \dots, \bm{x}_N \end{pmatrix}$ and
|
||||
$\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix}$.
|
||||
Moreover, as the dataset is identically distributed,
|
||||
each $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same distribution.
|
||||
|
||||
By applying the logarithm, we have that the negative log-likelihood of an i.i.d. dataset is defined as:
|
||||
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
|
||||
and to find good parameters $\vec{\uptheta}$, we solve the problem:
|
||||
\[
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) =
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n)
|
||||
\]
|
||||
|
||||
\begin{description}
|
||||
\item[Gaussian likelihood] \marginnote{Gaussian likelihood}
|
||||
Using a linear model $\bm{x}^T\vec{\uptheta}$ as predictor and
|
||||
assuming that the likelihood has a Gaussian distribution as follows:
|
||||
\[ p_\vec{\uptheta}(y_n \,\vert\, \bm{x}_n) = \mathcal{N}(y_n \,\vert\, \bm{x}_n^T\vec{\uptheta}, \sigma^2) \]
|
||||
where the Gaussian distribution has mean $\bm{x}_n^T\vec{\uptheta}$ (i.e. $f_\vec{\uptheta}(\bm{x}_n)$)
|
||||
and variance $\sigma^2$ for the $n$-th data point.
|
||||
|
||||
The negative log-likelihood is:
|
||||
\[
|
||||
\begin{split}
|
||||
\mathcal{L}(\vec{\uptheta}) &= -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \\
|
||||
&= -\sum_{n=1}^{N} \log \mathcal{N}(y_n \vert \bm{x}_n^T\vec{\uptheta}, \sigma^2) \\
|
||||
&= -\sum_{n=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) \right) \\
|
||||
&= -\sum_{n=1}^{N} \log\exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}} \\
|
||||
&= \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}
|
||||
\end{split}
|
||||
\]
|
||||
|
||||
The minimization problem becomes:
|
||||
\[
|
||||
\begin{split}
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) &=
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D}
|
||||
\overbrace{\frac{1}{2\sigma^2}}^{\mathclap{\text{constant}}}
|
||||
\sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 -
|
||||
\overbrace{\sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}}^{\mathclap{\text{constant}}} \\
|
||||
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 \\
|
||||
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
|
||||
\end{split}
|
||||
\]
|
||||
which corresponds to the least squares problem.
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{subfigure}{.45\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
|
||||
\caption{When the parameters are good, the label will be near the mean (i.e. predictor)}
|
||||
\end{subfigure}
|
||||
\hspace*{1em}
|
||||
\begin{subfigure}{.45\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.75\linewidth]{img/gaussian_mle_bad.png}
|
||||
\caption{When the parameters are bad, the label will be far from the mean}
|
||||
\end{subfigure}
|
||||
|
||||
\caption{Geometric interpretation of the Gaussian likelihood}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Maximum a posteriori estimation (MAP)}
|
||||
\marginnote{Maximum a posteriori (MAP)}
|
||||
Maximum a posteriori estimation works with the posterior distribution (instead of the likelihood used in MLE) and maximizes:
|
||||
\[
|
||||
\arg\max_{\vec{\uptheta} \in \mathbb{R}^D} p(\vec{\uptheta} \vert \matr{X}, \vec{y}) =
|
||||
\arg\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{\uptheta} \vert \matr{X}, \vec{y})
|
||||
\]
|
||||
In other words, it maximizes the probability of a set of parameters $\vec{\uptheta}$ given the observation of the dataset $(\matr{X}, \vec{y})$.
|
||||
By applying Bayes' theorem, the problem becomes:
|
||||
\[
|
||||
\begin{split}
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D}
|
||||
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
|
||||
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
|
||||
\end{split}
|
||||
\]
|
||||
|
||||
\begin{description}
|
||||
\item[Gaussian posteriori] \marginnote{Gaussian posteriori}
|
||||
By assuming that the conditional probability of the dataset follows a Gaussian distribution (as in MLE),
|
||||
the problem becomes:
|
||||
\[
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \} =
|
||||
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 -\log p(\vec{\uptheta}) \}
|
||||
\]
|
||||
|
||||
Moreover, assuming that $p(\vec{\uptheta}) \sim \mathcal{N}(\nullvec, \sigma^2\matr{I})$, we have that (up to an additive constant):
|
||||
\[ -\log p(\vec{\uptheta}) = \frac{1}{2\sigma^2} \Vert \vec{\uptheta} \Vert^2 \]
|
||||
|
||||
Therefore, the problem becomes:
|
||||
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
|
||||
MAP can thus be seen as a regularized version of MLE (see the sketch after this list).
|
||||
\end{description}
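A minimal sketch (made-up data; $\lambda$ chosen arbitrarily) showing that minimizing the negative log-posterior numerically gives the same parameters as the regularized (ridge) closed form:
\begin{verbatim}
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(1)
X = rng.normal(size=(20, 3))
y = X @ np.array([1.0, -2.0, 0.5]) + 0.3 * rng.normal(size=20)
lam = 2.0

def neg_log_posterior(theta):
    # -log p(y | X, theta) - log p(theta), dropping additive constants
    return np.sum((y - X @ theta) ** 2) + lam * np.sum(theta ** 2)

theta_map = minimize(neg_log_posterior, x0=np.zeros(3)).x
theta_ridge = np.linalg.solve(X.T @ X + lam * np.eye(3), X.T @ y)

print(theta_map)    # the two estimates coincide
print(theta_ridge)  # (up to the optimizer tolerance)
\end{verbatim}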
|
||||
|
||||
|
||||
|
||||
\section{Linear regression}
|
||||
\marginnote{Linear regression}
|
||||
Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
|
||||
where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is Gaussian noise,
|
||||
we want to estimate the function $f$.
|
||||
|
||||
\begin{description}
|
||||
\item[Model]
|
||||
We use as the predictor:
|
||||
\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
|
||||
Because of the noise, we use a probabilistic model with likelihood:
|
||||
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
|
||||
|
||||
\item[Parameter estimation]
|
||||
To estimate $\vec{\uptheta}$, we can use MLE:
|
||||
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} -p_\vec{\uptheta}(\vec{y} \vert \matr{X}) \]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Maximum likelihood estimation with features}
|
||||
\marginnote{MLE with features}
|
||||
Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
|
||||
Therefore, it is possible to apply any transformation to the inputs of the predictor $f$ such that:
|
||||
\[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
|
||||
where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ is a transformation and
|
||||
$\vec{\uptheta} \in \mathbb{R}^K$ are the parameters.
|
||||
|
||||
Given a dataset of $N$ entries $\vec{x}_n \in \mathbb{R}^D$ with labels $y_n \in \mathbb{R}$
|
||||
and a transformation function $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$,
|
||||
the transformed features can be expressed through a feature matrix $\matr{\Phi} \in \mathbb{R}^{N \times K}$:
|
||||
\[
|
||||
\matr{\Phi} =
|
||||
\begin{pmatrix}
|
||||
(\phi(\vec{x}_1))^T \\ \vdots \\ (\phi(\vec{x}_N))^T
|
||||
\end{pmatrix}
|
||||
=
|
||||
\begin{pmatrix}
|
||||
\phi_0(\vec{x}_1) & \cdots & \phi_{K-1}(\vec{x}_1) \\
|
||||
\vdots & \ddots & \vdots \\
|
||||
\phi_0(\vec{x}_N) & \cdots & \phi_{K-1}(\vec{x}_N) \\
|
||||
\end{pmatrix}
|
||||
\]
|
||||
|
||||
The negative log-likelihood can be defined as:
|
||||
\[
|
||||
-\log p_\vec{\uptheta}(\vec{y} \,\vert\, \matr{X}) =
|
||||
\frac{1}{2\sigma^2} (\vec{y} - \matr{\Phi}\vec{\uptheta})^T (\vec{y} - \matr{\Phi}\vec{\uptheta}) + \text{constant}
|
||||
\]
|
||||
As the objective is convex and $\matr{\Phi}$ is (usually) full rank, the problem can be solved directly using the normal equations:
|
||||
\[
|
||||
\matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \matr{\Phi}^T \vec{y} \iff
|
||||
\vec{\uptheta} = (\matr{\Phi}^T \matr{\Phi})^{-1} \matr{\Phi}^T \vec{y}
|
||||
\]
|
||||
Obviously, the negative log-likelihood can also be minimized by using a gradient method.
|
||||
|
||||
\begin{description}
|
||||
\item[Root mean square error (RMSE)] \marginnote{Root mean square error (RMSE)}
|
||||
RMSE is computed as:
|
||||
\[
|
||||
\sqrt{ \frac{1}{N} \Vert \vec{y} - \matr{\Phi}\vec{\uptheta} \Vert^2 } =
|
||||
\sqrt{ \frac{1}{N} \sum_{n=1}^{N}(y_n - (\phi(\vec{x}_n))^T\vec{\uptheta})^2 }
|
||||
\]
|
||||
Being an average, RMSE allows comparing errors across datasets of different sizes
|
||||
and, differently from MSE, the square root brings the error back to the same scale as the labels.
|
||||
|
||||
By comparing the RMSE of the train and test sets, it is possible to check if a model is overfitting.
|
||||
\end{description}
|
||||
|
||||
\begin{description}
|
||||
\item[Polynomial regression] \marginnote{Polynomial regression}
|
||||
The transformation function $\phi: \mathbb{R} \rightarrow \mathbb{R}^K$ is defined as:
|
||||
\[
|
||||
\phi(x) =
|
||||
\begin{pmatrix}
|
||||
\phi_0(x) \\ \phi_1(x) \\ \phi_2(x) \\ \vdots \\ \phi_{K-1}(x)
|
||||
\end{pmatrix}
|
||||
=
|
||||
\begin{pmatrix}
|
||||
1 \\ x \\ x^2 \\ \vdots \\ x^{K-1}
|
||||
\end{pmatrix}
|
||||
\]
|
||||
The predictor is then defined as:
|
||||
\[
|
||||
\begin{split}
|
||||
f(x) &= (\phi(x))^T \vec{\uptheta} \\
|
||||
&= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i = \sum_{i=0}^{K-1} x^i \vartheta_i
|
||||
\end{split}
|
||||
\]
|
||||
\end{description}
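Putting the pieces above together, a minimal sketch of polynomial regression (made-up noisy samples of a cubic function), fitting $\vec{\uptheta}$ by least squares on the feature matrix $\matr{\Phi}$ and reporting the RMSE:
\begin{verbatim}
import numpy as np

def polynomial_features(x, K):
    """Feature matrix Phi with phi_i(x) = x^i for i = 0, ..., K-1."""
    return np.vander(x, N=K, increasing=True)

def rmse(theta, Phi, y):
    return np.sqrt(np.mean((y - Phi @ theta) ** 2))

rng = np.random.default_rng(0)
x = rng.uniform(-1, 1, 30)
y = x ** 3 - x + 0.05 * rng.normal(size=30)

Phi = polynomial_features(x, K=4)
theta, *_ = np.linalg.lstsq(Phi, y, rcond=None)  # MLE / least squares

print(theta)                # approximately (0, -1, 0, 1)
print(rmse(theta, Phi, y))  # close to the noise level (~0.05)
\end{verbatim}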
|
||||
@ -0,0 +1,226 @@
|
||||
\chapter{Matrix decomposition}
|
||||
|
||||
|
||||
\section{Eigendecomposition}
|
||||
\marginnote{Eigendecomposition}
|
||||
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
|
||||
If the eigenvectors of $\matr{A}$ form a basis of $\mathbb{R}^n$,
|
||||
then $\matr{A} \in \mathbb{R}^{n \times n}$ can be decomposed into:
|
||||
\[ \matr{A} = \matr{P}\matr{D}\matr{P}^{-1} \]
|
||||
where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and
|
||||
$\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.
|
||||
|
||||
Note that a symmetric matrix can always be decomposed in this way (\Cref{th:spectral_theorem}).
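A minimal sketch (made-up symmetric matrix, so the decomposition surely exists):
\begin{verbatim}
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])
eigvals, P = np.linalg.eigh(A)   # eigh: eigendecomposition of a symmetric matrix
D = np.diag(eigvals)

print(np.allclose(A, P @ D @ np.linalg.inv(P)))  # True: A = P D P^{-1}
\end{verbatim}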
|
||||
|
||||
|
||||
|
||||
\section{Singular value decomposition}
|
||||
\marginnote{Singular value decomposition}
|
||||
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r \in [0, \min\{m, n\}]$.
|
||||
The singular value decomposition (SVD) of $\matr{A}$ always exists and has the form:
|
||||
\[
|
||||
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
|
||||
\]
|
||||
\[
|
||||
=
|
||||
\begin{pmatrix}
|
||||
\begin{pmatrix} \\ \vec{u}_1 \\ \\ \end{pmatrix} &
|
||||
\dots &
|
||||
\begin{pmatrix} \\ \vec{u}_m \\ \\ \end{pmatrix}
|
||||
\end{pmatrix}
|
||||
\begin{pmatrix}
|
||||
\sigma_1 & 0 & 0 \\
|
||||
0 & \ddots & 0 \\
|
||||
0 & 0 & \sigma_{\min\{m, n\}} \\
|
||||
\end{pmatrix}
|
||||
\begin{pmatrix}
|
||||
\begin{pmatrix} & \vec{v}_1 & \end{pmatrix} \\
|
||||
\vdots \\
|
||||
\begin{pmatrix} & \vec{v}_n & \end{pmatrix} \\
|
||||
\end{pmatrix}
|
||||
\]
|
||||
where:
|
||||
\begin{itemize}
|
||||
\item
|
||||
$\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix whose columns $\vec{u}_i$ are called left-singular vectors.
|
||||
|
||||
\item
|
||||
$\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix whose columns $\vec{v}_i$ are called right-singular vectors.
|
||||
|
||||
\item
|
||||
$\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a rectangular diagonal matrix (i.e. $\matr{\Sigma}_{i,j} = 0$ for $i \neq j$) with
|
||||
the singular values $\sigma_i, i = 1 \dots \min\{m, n\}$ on the diagonal.
|
||||
By convention $\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_r \geq 0$.
|
||||
Note that singular values $\sigma_j = 0$ for $(r + 1) \leq j \leq \min\{m, n\}$
|
||||
(i.e. singular values at indexes after $\text{rank}(\matr{A})$ are always 0).
|
||||
\end{itemize}
|
||||
|
||||
\marginnote{Singular value equation}
|
||||
We can also represent SVD as a \textbf{singular value equation}, which resembles the eigenvalue equation:
|
||||
\[ \matr{A}\vec{v}_i = \sigma_i\vec{u}_i \text{ for } i = 1, \dots, r \]
|
||||
This is derived from:
|
||||
\[
|
||||
\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
|
||||
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}\matr{V}^T\matr{V}
|
||||
\iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}
|
||||
\]
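A minimal sketch (made-up rectangular matrix) of the decomposition and of the singular value equation:
\begin{verbatim}
import numpy as np

A = np.array([[3.0, 2.0,  2.0],
              [2.0, 3.0, -2.0]])               # m = 2, n = 3

U, s, Vt = np.linalg.svd(A)                    # s: singular values, descending
Sigma = np.zeros(A.shape)
Sigma[:len(s), :len(s)] = np.diag(s)

print(np.allclose(A, U @ Sigma @ Vt))          # True: A = U Sigma V^T
print(np.allclose(A @ Vt[0], s[0] * U[:, 0]))  # True: A v_1 = sigma_1 u_1
\end{verbatim}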
|
||||
|
||||
\subsection{Singular values and eigenvalues}
|
||||
\marginnote{Eigendecomposition of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$}
|
||||
Given $\matr{A} \in \mathbb{R}^{m \times n}$, we can obtain the eigenvalues and eigenvectors
|
||||
of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$ through SVD.
|
||||
|
||||
For $\matr{A}^T\matr{A}$, we can compute:
|
||||
\[
|
||||
\begin{split}
|
||||
\matr{A}^T\matr{A} & = (\matr{U}\matr{\Sigma}\matr{V}^T)^T(\matr{U}\matr{\Sigma}\matr{V}^T) \text{ using } (\matr{A}\matr{B})^T = \matr{B}^T\matr{A}^T \\
|
||||
& = (\matr{V}\matr{\Sigma}^T\matr{U}^T)(\matr{U}\matr{\Sigma}\matr{V}^T) \\
|
||||
& = \matr{V}\matr{\Sigma}^T\matr{\Sigma}\matr{V}^T \\
|
||||
& = \matr{V}\matr{\Sigma}^2\matr{V}^T
|
||||
\end{split}
|
||||
\]
|
||||
As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
|
||||
\begin{itemize}
|
||||
\item The diagonal entries of $\matr{\Sigma}^2$ (i.e. the squared singular values of $\matr{A}$) are the eigenvalues of $\matr{A}^T\matr{A}$.
|
||||
\item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
|
||||
\end{itemize}
|
||||
|
||||
The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
|
||||
|
||||
|
||||
\subsection{Singular values and 2-norm}
|
||||
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
|
||||
we have that $\matr{A}^T\matr{A} = \matr{A}^2 = \matr{A}\matr{A}^T$ (as $\matr{A}^T = \matr{A}$).
|
||||
|
||||
The eigenvalues of $\matr{A}^2$ are $\lambda_1^2, \dots,\lambda_n^2$, where $\lambda_i$ are eigenvalues of $\matr{A}$.
|
||||
Alternatively, the eigenvalues of $\matr{A}^2$ are the squared singular values of $\matr{A}$: $\lambda_i^2 = \sigma_i^2$.
|
||||
Moreover, the eigenvalues of $\matr{A}^{-1}$ are $\frac{1}{\lambda_1}, \dots, \frac{1}{\lambda_n}$.
|
||||
|
||||
\marginnote{2-norm using SVD}
|
||||
We can compute the 2-norm as:
|
||||
\[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
|
||||
\[
|
||||
\Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} =
|
||||
\sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} =
|
||||
\sqrt{\max \left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2} \right\}} = \frac{1}{\sigma_r}
|
||||
\]
|
||||
Furthermore, we can compute the condition number of $\matr{A}$ as:
|
||||
\[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} \]
|
||||
|
||||
|
||||
|
||||
\subsection{Application: Matrix approximation}
|
||||
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ and its SVD decomposition $\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T$,
|
||||
we can construct a rank-1 matrix (dyad) $\matr{A}_i \in \mathbb{R}^{m \times n}$ as: \marginnote{Dyad}
|
||||
\[ \matr{A}_i = \vec{u}_i \vec{v}_i^T \]
|
||||
where $\vec{u}_i \in \mathbb{R}^m$ is the $i$-th column of $\matr{U}$ and
|
||||
$\vec{v}_i \in \mathbb{R}^n$ is the $i$-th column of $\matr{V}$.
|
||||
Then, we can compose $\matr{A}$ as a sum of dyads:
|
||||
\[ \matr{A} = \sum_{i=1}^{r} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{r} \sigma_i \matr{A}_i \]
|
||||
|
||||
\marginnote{Rank-$k$ approximation}
|
||||
By considering only the first $k < r$ singular values, we can obtain a rank-$k$ approximation of $\matr{A}$:
|
||||
\[ \hat{\matr{A}}(k) = \sum_{i=1}^{k} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{k} \sigma_i \matr{A}_i \]
|
||||
|
||||
\begin{theorem}[Eckart-Young]
|
||||
Given $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r$.
|
||||
For any $k \leq r$ (this theorem is interesting for $k < r$), the rank-$k$ approximation is:
|
||||
\[
|
||||
\hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2
|
||||
\]
|
||||
\end{theorem}
|
||||
In other words, among all the matrices of rank $k$, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$ in 2-norm.
|
||||
Moreover, the error of the rank-$k$ approximation is:
|
||||
\[
|
||||
\Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 =
|
||||
\left\Vert \sum_{i=1}^{r} \sigma_i \matr{A}_i - \sum_{j=1}^{k} \sigma_j \matr{A}_j \right\Vert_2 =
|
||||
\left\Vert \sum_{i=k+1}^{r} \sigma_i \matr{A}_i \right\Vert_2 =
|
||||
\sigma_{k+1}
|
||||
\]
|
||||
|
||||
\subsubsection{Image compression}
|
||||
Each dyad requires $1 + m + n$ (respectively for $\sigma_i$, $\vec{u}_i$ and $\vec{v}_i$) numbers to be stored.
|
||||
A rank-$k$ approximation requires to store $k(1 + m + n)$ numbers.
|
||||
Therefore, the compression factor is given by: \marginnote{Compression factor}
|
||||
\[
|
||||
c_k = 1 - \frac{k(1 + m + n)}{mn}
|
||||
\]
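A minimal sketch (the ``image'' is a made-up random matrix; a real grayscale image would be handled in the same way) of the rank-$k$ approximation and its compression factor:
\begin{verbatim}
import numpy as np

def rank_k_approx(A, k):
    """Truncated SVD: keep only the k largest singular values."""
    U, s, Vt = np.linalg.svd(A, full_matrices=False)
    return (U[:, :k] * s[:k]) @ Vt[:k]

rng = np.random.default_rng(0)
A = rng.normal(size=(100, 80))
m, n = A.shape
k = 10

A_k = rank_k_approx(A, k)
print(np.linalg.matrix_rank(A_k))     # 10
print(1 - k * (1 + m + n) / (m * n))  # compression factor c_k
\end{verbatim}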
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.60\textwidth]{img/_rank_k_approx.pdf}
|
||||
\caption{Approximation of an image}
|
||||
\end{figure}
|
||||
|
||||
|
||||
|
||||
\subsection{Application: Linear least squares problem} \label{sec:lls}
|
||||
Given a least squares problem:
|
||||
\[
|
||||
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
|
||||
\]
|
||||
When $\text{rank}(\matr{A}) < n$, the system admits infinitely many solutions.
|
||||
Of all the solutions $S$, we are interested in the one with minimum norm:
|
||||
\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
|
||||
This problem can be solved using SVD:
|
||||
\[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
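A minimal sketch (made-up rank-deficient matrix) checking the SVD formula against the pseudo-inverse solution, which gives the minimum-norm least squares solution:
\begin{verbatim}
import numpy as np

A = np.array([[1.0, 1.0],
              [2.0, 2.0],
              [3.0, 3.0]])      # rank 1 < n = 2
b = np.array([1.0, 2.0, 2.0])

U, s, Vt = np.linalg.svd(A)
r = np.sum(s > 1e-12)           # numerical rank
x_star = sum((U[:, i] @ b) / s[i] * Vt[i] for i in range(r))

print(np.allclose(x_star, np.linalg.pinv(A) @ b))  # True
\end{verbatim}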
|
||||
|
||||
|
||||
\subsection{Application: Polynomial interpolation}
|
||||
\marginnote{Polynomial interpolation}
|
||||
Given a set of $m$ data $(x_i, y_i), i=1, \dots, m$,
|
||||
we want to find a polynomial of degree $n$ ($m > n$) that approximates it.
|
||||
In other words, we want to find a function:
|
||||
\[ f(x) = c_0 + c_1 x + c_2 x^2 + \dots + c_n x^n \]
|
||||
that minimizes the residual vector $\vec{r} = (r_1, \dots, r_m)$,
|
||||
where $r_i = \vert y_i - f(x_i) \vert$.
|
||||
We can formulate this as a linear system:
|
||||
\[
|
||||
\vec{r} = \vec{y} - \matr{A}\vec{c} =
|
||||
\begin{pmatrix}
|
||||
y_1 \\
|
||||
\vdots \\
|
||||
y_m
|
||||
\end{pmatrix}
|
||||
-
|
||||
\begin{pmatrix}
|
||||
1 & x_1 & x_1^2 & \dots & x_1^n \\
|
||||
\vdots & \vdots & \vdots & \ddots & \vdots \\
|
||||
1 & x_m & x_m^2 & \dots & x_m^n
|
||||
\end{pmatrix}
|
||||
\begin{pmatrix}
|
||||
c_0 \\
|
||||
\vdots \\
|
||||
c_n
|
||||
\end{pmatrix}
|
||||
\]
|
||||
that can be solved as a linear least squares problem:
|
||||
\[ \min_{\vec{c} \in \mathbb{R}^{n+1}} \Vert \vec{y} - \matr{A}\vec{c} \Vert_2^2 \]
|
||||
|
||||
\begin{figure}[h]
|
||||
\centering
|
||||
\includegraphics[width=0.40\textwidth]{img/linear_regression.png}
|
||||
\caption{Interpolation using a polynomial of degree 1}
|
||||
\end{figure}
|
||||
|
||||
|
||||
|
||||
\section{Eigendecomposition vs SVD}
|
||||
\begin{center}
|
||||
\begin{tabular}{m{16em} | m{16em}}
|
||||
\hline
|
||||
\multicolumn{1}{c|}{\textbf{Eigendecomposition}} & \multicolumn{1}{c}{\textbf{SVD}} \\
|
||||
\multicolumn{1}{c|}{$\matr{A} = \matr{P}\matr{D}\matr{P}^{-1}$} & \multicolumn{1}{c}{$\matr{A}=\matr{U}\matr{\Sigma}\matr{V}^T$} \\
|
||||
\hline
|
||||
Only defined for square matrices $\matr{A} \in \mathbb{R}^{n \times n}$ with eigenvectors that form a basis of $\mathbb{R}^n$
|
||||
& Always exists \\
|
||||
\hline
|
||||
$\matr{P}$ is not necessarily orthogonal & $\matr{U}$ and $\matr{V}$ are orthogonal \\
|
||||
\hline
|
||||
The elements on the diagonal of $\matr{D}$ may be in $\mathbb{C}$
|
||||
& The elements on the diagonal of $\matr{\Sigma}$ are all non-negative reals \\
|
||||
\hline
|
||||
\multicolumn{2}{c}{For symmetric positive semidefinite matrices, eigendecomposition and SVD coincide} \\
|
||||
\hline
|
||||
\end{tabular}
|
||||
\end{center}
|
||||
@ -0,0 +1,522 @@
|
||||
\chapter{Probability and statistics}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[Probability]
|
||||
Model of a process where the underlying uncertainty is captured by random variables.
|
||||
\item[Statistics]
|
||||
Determines the underlying process that explains an observation.
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Probability}
|
||||
\begin{description}
|
||||
\item[State space] \marginnote{State space}
|
||||
Set $\Omega$ of all the possible results of an experiment.
|
||||
\begin{example}
|
||||
A coin is tossed two times.
|
||||
$\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$
|
||||
\end{example}
|
||||
|
||||
\item[Event] \marginnote{Event}
|
||||
Set of possible results (i.e. $A$ is an event if $A \subseteq \Omega$)
|
||||
|
||||
\item[Probability] \marginnote{Probability}
|
||||
Let $\mathcal{E}$ be the set of all the possible events (i.e. power set of $\Omega$).
|
||||
The probability of an event is a function:
|
||||
\[ \prob{\cdot}: \mathcal{E} \rightarrow [0, 1] \]
|
||||
\begin{example}
|
||||
Let $\Omega$ be as above.
|
||||
Given an event $A = \{ (\text{T}, \text{H}), (\text{H}, \text{T}) \}$,
|
||||
its probability is: $\prob{A} = \frac{2}{4} = \frac{1}{2}$
|
||||
\end{example}
|
||||
|
||||
\item[Conditional probability] \marginnote{Conditional probability}
|
||||
Probability of an event $B$, knowing that another event $A$ happened:
|
||||
\[ \prob{B \vert A} = \frac{\prob{A \cap B}}{\prob{A}} \text{, with } \prob{A} \neq 0 \]
|
||||
|
||||
\begin{example}
|
||||
A coin is tossed three times.
|
||||
Given the events $A = \{ \text{at least two tails} \}$ and $B = \{ \text{at least one heads and one tails} \}$.
|
||||
We have that:
|
||||
|
||||
\begin{minipage}{\linewidth}
|
||||
\centering
|
||||
\small
|
||||
$\Omega = \{
|
||||
(\text{T}, \text{T}, \text{T}), (\text{T}, \text{T}, \text{H}), (\text{T}, \text{H}, \text{T}),
|
||||
(\text{T}, \text{H}, \text{H}), (\text{H}, \text{T}, \text{T}), (\text{H}, \text{T}, \text{H}),
|
||||
(\text{H}, \text{H}, \text{T}), (\text{H}, \text{H}, \text{H})
|
||||
\}$
|
||||
\end{minipage}
|
||||
|
||||
\begin{minipage}{.325\linewidth}
|
||||
\centering
|
||||
$\prob{A} = \frac{4}{8} = \frac{1}{2}$
|
||||
\end{minipage}
|
||||
\begin{minipage}{.325\linewidth}
|
||||
\centering
|
||||
$\prob{B} = \frac{6}{8} = \frac{3}{4}$
|
||||
\end{minipage}
|
||||
\begin{minipage}{.325\linewidth}
|
||||
\centering
|
||||
$\prob{A \cap B} = \frac{3}{8}$
|
||||
\end{minipage}
|
||||
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
$\prob{A \vert B} = \frac{3/8}{3/4} = \frac{1}{2}$
|
||||
\end{minipage}
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
$\prob{B \vert A} = \frac{3/8}{1/2} = \frac{3}{4}$
|
||||
\end{minipage}
|
||||
\end{example}
|
||||
|
||||
\item[Independent events] \marginnote{Independent events}
|
||||
Two events $A$ and $B$ are independent if:
|
||||
\[ \prob{A \cap B} = \prob{A}\prob{B} \]
|
||||
It follows that:
|
||||
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
$\prob{A \vert B} = \prob{A}$
|
||||
\end{minipage}
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
$\prob{B \vert A} = \prob{B}$
|
||||
\end{minipage}
|
||||
|
||||
In general, given $n$ events $A_1, \dots, A_n$, they are independent if:
|
||||
\[ \prob{A_1 \cap \dots \cap A_n} = \prod_{i=1}^{n} \prob{A_i} \]
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Random variables}
|
||||
\begin{description}
|
||||
\item[Random variable (RV)] \marginnote{Random variable}
|
||||
A random variable $X$ is a function:
|
||||
\[ X: \Omega \rightarrow \mathbb{R} \]
|
||||
|
||||
\item[Target space/Support] \marginnote{Target space}
|
||||
Given a random variable $X$,
|
||||
the target space (or support) $\mathcal{T}_X$ of $X$ is the set of all its possible values:
|
||||
\[ \mathcal{T}_X = \{ X(\omega) \mid \omega \in \Omega \} \]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Discrete random variables}
|
||||
|
||||
\begin{description}
|
||||
\item[Discrete random variable] \marginnote{Discrete random variable}
|
||||
A random variable $X$ is discrete if its target space $\mathcal{T}_X$ is finite or countably infinite.
|
||||
|
||||
\begin{example}
|
||||
A coin is tossed twice.
|
||||
|
||||
Given the random variable $X(\omega) = \{ \text{number of heads} \}$.
|
||||
We have that $\mathcal{T}_X = \{ 0, 1, 2 \}$, therefore $X$ is discrete.
|
||||
\end{example}
|
||||
|
||||
\begin{example}
|
||||
Roll a die until 6 comes out.
|
||||
|
||||
Given the random variable $Y(\omega) = \{ \text{number of rolls until 6 comes out} \}$.
|
||||
We have that $\mathcal{T}_Y = \{ 1, 2, \dots \} = \mathbb{N} \smallsetminus \{0\}$,
|
||||
therefore $Y$ is discrete as $\mathcal{T}_Y$ is a countable set.
|
||||
\end{example}
|
||||
|
||||
\item[Probability mass function (PMF)] \marginnote{Probability mass function (PMF)}
|
||||
Given a discrete random variable $X$, its probability mass function is a function $p_X: \mathcal{T}_X \rightarrow [0, 1]$ such that:
|
||||
\[ p_X(x) = \prob{X = x}, \forall x \in \mathcal{T}_X \]
|
||||
|
||||
A PMF has the following properties:
|
||||
\begin{enumerate}
|
||||
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
|
||||
\item $\sum_{x \in \mathcal{T}_X} p_X(x) = 1$
|
||||
\item Let $A \subseteq \mathcal{T}_X$, then $\prob{X \in A} = \sum_{x \in A} p_X(x)$
|
||||
\end{enumerate}
|
||||
|
||||
We denote with $X \sim p_X$ a random variable $X$ with PMF $p_X$.
|
||||
|
||||
\begin{example}
|
||||
Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
|
||||
Given a random variable $X = \{ \text{number of heads} \}$ with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
|
||||
Its PMF is:
|
||||
\[
|
||||
\begin{split}
|
||||
p_X(0) &= \prob{X = 0} = \frac{1}{4} \\
|
||||
p_X(1) &= \prob{X = 1} = \frac{2}{4} \\
|
||||
p_X(2) &= \prob{X = 2} = \frac{1}{4}
|
||||
\end{split}
|
||||
\]
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Continuous random variables}
|
||||
|
||||
\begin{description}
|
||||
\item[Continuous random variable] \marginnote{Continuous random variable}
|
||||
A random variable $X$ is continuous if its target space $\mathcal{T}_X$ is uncountably infinite (i.e. an uncountable subset of $\mathbb{R}$).
|
||||
Usually, $\mathcal{T}_X$ is an interval or a union of intervals.
|
||||
|
||||
\begin{example}
|
||||
Given a random variable $Z = \{ \text{Time before the arrival of a client} \}$.
|
||||
$Z$ is continuous as $\mathcal{T}_Z = [a, b] \subseteq [0, +\infty[$ is an uncountable set.
|
||||
\end{example}
|
||||
|
||||
\item[Probability density function (PDF)] \marginnote{Probability density function (PDF)}
|
||||
Given a continuous random variable $X$,
|
||||
its probability density function is a function $p_X: \mathcal{T}_X \rightarrow \mathbb{R}$ such that:
|
||||
\[ \prob{X \in A} = \int_{A} p_X(x) \,dx \]
|
||||
\[ \prob{a \leq X \leq b} = \int_{a}^{b} p_X(x) \,dx \]
|
||||
Note that $\prob{X = a} = \prob{a \leq X \leq a} = \int_{a}^{a} p_X(x) \,dx = 0$
|
||||
|
||||
A PDF has the following properties:
|
||||
\begin{enumerate}
|
||||
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
|
||||
\item $\int_{x \in \mathcal{T}_X} p_X(x) \,dx = 1$
|
||||
\item $\prob{X \in A} = \int_{A} p_X(x) \,dx$
|
||||
\end{enumerate}
|
||||
|
||||
We denote with $X \sim p_X$ a random variable $X$ with PDF $p_X$.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Discrete joint distribution}
|
||||
|
||||
\begin{description}
|
||||
\item[Univariate distribution] \marginnote{Univariate distribution}
|
||||
Distribution with one random variable.
|
||||
|
||||
\item[Multivariate distribution] \marginnote{Multivariate distribution}
|
||||
Distribution with multiple random variables.
|
||||
|
||||
\item[Joint probability] \marginnote{Joint probability}
|
||||
Let $X$ and $Y$ be random variables respectively with target space $\mathcal{T}_X$ and $\mathcal{T}_Y$.
|
||||
The joint probability of $X$ and $Y$ has target space $\mathcal{T}_{XY} = \mathcal{T}_X \times \mathcal{T}_Y$
|
||||
and its PMF is:
|
||||
\[ p_{XY}(x_i, y_j) = \prob{X = x_i \cap Y = y_j} \]
|
||||
|
||||
$p_X(x)$ and $p_Y(y)$ are the \textbf{marginal probabilities}. \marginnote{Marginal probability}
|
||||
|
||||
\begin{example}
|
||||
Let $X$ and $Y$ be random variables respectively with five and three possible states.
|
||||
\begin{center}
|
||||
\includegraphics[width=0.4\textwidth]{img/_joint_probability_example.pdf}
|
||||
\end{center}
|
||||
We denote with:
|
||||
\begin{itemize}
|
||||
\item $N$ the number of events.
|
||||
\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p_{XY}(x_i, y_j) = \frac{n_{ij}}{N}$).
|
||||
\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column.
|
||||
\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row.
|
||||
\end{itemize}
|
||||
|
||||
The marginal probabilities are:\\
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
\[ p_X(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
|
||||
\end{minipage}
|
||||
\begin{minipage}{.48\linewidth}
|
||||
\centering
|
||||
\[ p_Y(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
|
||||
\end{minipage}
|
||||
|
||||
The conditional probabilities can be computed as:
|
||||
\[ \prob{Y = y_j \vert X = x_i} = \frac{p_{XY}(x_i, y_j)}{p_X(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
|
||||
\[ \prob{X = x_i \vert Y = y_j} = \frac{p_{XY}(x_i, y_j)}{p_Y(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Rules of probability}
|
||||
|
||||
\subsection{Sum rule}
|
||||
\marginnote{Sum rule\\Marginalization property}
|
||||
Given $X$ and $Y$ random variables. The sum rule states that:
|
||||
\[
|
||||
p_X(\bm{x}) =
|
||||
\begin{cases}
|
||||
\sum_{\bm{y} \in \mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
|
||||
\int_{\mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
|
||||
\end{cases}
|
||||
\]
|
||||
|
||||
The sum rule relates the joint distribution and the marginal distribution.
|
||||
In fact, the sum rule can be applied to any subset of the random variables of a joint distribution.
|
||||
Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$,
|
||||
the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
|
||||
\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\smallsetminus i} \]
|
||||
|
||||
\subsection{Product rule}
|
||||
\marginnote{Product rule}
|
||||
\[ p(\bm{x}, \bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) = p(\bm{x} \vert \bm{y}) p(\bm{y}) \]
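A minimal numerical sketch of the sum and product rules on a made-up table of counts (rows index the states of $X$, columns the states of $Y$; the layout is illustrative):
\begin{verbatim}
import numpy as np

n = np.array([[10,  5,  5],
              [20, 10, 10],
              [ 5,  5, 10],
              [ 5, 10,  0],
              [ 0,  5,  5]])       # n[i, j]: events with X = x_i and Y = y_j
N = n.sum()

p_xy = n / N                       # joint PMF
p_x = p_xy.sum(axis=1)             # sum rule: marginal of X
p_y = p_xy.sum(axis=0)             # sum rule: marginal of Y
p_y_given_x = p_xy / p_x[:, None]  # product rule: p(y | x) = p(x, y) / p(x)

print(p_x.sum(), p_y.sum())        # both approximately 1
print(np.allclose(p_y_given_x * p_x[:, None], p_xy))  # True
\end{verbatim}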
|
||||
|
||||
|
||||
|
||||
\section{Bayes' theorem}
|
||||
\begin{theorem}
|
||||
\marginnote{Bayes' theorem}
|
||||
Given two random variables $X$ and $Y$:
|
||||
\[
|
||||
\overbrace{p(\bm{x} \vert \bm{y})}^{\mathclap{\text{posterior}}} =
|
||||
\frac
|
||||
{ \overbrace{p(\bm{y} \vert \bm{x})}^{\mathclap{\text{likelihood }}} \overbrace{p(\bm{x})}^{\mathclap{\text{ prior}}} }
|
||||
{\underbrace{p(\bm{y})}_{\mathclap{\text{evidence}}}}
|
||||
\]
|
||||
where:
|
||||
\begin{descriptionlist}
|
||||
\item[Prior] \marginnote{Prior}
|
||||
is the prior knowledge of the unobserved data $\bm{x}$.
|
||||
|
||||
\item[Likelihood] \marginnote{Likelihood}
|
||||
describes the relation between $\bm{x}$ and $\bm{y}$.
|
||||
|
||||
\item[Posterior] \marginnote{Posterior}
|
||||
represents the quantity of interest (i.e. knowledge on $\bm{x}$ after observing $\bm{y}$).
|
||||
|
||||
\item[Evidence/Marginal likelihood] \marginnote{Evidence/Marginal likelihood}
|
||||
normalizes the posterior. It is defined independently from $\bm{x}$ (i.e. is constant) as:
|
||||
\[ p(\bm{y}) = \int p(\bm{y} \vert \bm{x}) p(\bm{x}) \,d\bm{x} \]
|
||||
\end{descriptionlist}
|
||||
\end{theorem}
|
||||
\begin{proof}
|
||||
This is a direct consequence of the product rule:
|
||||
\[
|
||||
p(\bm{x} \vert \bm{y}) p(\bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) \iff
|
||||
p(\bm{x} \vert \bm{y}) = \frac{p(\bm{y} \vert \bm{x}) p(\bm{x})}{p(\bm{y})}
|
||||
\]
|
||||
\end{proof}
|
||||
|
||||
Note: sometimes, instead of the full posterior, the maximum is considered (with loss of information):
|
||||
\[ \max_x p(x \vert y) = \max_x \frac{p(y \vert x) p(x)}{\underbrace{p(y)}_{\mathclap{\text{constant}}}} = \max_x p(y \vert x) p(x) \]
|
||||
|
||||
|
||||
|
||||
\section{Statistics}
|
||||
|
||||
\begin{description}
|
||||
\item[Statistic] \marginnote{Statistic}
|
||||
A statistic of a random variable is a deterministic function defined on it.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Mean}
|
||||
\begin{description}
|
||||
\item[Expected value (univariate)] \marginnote{Expected value (univariate)}
|
||||
Given a function $g$ of a random variable $X \sim p(x)$,
|
||||
its expected value is:
|
||||
\[
|
||||
\mathbb{E}_X[g(x)] =
|
||||
\begin{cases}
|
||||
\sum_{x \in \mathcal{T}_X} g(x)p(x) & \text{if } X \text{ is discrete} \\
|
||||
\int_{\mathcal{T}_X} g(x)p(x) \,dx & \text{if } X \text{ is continuous} \\
|
||||
\end{cases}
|
||||
\]
|
||||
|
||||
\item[Expected value (multivariate)] \marginnote{Expected value (multivariate)}
|
||||
A multivariate random variable $X$ can be seen as
|
||||
a vector of univariate random variables $\begin{pmatrix} X_1, \dots, X_D \end{pmatrix}^T$.
|
||||
Its expected value can be computed element-wise as:
|
||||
\[
|
||||
\mathbb{E}_X[g(\bm{x})] =
|
||||
\begin{pmatrix} \mathbb{E}_{X_1}[g(x_1)] \\ \vdots \\ \mathbb{E}_{X_D}[g(x_D)] \end{pmatrix} \in \mathbb{R}^D
|
||||
\]
|
||||
|
||||
\item[Mean] \marginnote{Mean}
|
||||
Given a random variable $X \sim p(x)$,
|
||||
the mean of $X$ is its expected value with $g$ defined as the identity:
|
||||
\[
|
||||
\mathbb{E}_X[x] =
|
||||
\begin{cases}
|
||||
\sum_{x \in \mathcal{T}_X} x \cdot p(x) & \text{if } X \text{ is discrete} \\
|
||||
\int_{\mathcal{T}_X} x \cdot p(x) \,dx & \text{if } X \text{ is continuous} \\
|
||||
\end{cases}
|
||||
\]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Variance}
|
||||
\begin{description}
|
||||
\item[Covariance (univariate)] \marginnote{Covariance (univariate)}
|
||||
Given two univariate random variables $X$ and $Y$, their covariance is:
|
||||
\[ \text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[(x - \mathbb{E}_X[x])(y - \mathbb{E}_Y[y])] \]
|
||||
|
||||
\begin{lemma}
|
||||
$\text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[xy] - \mathbb{E}_{X}[x]\mathbb{E}_{Y}[y]$
|
||||
\end{lemma}
|
||||
|
||||
\item[Variance (univariate)] \marginnote{Variance (univariate)}
|
||||
The variance of a univariate random variable is given by:
|
||||
\[ \mathbb{V}_X[x] = \text{Cov}_X[x, x] \]
|
||||
Its square root is the standard deviation $\sigma(x)$.
|
||||
|
||||
\item[Covariance (multivariate)] \marginnote{Covariance (multivariate)}
|
||||
Given two multivariate random variables
|
||||
$X$ and $Y$ with states $\bm{x} \in \mathbb{R}^D$ and $\bm{y} \in \mathbb{R}^E$,
|
||||
their covariance is:
|
||||
\[
|
||||
\text{Cov}_{XY}[\bm{x}, \bm{y}] = \text{Cov}_{XY}[\bm{y}, \bm{x}]^T =
|
||||
\mathbb{E}_{XY}[\bm{xy}^T] - \mathbb{E}_{X}[\bm{x}]\mathbb{E}_{Y}[\bm{y}]^T \in \mathbb{R}^{D \times E}
|
||||
\]
|
||||
|
||||
|
||||
\item[Variance (multivariate)] \marginnote{Variance (multivariate)}
|
||||
Given a multivariate random variable $X$ with
|
||||
states $\bm{x} \in \mathbb{R}^D$ and mean vector $\bm{\mu} \in \mathbb{R}^D$.
|
||||
Its variance is given by:
|
||||
\[
|
||||
\begin{split}
|
||||
\mathbb{V}_X[\bm{x}] &= \text{Cov}_X[\bm{x}, \bm{x}] \\
|
||||
&= \mathbb{E}_X[\bm{xx}^T] - \mathbb{E}_X[\bm{x}]\mathbb{E}_X[\bm{x}]^T \\
|
||||
&=
|
||||
\begin{pmatrix}
|
||||
\text{Cov}[x_1, x_1] & \text{Cov}[x_1, x_2] & \cdots & \text{Cov}[x_1, x_D] \\
|
||||
\text{Cov}[x_2, x_1] & \text{Cov}[x_2, x_2] & \cdots & \text{Cov}[x_2, x_D] \\
|
||||
\vdots & \vdots & \ddots & \vdots \\
|
||||
\text{Cov}[x_D, x_1] & \text{Cov}[x_D, x_2] & \cdots & \text{Cov}[x_D, x_D] \\
|
||||
\end{pmatrix} \in \mathbb{R}^{D \times D}
|
||||
\end{split}
|
||||
\]
|
||||
This matrix is called covariance matrix and is symmetric positive semidefinite.
|
||||
|
||||
\item[Correlation] \marginnote{Correlation}
|
||||
Given two random variables $X$ and $Y$, their correlation is:
|
||||
\[ \text{corr}[x, y] = \frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}} \in [-1, 1] \]
|
||||
\begin{itemize}
|
||||
\item When $\text{corr}[x, y] \rightarrow +1$, $x$ and $y$ are expected to grow together.
|
||||
\item When $\text{corr}[x, y] \rightarrow -1$, $x$ grows when $y$ decreases and vice versa.
|
||||
\item When $\text{corr}[x, y] \rightarrow 0$, $x$ and $y$ are not correlated.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Empirical mean and variance}
|
||||
In practice, it is not always possible to compute statistics on the real population.
|
||||
Empirical observations can be made on a (finite) subset of the real population sampled as
|
||||
a finite number of independent and identically distributed random variables $X_1, \dots, X_N$.
|
||||
|
||||
\begin{description}
|
||||
\item[Empirical mean] \marginnote{Empirical mean}
|
||||
\[ \bar{x} = \frac{1}{N} \sum_{n=1}^{N}x_n \]
|
||||
\item[Empirical variance] \marginnote{Empirical variance}
|
||||
\[ \sigma^2 = \frac{1}{N} \sum_{n=1}^{N}(x_n - \bar{x})^2 \]
|
||||
\end{description}
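A minimal sketch (samples drawn from a made-up Gaussian population):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(loc=3.0, scale=2.0, size=10_000)  # population: N(3, 4)

x_bar = x.mean()                      # empirical mean, close to 3
sigma2 = np.mean((x - x_bar) ** 2)    # empirical variance, close to 4

print(x_bar, sigma2)
\end{verbatim}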
|
||||
|
||||
|
||||
|
||||
\section{Random variables properties}
|
||||
|
||||
\subsection{Manipulations}
|
||||
\begin{itemize}
|
||||
\item $\mathbb{E}[\bm{x} + \bm{y}] = \mathbb{E}[\bm{x}] + \mathbb{E}[\bm{y}]$
|
||||
\marginnote{Manipulations of random variables}
|
||||
\item $\mathbb{E}[\bm{x} - \bm{y}] = \mathbb{E}[\bm{x}] - \mathbb{E}[\bm{y}]$
|
||||
\item $\mathbb{V}[\bm{x} + \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] + \text{Cov}[\bm{x}, \bm{y}] + \text{Cov}[\bm{y}, \bm{x}]$
|
||||
\item $\mathbb{V}[\bm{x} - \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] - \text{Cov}[\bm{x}, \bm{y}] - \text{Cov}[\bm{y}, \bm{x}]$
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\subsection{Statistical independence}
|
||||
\marginnote{Statistical independence}
|
||||
Two random variables $X$ and $Y$ are statistically independent iff:
|
||||
\[ p(\bm{x}, \bm{y}) = p(\bm{x})p(\bm{y}) \]
|
||||
|
||||
\begin{theorem}
|
||||
If $X$ and $Y$ are statistically independent, then:
|
||||
\begin{itemize}
|
||||
\item $p(\bm{x} \vert \bm{y}) = p(\bm{x})$ and $p(\bm{y} \vert \bm{x}) = p(\bm{y})$
|
||||
\item $\mathbb{V}_{XY}[\bm{x} + \bm{y}] = \mathbb{V}_X[\bm{x}] + \mathbb{V}_Y[\bm{y}]$
|
||||
\item $\text{Cov}_{XY}[\bm{x}, \bm{y}] = \nullvec$
|
||||
\end{itemize}
|
||||
\end{theorem}
|
||||
|
||||
|
||||
\subsection{Conditional independence}
|
||||
\marginnote{Conditional independence}
|
||||
Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
|
||||
\[ p(\bm{x}, \bm{y} \vert \bm{z}) = p(\bm{x} \vert \bm{z}) p(\bm{y} \vert \bm{z}) \, \forall \bm{z} \in \mathcal{T}_Z \]
|
||||
|
||||
|
||||
\subsection{Inner product}
|
||||
\marginnote{Inner product of random variables}
|
||||
Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
|
||||
\[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
|
||||
This is a valid inner product since the covariance is symmetric, positive definite and bilinear.
|
||||
|
||||
Moreover, we have that:
|
||||
\begin{itemize}
|
||||
\item $\Vert X \Vert = \sqrt{\langle X, X \rangle} = \sqrt{\text{Cov}[x, x]} = \sqrt{\mathbb{V}[x]} = \sigma[x]$
|
||||
\item
|
||||
$\cos\theta = \frac{\langle X, Y \rangle}{\Vert X \Vert \cdot \Vert Y \Vert} =
|
||||
\frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}}$, where $\theta$ is the angle between $X$ and $Y$.
|
||||
\item $X \perp Y \iff \langle X, Y \rangle = 0 \iff \text{Cov}[x, y] = 0 \iff X \text{ and } Y \text{ uncorrelated}$
|
||||
\end{itemize}
|
||||
|
||||
|
||||
|
||||
\section{Common distributions}
|
||||
|
||||
\subsection{Discrete random variables}
|
||||
\begin{descriptionlist}
|
||||
\item[Uniform distribution] \marginnote{Uniform distribution}
|
||||
Given a discrete random variable $X$ with $\vert \mathcal{T}_X \vert = N$,
|
||||
$X$ has a uniform distribution if:
|
||||
\[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
|
||||
|
||||
\item[Poisson distribution] \marginnote{Poisson distribution}
|
||||
Given a discrete random variable $X$ with mean $\lambda$,
|
||||
$X$ has a Poisson distribution if:
|
||||
\[ p_X(x) = e^{-\lambda} \frac{\lambda^x}{x!}, \forall x \in \mathcal{T}_X \]
|
||||
|
||||
A Poisson distribution has $\mathbb{E}[x] = \lambda$ and $\mathbb{V}[x] = \lambda$.
|
||||
\end{descriptionlist}
|
||||
|
||||
|
||||
\subsection{Continuous random variables}
|
||||
\begin{descriptionlist}
|
||||
\item[Continuous uniform distribution] \marginnote{Continuous uniform distribution}
|
||||
Given a continuous random variable $X$ with $\mathcal{T}_X = [a, b]$,
|
||||
$X$ has a continuous uniform distribution if:
|
||||
\[ p_X(x) = \frac{1}{b-a}, \forall x \in \mathcal{T}_X \]
|
||||
|
||||
\item[Normal distribution] \marginnote{Normal distribution}
|
||||
Given a continuous random variable $X$ and the parameters $\mu$ (mean) and $\sigma^2$ (variance).
|
||||
$X$ has a normal distribution if:
|
||||
\[ p_X(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{\frac{-(x-\mu)^2}{2\sigma^2}} , \forall x \in \mathcal{T}_X\]
|
||||
|
||||
In the multivariate case, it is defined as:
|
||||
\[
|
||||
p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}, \matr{\Sigma}) =
|
||||
(2\pi)^{-\frac{D}{2}} \vert \matr{\Sigma} \vert^{-\frac{1}{2}} e^{(-\frac{1}{2}(\bm{x} - \bm{\mu})^T\matr{\Sigma}^{-1}(\bm{x}-\bm{\mu}))}
|
||||
\in \mathbb{R}
|
||||
\]
|
||||
where $\bm{\mu}$ is the mean vector and $\matr{\Sigma}$ the covariance matrix.
|
||||
|
||||
\begin{description}
|
||||
\item[Standard normal distribution] \marginnote{Standard normal distribution}
|
||||
Normal distribution with $\mu = 0$ and $\sigma = 1$ (univariate) or
|
||||
$\bm{\mu} = \nullvec$ and $\matr{\Sigma} = \matr{I}$ (multivariate).
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.40\textwidth]{img/normal_distribution.png}
|
||||
\caption{Normal distributions and standard normal distribution}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\begin{theorem}[Linearity]
|
||||
\marginnote{Gaussian sum and linear transformations}
|
||||
Given $X$ and $Y$ independent Gaussian random variables with
|
||||
$p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}_x, \matr{\Sigma}_x)$ and
|
||||
$p(\bm{y}) = \mathcal{N}(\bm{y} \vert \bm{\mu}_y, \matr{\Sigma}_y)$.
|
||||
It holds that:
|
||||
\[ p(a\bm{x} + b\bm{y}) = \mathcal{N}(a\bm{\mu}_x + b\bm{\mu}_y, a^2\matr{\Sigma}_x + b^2\matr{\Sigma}_y) \]
|
||||
\end{theorem}
|
||||
\end{descriptionlist}
|
||||
@ -0,0 +1,360 @@
|
||||
\chapter{Vector calculus}
|
||||
|
||||
|
||||
\section{Gradient of real-valued multivariate functions}
|
||||
|
||||
\begin{description}
|
||||
\item[Gradient] \marginnote{Gradient}
|
||||
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
|
||||
the gradient is a row vector containing the partial derivatives of $f$:
|
||||
\[
|
||||
\nabla f(\vec{x}) =
|
||||
\begin{pmatrix}
|
||||
\frac{\partial f(\vec{x})}{\partial x_1} & \frac{\partial f(\vec{x})}{\partial x_2} & \dots & \frac{\partial f(\vec{x})}{\partial x_n}
|
||||
\end{pmatrix}
|
||||
\in \mathbb{R}^{1 \times n}
|
||||
\]
|
||||
|
||||
\item[Hessian] \marginnote{Hessian matrix}
|
||||
Given a function $f: \mathbb{R}^n \rightarrow \mathbb{R}$,
|
||||
the Hessian matrix $\matr{H} \in \mathbb{R}^{n \times n}$ contains the second derivatives of $f$:
|
||||
\[
|
||||
\matr{H} =
|
||||
\begin{pmatrix}
|
||||
\frac{\partial^2 f}{\partial x_1^2} & \frac{\partial^2 f}{\partial x_1 \partial x_2} & \dots & \frac{\partial^2 f}{\partial x_1 \partial x_n} \\
|
||||
\frac{\partial^2 f}{\partial x_2 \partial x_1} & \frac{\partial^2 f}{\partial x_2^2} & \dots & \vdots \\
|
||||
\vdots & \vdots & \ddots & \vdots \\
|
||||
\frac{\partial^2 f}{\partial x_n \partial x_1} & \dots & \dots & \frac{\partial^2 f}{\partial x_n^2}
|
||||
\end{pmatrix}
|
||||
\]
|
||||
In other words, $H_{i,j} = \frac{\partial^2 f}{\partial x_i \partial x_j}$.
|
||||
Moreover, $\matr{H}$ is symmetric (when $f$ is twice continuously differentiable).
|
||||
\end{description}

\subsection{Partial differentiation rules}
\begin{description}
\item[Product rule] \marginnote{Product rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x})g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} g(\vec{x}) + f(\vec{x}) \frac{\partial g}{\partial \vec{x}}
\]
\item[Sum rule] \marginnote{Sum rule}
Let $f, g: \mathbb{R}^n \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f(\vec{x}) + g(\vec{x})) =
\frac{\partial f}{\partial \vec{x}} + \frac{\partial g}{\partial \vec{x}}
\]
\item[Chain rule] \marginnote{Chain rule}
Let $f: \mathbb{R}^n \rightarrow \mathbb{R}$ and let $\vec{g}$ be a vector of $n$ functions $g_i: \mathbb{R}^m \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f \circ \vec{g})(\vec{x}) =
\frac{\partial}{\partial \vec{x}} \Big( f(\vec{g}(\vec{x})) \Big) =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial \vec{x}}
\]

For instance, consider a function $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ whose two arguments
$g_1(t), g_2(t): \mathbb{R} \rightarrow \mathbb{R}$ are themselves functions of $t$.
The derivative of $f$ with respect to $t$ is then:
\[
\frac{\text{d}f}{\text{d}t} =
% \frac{\partial f}{\partial (g_1, g_2)} \frac{\partial (g_1, g_2)}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
= \frac{\partial f}{\partial g_1} \frac{\partial g_1}{\partial t} + \frac{\partial f}{\partial g_2} \frac{\partial g_2}{\partial t}
\]
In other words, the first matrix is the gradient of $f$ with respect to its variables, and
the $i$-th row of the second matrix is the gradient of $g_i$.

Therefore, if $g_i$ are in turn multivariate functions $g_1(s, t), g_2(s, t): \mathbb{R}^2 \rightarrow \mathbb{R}$,
the chain rule can be applied as follows:
\[
\frac{\text{d}f}{\text{d}(s, t)} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial s} & \frac{\partial g_1}{\partial t} \\
\frac{\partial g_2}{\partial s} & \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]

\begin{example}
Let $f(x_1, x_2) = x_1^2 + 2x_2$, where $x_1 = \sin(t)$ and $x_2 = \cos(t)$.
\[
\begin{split}
\frac{\text{d}f}{\text{d}t} & =
\frac{\partial f}{\partial x_1}\frac{\partial x_1}{\partial t} + \frac{\partial f}{\partial x_2}\frac{\partial x_2}{\partial t} \\
& = (2x_1)(\cos(t)) + (2)(-\sin(t)) \\
& = 2\sin(t)\cos(t) - 2\sin(t)
\end{split}
\]
\end{example}

\begin{example}
Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ \vec{g})(t) = f(\vec{g}(t))$ where:
\[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(g_1, g_2) = \exp(g_1 g_2^2) \]
\[
\vec{g}: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as }
\vec{g}(t) = \begin{pmatrix} g_1 \\ g_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
\]
The gradient of $h$ with respect to $t$ can be computed as:
\[
\frac{\text{d} h}{\text{d} t} =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial t} =
\begin{pmatrix}
\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\[
=
\begin{pmatrix} \exp(g_1 g_2^2)g_2^2 & 2\exp(g_1 g_2^2)g_1 g_2 \end{pmatrix}
\begin{pmatrix} \cos(t) - t\sin(t) \\ \sin(t) + t\cos(t) \end{pmatrix}
\]
\end{example}

\begin{example}[Gradient of a least squares loss] \marginnote{Least squares loss gradient}
Given a linear model with parameters $\vec{\uptheta}$:
\[ \vec{y} = \matr{\Phi}\vec{\uptheta} \]
where $\vec{\uptheta} \in \mathbb{R}^D$, $\matr{\Phi} \in \mathbb{R}^{N \times D}$ and $\vec{y} \in \mathbb{R}^N$,
we can define the least squares loss function as:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 \]
\[ \vec{e}(\vec{\uptheta}) = \vec{y} - \matr{\Phi}\vec{\uptheta} \]
It must be noted that:
\[ L(\vec{e}) = \Vert \vec{e} \Vert_2^2 = \vec{e}^T\vec{e} = \sum_{i=1}^{N} e_i^2 \]

To compute the gradient of $L$ with respect to $\vec{\uptheta}$, we can use the chain rule:
\[
\begin{split}
\nabla L(\vec{\uptheta}) &= \frac{\partial L}{\partial \vec{e}} \frac{\partial \vec{e}}{\partial \vec{\uptheta}}
= (2\vec{e}^T) (-\matr{\Phi}) \\
& = -2(\vec{y}^T - \vec{\uptheta}^T \matr{\Phi}^T)\matr{\Phi} \\
& = -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi})
\end{split}
\]

Note that if we enforce $\nabla L(\vec{\uptheta}) = \nullvec$, we obtain the normal equation of \Cref{sec:lls}
(the last step transposes both sides):
\[
\begin{split}
\nabla L = \nullvec &\iff -2(\vec{y}^T\matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi}) = \nullvec \\
&\iff \vec{y}^T \matr{\Phi} - \vec{\uptheta}^T \matr{\Phi}^T\matr{\Phi} = \nullvec \\
&\iff \matr{\Phi}^T \vec{y} - \matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \nullvec
\end{split}
\]
A numerical sanity check of this gradient is sketched right after this list.
\end{example}
\end{description}
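The gradient just derived can be checked numerically. The following Python sketch (sizes, random data and tolerances are arbitrary illustrative choices) compares the closed-form gradient, written in column form as $-2(\matr{\Phi}^T\vec{y} - \matr{\Phi}^T\matr{\Phi}\vec{\uptheta})$, i.e. the transpose of the row-vector expression above, with a central finite-difference approximation:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 8, 3                        # arbitrary sizes for the sketch
Phi = rng.normal(size=(N, D))      # design matrix Phi
y = rng.normal(size=N)             # targets y
theta = rng.normal(size=D)         # current parameters theta

def loss(theta):
    e = y - Phi @ theta            # e(theta) = y - Phi theta
    return e @ e                   # L = ||e||_2^2

# Closed-form gradient (column form): -2 (Phi^T y - Phi^T Phi theta)
grad = -2 * (Phi.T @ y - Phi.T @ Phi @ theta)

# Central finite differences, one component at a time
eps = 1e-6
I = np.eye(D)
num = np.array([(loss(theta + eps * I[i]) - loss(theta - eps * I[i])) / (2 * eps)
                for i in range(D)])

print(np.allclose(grad, num, atol=1e-5))   # expected: True
\end{verbatim}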


\section{Gradient of vector-valued multivariate functions}

\begin{description}
\item[Vector-valued function]
Function $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$ with $n \geq 1$ and $m > 1$.
Given $\vec{x} \in \mathbb{R}^n$, the output can be represented as:
\[
\vec{f}(\vec{x}) =
\begin{pmatrix}
f_1(\vec{x}) \\ \vdots \\ f_m(\vec{x})
\end{pmatrix} \in \mathbb{R}^m
\]
where $f_i: \mathbb{R}^n \rightarrow \mathbb{R}$.

\item[Jacobian] \marginnote{Jacobian matrix}
Given $\vec{f}: \mathbb{R}^n \rightarrow \mathbb{R}^m$, the Jacobian matrix $\matr{J} \in \mathbb{R}^{m \times n}$
contains the first-order partial derivatives of $\vec{f}$:
\[
\matr{J} = \nabla\vec{f}(\vec{x}) =
\begin{pmatrix}
\frac{\partial \vec{f}(\vec{x})}{\partial x_1} & \dots & \frac{\partial \vec{f}(\vec{x})}{\partial x_n}
\end{pmatrix} =
\begin{pmatrix}
\frac{\partial f_1(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_1(\vec{x})}{\partial x_n} \\
\vdots & \ddots & \vdots \\
\frac{\partial f_m(\vec{x})}{\partial x_1} & \dots & \frac{\partial f_m(\vec{x})}{\partial x_n}
\end{pmatrix}
\]
In other words, $J_{i,j} = \frac{\partial f_i}{\partial x_j}$.
Note that the Jacobian matrix generalizes the gradient: for $m = 1$, it reduces to the gradient of a real-valued function
(see the example after this list).
\end{description}
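A small worked example (again, an arbitrary illustrative choice of $\vec{f}$):
\begin{example}
Let $\vec{f}: \mathbb{R}^2 \rightarrow \mathbb{R}^2$ be defined as:
\[
\vec{f}(\vec{x}) = \begin{pmatrix} x_1^2 x_2 \\ x_1 \sin(x_2) \end{pmatrix}
\]
Then:
\[
\matr{J} =
\begin{pmatrix}
2 x_1 x_2 & x_1^2 \\
\sin(x_2) & x_1 \cos(x_2)
\end{pmatrix}
\in \mathbb{R}^{2 \times 2}
\]
\end{example}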


\section{Backpropagation}
\marginnote{Backpropagation}
Backpropagation is used to compute the gradients needed to tune the parameters of a neural network.
A neural network can be seen as a composition of many functions:
\[ \vec{y} = (\vec{f}_K \circ \vec{f}_{K-1} \circ \dots \circ \vec{f}_1)(\vec{x}) = \vec{f}_K(\vec{f}_{K-1}(\cdots \vec{f}_1(\vec{x}) \cdots)) \]
Each $\vec{f}_i$ takes as input the output of the previous layer $\vec{x}_{i-1}$ and has the form:
\[ \vec{f}_i(\vec{x}_{i-1}) = \sigma_i(\matr{A}_{i-1}\vec{x}_{i-1} + \vec{b}_{i-1}) \]
where $\sigma_i$ is an activation function\footnote{\url{https://en.wikipedia.org/wiki/Activation_function}} (a function that introduces nonlinearity),
while $\matr{A}_{i-1}$ (linear mapping) and $\vec{b}_{i-1}$ (biases) are the parameters of $\vec{f}_i$.

\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_forward_pass.pdf}
\caption{Forward pass}
\end{figure}

We can more compactly denote a neural network with input $\vec{x}$ and $K$ layers as:
\[
\begin{split}
\vec{f}_0 &= \vec{x} \\
\vec{f}_i &= \sigma_i(\matr{A}_{i-1} \vec{f}_{i-1} + \vec{b}_{i-1}) \quad i=1, \dots, K
\end{split}
\]
Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimize the squared loss:
\[ L(\vec{\uptheta}) = \Vert \vec{y} - \vec{f}_K(\vec{\uptheta}, \vec{x}) \Vert^2 \]
where $\vec{\uptheta} = \{ \matr{A}_{0}, \vec{b}_{0}, \dots, \matr{A}_{K-1}, \vec{b}_{K-1} \}$ are the parameters of each layer.
This can be done by using the chain rule to compute the partial derivatives of $L$ with respect to the parameters $\vec{\uptheta}_j = \{ \matr{A}_j, \vec{b}_j \}$:
\[
\begin{split}
\frac{\partial L}{\partial \vec{\uptheta}_{K-1}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{\uptheta}_{K-1}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-2}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \frac{\partial \vec{f}_{K-1}}{\partial \vec{\uptheta}_{K-2}}}^{\mathclap{\text{New}}} \\
\frac{\partial L}{\partial \vec{\uptheta}_{K-3}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}}}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{K-1}}{\partial \vec{f}_{K-2}} \frac{\partial \vec{f}_{K-2}}{\partial \vec{\uptheta}_{K-3}}}^{\mathclap{\text{New}}} \\
\vdots \\
\frac{\partial L}{\partial \vec{\uptheta}_{i}} &=
\overbrace{\frac{\partial L}{\partial \vec{f}_K} \frac{\partial \vec{f}_K}{\partial \vec{f}_{K-1}} \dots}^{\mathclap{\text{Known}}}
\overbrace{\frac{\partial \vec{f}_{i+2}}{\partial \vec{f}_{i+1}} \frac{\partial \vec{f}_{i+1}}{\partial \vec{\uptheta}_{i}}}^{\mathclap{\text{New}}}
\end{split}
\]

\begin{figure}[ht]
\centering
\includegraphics[width=0.7\textwidth]{img/_backward_pass.pdf}
\caption{Backward pass}
\end{figure}
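To make the backward recursion concrete, here is a minimal Python sketch of the forward and backward pass for a two-layer network with the squared loss. The layer sizes, the random data and the choice $\sigma_1 = \tanh$, $\sigma_2 = \mathrm{id}$ are arbitrary assumptions made only for illustration; the last lines check one gradient entry against a finite difference.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Arbitrary sizes: input 4, hidden 5, output 3 (K = 2 layers)
x = rng.normal(size=4)
y = rng.normal(size=3)                      # ground truth
A0, b0 = rng.normal(size=(5, 4)), rng.normal(size=5)
A1, b1 = rng.normal(size=(3, 5)), rng.normal(size=3)

# Forward pass: f_i = sigma_i(A_{i-1} f_{i-1} + b_{i-1})
f0 = x
z1 = A0 @ f0 + b0
f1 = np.tanh(z1)                            # sigma_1 = tanh
f2 = A1 @ f1 + b1                           # sigma_2 = identity
L = np.sum((y - f2) ** 2)                   # squared loss

# Backward pass: dL/df_K is computed once and then reused
dL_df2 = -2 * (y - f2)                      # dL/df_K
dL_dA1 = np.outer(dL_df2, f1)               # dL/dA_{K-1}
dL_db1 = dL_df2                             # dL/db_{K-1}

dL_df1 = A1.T @ dL_df2                      # dL/df_{K-1} (reuses dL/df_K)
dL_dz1 = dL_df1 * (1 - np.tanh(z1) ** 2)    # tanh'(z) = 1 - tanh(z)^2
dL_dA0 = np.outer(dL_dz1, f0)               # dL/dA_{K-2}
dL_db0 = dL_dz1                             # dL/db_{K-2}

# Finite-difference check on one entry of A0
eps = 1e-6
A0p = A0.copy()
A0p[0, 0] += eps
Lp = np.sum((y - (A1 @ np.tanh(A0p @ x + b0) + b1)) ** 2)
print(np.isclose(dL_dA0[0, 0], (Lp - L) / eps, atol=1e-4))   # expected: True
\end{verbatim}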


\section{Automatic differentiation}
It is recommended to read the example below first.

\marginnote{Automatic differentiation}
Automatic differentiation allows one to numerically compute
the gradient of complex functions by combining elementary functions, intermediate variables and the chain rule through a computation graph.
When the gradient has many components, it also allows computing it more efficiently.

Let $f$ be a function,
$x_1, \dots, x_d$ the input variables of $f$,
$x_{d+1}, \dots, x_{D-1}$ the intermediate variables and
$x_D$ the output variable.
The computation graph can be expressed as:
\[
\forall i \in \{ d+1, \dots, D \}: x_i = g_i(x_{\text{Pa}(x_i)})
\]
where $g_i$ are elementary functions and $x_{\text{Pa}(x_i)}$ are the parent nodes of $x_i$ in the graph.
In other words, each intermediate variable is expressed as an elementary function of its preceding nodes.
The derivatives of $f$ can then be computed step-by-step going backward as:
\[ \frac{\partial f}{\partial x_D} = 1 \text{, as by definition } f = x_D \]
\[
\frac{\partial f}{\partial x_i} = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial x_c}{\partial x_i}
= \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial g_c}{\partial x_i}
\]
where $\text{Pa}(x_c)$ is the set of parent nodes of $x_c$ in the graph.
In other words, to compute the partial derivative of $f$ w.r.t. $x_i$,
we apply the chain rule using the partial derivatives of $f$ w.r.t. the variables that follow $x_i$ in the graph (as the computation goes backward).

Automatic differentiation is applicable to any function that can be expressed as a computation graph
whose elementary functions are all differentiable.
Note that backpropagation is a special case of automatic differentiation.

\begin{example}
Given the function:
\[ f(x) = \sqrt{x^2 + \exp(x^2)} + \cos(x^2 + \exp(x^2)) \]
and the elementary functions $\{ (\cdot)^2, \exp(\cdot), +, \sqrt{\cdot}, \cos(\cdot) \}$,
$f$ can be decomposed into the following intermediate variables:\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
a &= x^2 \\
b &= \exp(a) \\
c &= a + b \\
d &= \sqrt{c}
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
e &= \cos(c) \\
f &= d + e
\end{split}
\]
\end{minipage}\\

This corresponds to the following computation graph:
\begin{center}
\includegraphics[width=0.75\textwidth]{img/auto_diff.png}
\end{center}

We can then compute the derivatives of the intermediate variables w.r.t. their inputs (i.e. inbound edges):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial a}{\partial x} &= 2x \\
\frac{\partial b}{\partial a} &= \exp(a) \\
\frac{\partial c}{\partial a} &= 1 \\
\frac{\partial c}{\partial b} &= 1
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial d}{\partial c} &= \frac{1}{2\sqrt{c}} \\
\frac{\partial e}{\partial c} &= -\sin(c) \\
\frac{\partial f}{\partial d} &= 1 \\
\frac{\partial f}{\partial e} &= 1
\end{split}
\]
\end{minipage}\\

Finally, we can compute $\frac{\partial f}{\partial x}$ by going backward from the output ($f$) to the input ($x$):\\
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial d} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial e} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial c} &=
\frac{\partial f}{\partial d}\frac{\partial d}{\partial c} + \frac{\partial f}{\partial e}\frac{\partial e}{\partial c}
\end{split}
\]
\end{minipage}%
\begin{minipage}{.5\linewidth}
\[
\begin{split}
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c}\frac{\partial c}{\partial b} \\
\frac{\partial f}{\partial a} &=
\frac{\partial f}{\partial b}\frac{\partial b}{\partial a} + \frac{\partial f}{\partial c}\frac{\partial c}{\partial a} \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a}\frac{\partial a}{\partial x}
\end{split}
\]
\end{minipage}\\

In other words, to compute the partial derivative of $f$ w.r.t. a variable $x_i$,
all the variables that follow $x_i$ in the graph are considered.

Now, by substituting we obtain:
\[
\begin{split}
\frac{\partial f}{\partial c} &= 1 \cdot \frac{1}{2\sqrt{c}} + 1 \cdot (-\sin(c)) \\
\frac{\partial f}{\partial b} &= \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial a} &= \frac{\partial f}{\partial b} \cdot \exp(a) + \frac{\partial f}{\partial c} \cdot 1 \\
\frac{\partial f}{\partial x} &= \frac{\partial f}{\partial a} \cdot 2x
\end{split}
\]
\end{example}
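The backward sweep of this example translates almost line by line into code. The following Python sketch (a hand-written illustration for this specific function, not a general automatic differentiation tool; the evaluation point is arbitrary) accumulates the partial derivatives in the same order as above and checks the result against a finite difference:
\begin{verbatim}
import math

def f_and_grad(x):
    # Forward sweep: intermediate variables of the computation graph
    a = x ** 2
    b = math.exp(a)
    c = a + b
    d = math.sqrt(c)
    e = math.cos(c)
    f = d + e

    # Backward sweep: df/d(node), children first
    df_dd = 1.0
    df_de = 1.0
    df_dc = df_dd * (1 / (2 * math.sqrt(c))) + df_de * (-math.sin(c))
    df_db = df_dc * 1.0
    df_da = df_db * math.exp(a) + df_dc * 1.0
    df_dx = df_da * 2 * x
    return f, df_dx

x = 1.3                              # arbitrary evaluation point
val, grad = f_and_grad(x)

# Check against a central finite difference
h = 1e-6
num = (f_and_grad(x + h)[0] - f_and_grad(x - h)[0]) / (2 * h)
print(abs(grad - num) < 1e-5)        # expected: True
\end{verbatim}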
\documentclass[11pt]{ainotes}

\title{Statistical and Mathematical Methods for Artificial Intelligence}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}

\begin{document}

\makenotesfront

\input{sections/_finite_numbers.tex}
\input{sections/_linear_algebra.tex}
\input{sections/_linear_systems.tex}
\input{sections/_matrix_decomp.tex}
\input{sections/_vector_calculus.tex}
\input{sections/_gradient_methods.tex}
\input{sections/_probability.tex}
\input{sections/_machine_learning.tex}
\eoc

\end{document}