Fix typos <noupdate>
@@ -6,7 +6,7 @@
\begin{description}
\item[Measure error] \marginnote{Measure error}
-Precision of the measurement instrument.
+Precision of the measuring instrument.

\item[Arithmetic error] \marginnote{Arithmetic error}
Propagation of rounding errors in each step of an algorithm.

@@ -37,7 +37,7 @@ Let $x$ be a value and $\hat{x}$ its approximation. Then:
Note that, out of context, the absolute error is meaningless.
\item[Relative error]
\[
-E_{a} = \frac{\hat{x} - x}{x}
+E_{r} = \frac{\hat{x} - x}{x}
\marginnote{Relative error}
\]
\end{descriptionlist}
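A minimal numerical sketch of the two error measures above (plain Python; $\pi$ as the exact value and a hand-made approximation, not part of the original notes):

\begin{lstlisting}[language=Python]
import math

x = math.pi        # exact value
x_hat = 3.14159    # approximation

E_a = x_hat - x        # absolute error: meaningless out of context
E_r = (x_hat - x) / x  # relative error: independent of the scale of x

print(E_a, E_r)  # roughly -2.7e-06 and -8.5e-07
\end{lstlisting}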
@@ -148,7 +148,7 @@ In alternative, $\varepsilon_{\text{mach}}$ can be defined as the smallest repre
\subsection{IEEE standard}
IEEE 754 defines two floating-point formats:
\begin{descriptionlist}
-\item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{float32}
+\item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{\texttt{float32}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}

@@ -158,7 +158,7 @@ IEEE 754 defines two floating-point formats:
\end{tabular}
\end{center}

-\item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{float64}
+\item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{\texttt{float64}}
\begin{center}
\small
\begin{tabular}{|c|c|c|}
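A quick way to inspect the two formats in practice (a sketch assuming NumPy; \texttt{np.finfo} reports the precision and range of each type):

\begin{lstlisting}[language=Python]
import numpy as np

for dtype in (np.float32, np.float64):
    info = np.finfo(dtype)
    # eps is the machine epsilon: the gap between 1.0 and the next representable number
    print(dtype.__name__, info.bits, info.eps, info.max)
# float32: eps ~ 1.19e-07, float64: eps ~ 2.22e-16
\end{lstlisting}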
@@ -11,11 +11,13 @@ Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in

\item[Local minimum] \marginnote{Local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff:
-\[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
+\[ \exists \varepsilon > 0 \text{ s.t. }
+f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]

\item[Strict local minimum] \marginnote{Strict local minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff:
-\[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
+\[ \exists \varepsilon > 0 \text{ s.t. }
+f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]

\item[Global minimum] \marginnote{Global minimum}
$\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff:

@@ -26,7 +28,7 @@ Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in
\[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
\end{descriptionlist}

-Note that $\max f(x) = \min -f(x)$.
+Note that $\max \{ f(x) \} = \min \{ -f(x) \}$.


\subsection{Optimality conditions}
@@ -52,7 +54,7 @@ As the second order condition requires to compute the Hessian matrix, which is e

\marginnote{Descent methods}
Descent methods are iterative methods that have the property:
-\[ f(\vec{x}_k) < f(\vec{x}_{k+1}) \]
+\[ f(\vec{x}_k) < f(\vec{x}_{k-1}) \]

The iteration is defined as:
\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
@@ -107,7 +109,7 @@ Note: descent methods usually converge to a local minimum.
but it can be proved that this does not guarantee convergence.

\item[Backtracking procedure] \marginnote{Backtracking procedure}
-$\alpha_k$ is chose such that it respects the Wolfe condition\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
+$\alpha_k$ is chosen such that it respects the Wolfe condition\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
\begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
def backtracking($\tau$, $c_1$):
    $\alpha_k$ = 1 # Initial guess
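The notes' pseudocode is truncated by the diff above; a runnable sketch of the same idea, using only the sufficient-decrease (Armijo) part of the Wolfe conditions (function and parameter names here are illustrative, not from the notes):

\begin{lstlisting}[language=Python]
import numpy as np

def backtracking(f, grad_f, x, p, tau=0.5, c1=1e-4):
    """Shrink the step size until the sufficient-decrease condition holds."""
    alpha = 1.0  # initial guess
    # Armijo condition: f(x + alpha p) <= f(x) + c1 * alpha * grad_f(x)^T p
    while f(x + alpha * p) > f(x) + c1 * alpha * grad_f(x) @ p:
        alpha = tau * alpha  # reduce the step
    return alpha

# Example on f(x) = ||x||^2 with the steepest-descent direction p = -grad f(x)
f = lambda x: x @ x
grad_f = lambda x: 2 * x
x0 = np.array([3.0, -2.0])
alpha = backtracking(f, grad_f, x0, -grad_f(x0))
\end{lstlisting}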
@@ -121,7 +123,7 @@ Note: descent methods usually converge to a local minimum.

\subsection{Stopping condition}
\marginnote{Stopping condition}
-We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, $\nabla f(\vec{x}_k) \approx \nullvec$.
+We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, when $\nabla f(\vec{x}_k) \approx \nullvec$.
We can verify this by checking the norm of the gradient against a tolerance $\tau$:
\begin{descriptionlist}
\item[Absolute condition] $\Vert \nabla f(x_k) \Vert_2 < \tau$
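Putting the pieces of this section together, a minimal gradient-descent loop with the absolute stopping condition might look as follows (a sketch that reuses the illustrative \texttt{backtracking} helper from the previous snippet):

\begin{lstlisting}[language=Python]
import numpy as np

def gradient_descent(f, grad_f, x0, tau=1e-6, max_iter=1000):
    x = x0
    for _ in range(max_iter):
        g = grad_f(x)
        if np.linalg.norm(g) < tau:   # absolute stopping condition
            break
        p = -g                        # steepest-descent direction
        alpha = backtracking(f, grad_f, x, p)  # step size (see previous sketch)
        x = x + alpha * p
    return x

x_star = gradient_descent(lambda x: x @ x, lambda x: 2 * x, np.array([3.0, -2.0]))
\end{lstlisting}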
@@ -152,7 +154,7 @@ A generic gradient-like method can then be defined as:
it may cause numerical instabilities or bad results.
Heuristics can be used to select an adequate starting point.

-\item[Flag regions and local optima] \marginnote{Flag regions and local optima}
+\item[Flat regions and local optima] \marginnote{Flat regions and local optima}
Flat regions slow down the learning speed,
while a local optimum causes the method to converge to a poor solution.
\begin{figure}[ht]

@@ -164,7 +166,7 @@ A generic gradient-like method can then be defined as:
\item[Differential curvature]
Different magnitudes of the partial derivatives may cause the problem of
vanishing and exploding gradient. \marginnote{Vanishing gradient\\Exploding gradient}
-This causes the learning process to require more iterations to correct the direction.
+This causes the learning process to require more iterations to adjust the direction.

In practice, as the gradient of complex functions is only an instantaneous direction of best decrease and
does not represent the direction to the minimum in the long term,
@@ -254,11 +256,11 @@ A generic gradient-like method can then be defined as:


\subsection{Properties}
-\marginnote{Convex properties}
+% \marginnote{Convex properties}
\begin{itemize}
-\item $\text{if } f \text{ convex} \Rightarrow \text{ any local minimum of } f \text{ is also global}$
-\item $\text{if } f \text{ strictly convex} \Rightarrow \text{ the global minimum of } f \text{ is unique}$
-\item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{ any stationary point of } f \text{ is a global minimum}$
+\item $\text{if } f \text{ convex} \Rightarrow \text{any local minimum of } f \text{ is also global}$
+\item $\text{if } f \text{ strictly convex} \Rightarrow \text{the global minimum of } f \text{ is unique}$
+\item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{any stationary point of } f \text{ is a global minimum}$
\end{itemize}
@@ -324,9 +324,9 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
\end{theorem}


-\begin{theorem}[Spectral theorem] \marginnote{Spectral theorem}
+\begin{theorem}[Spectral theorem] \label{th:spectral_theorem} \marginnote{Spectral theorem}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
-Its eigenvectors form a orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
+Its eigenvectors form an orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
\end{theorem}
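A small numerical illustration of the theorem (a sketch assuming NumPy, not part of the notes; \texttt{np.linalg.eigh} is the eigensolver for symmetric matrices):

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((4, 4))
A = (B + B.T) / 2                      # make A symmetric

eigvals, P = np.linalg.eigh(A)         # real eigenvalues, orthonormal eigenvectors
print(np.allclose(P.T @ P, np.eye(4)))             # columns form an orthonormal basis
print(np.allclose(P @ np.diag(eigvals) @ P.T, A))  # A = P D P^T
\end{lstlisting}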
@@ -45,16 +45,22 @@ where:
\section{Square linear systems}
\marginnote{Square linear system}
A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
-has an unique solution iff one of the following conditions is satisfied:
+has a unique solution iff one of the following conditions is satisfied:
\begin{enumerate}
\item $\matr{A}$ is non-singular (invertible)
\item $\text{rank}(\matr{A}) = n$ (full rank)
-\item $\matr{A}\vec{x}$ admits only the solution $\vec{x} = \nullvec$
+\item $\matr{A}\vec{x} = \nullvec$ only admits the solution $\vec{x} = \nullvec$
\end{enumerate}

The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
\[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.
Therefore, numerical methods are usually better suited.
The two main families of methods are:
\begin{itemize}
\item Direct methods.
\item Iterative methods.
\end{itemize}
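As a concrete illustration (a sketch assuming NumPy; the data is random), a dedicated solver is both cheaper and numerically safer than forming $\matr{A}^{-1}$ explicitly:

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))  # almost surely non-singular
b = rng.standard_normal(5)

x_inv = np.linalg.inv(A) @ b     # algebraic solution: builds A^{-1} explicitly
x_solve = np.linalg.solve(A, b)  # direct method (LU-based), preferred in practice

print(np.allclose(x_inv, x_solve), np.allclose(A @ x_solve, b))
\end{lstlisting}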
@@ -70,15 +76,19 @@ The most common approach consists in factorizing the matrix $\matr{A}$.
Given a square linear system $\matr{A}\vec{x} = \vec{b}$,
the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
\begin{itemize}
-\item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix
-\item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix
+\item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix.
+\item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix.
\end{itemize}
%
-The system can be decomposed to:
+The system can be decomposed into:
\[
\begin{split}
\matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
-& \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \vec{b}
+& \iff
+\begin{cases}
+\matr{L}\vec{y} = \vec{b} \\
+\vec{y} = \matr{U}\vec{x}
+\end{cases}
\end{split}
\]
To find the solution, it is sufficient to solve in order:
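A small sketch of the two triangular solves (assuming SciPy; here $\matr{L}$ and $\matr{U}$ are built directly so that no pivoting is needed):

\begin{lstlisting}[language=Python]
import numpy as np
from scipy.linalg import solve_triangular

rng = np.random.default_rng(0)
n = 4
L = np.tril(rng.standard_normal((n, n))) + n * np.eye(n)  # lower triangular, well conditioned
U = np.triu(rng.standard_normal((n, n))) + n * np.eye(n)  # upper triangular
A = L @ U
b = rng.standard_normal(n)

y = solve_triangular(L, b, lower=True)    # forward substitution:  L y = b
x = solve_triangular(U, y, lower=False)   # backward substitution: U x = y
print(np.allclose(A @ x, b))
\end{lstlisting}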
@@ -89,7 +99,7 @@ To find the solution, it is sufficient to solve in order:

The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
$O(\frac{n^3}{3})$ is the time complexity of the LU factorization.
-$O(n^2)$ is the complexity to directly solving a system with a triangular matrix (forward or backward substitutions).
+$O(n^2)$ is the complexity to directly solve a system with a triangular matrix (forward or backward substitutions).


\subsection{Gaussian factorization with pivoting}
@@ -97,15 +107,19 @@ $O(n^2)$ is the complexity to directly solving a system with a triangular matrix
During the computation of $\matr{A} = \matr{L}\matr{U}$
(using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
a division by 0 may occur.
-A method to prevent this problem (and to lower the algorithmic error) is to change the order of the rows of $\matr{A}$ before decomposing it.
+A method to prevent this problem (and to lower the algorithmic error, e.g. due to overflows) is to change the order of the rows of $\matr{A}$ before decomposing it.
This is achieved by using a permutation matrix $\matr{P}$, which is obtained as a permutation of the identity matrix.

The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
-The system can be decomposed to:
+The system can be decomposed into:
\[
\begin{split}
\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
-& \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \matr{P}\vec{b}
+& \iff
+\begin{cases}
+\matr{L}\vec{y} = \matr{P}\vec{b} \\
+\vec{y} = \matr{U}\vec{x}
+\end{cases}
\end{split}
\]
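In practice this is what library routines do under the hood; a sketch with SciPy (assuming \texttt{scipy.linalg}; \texttt{lu\_factor} computes the pivoted factorization $\matr{P}\matr{A} = \matr{L}\matr{U}$ and \texttt{lu\_solve} performs the two triangular solves):

\begin{lstlisting}[language=Python]
import numpy as np
from scipy.linalg import lu_factor, lu_solve

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))
b = rng.standard_normal(5)

lu, piv = lu_factor(A)      # O(n^3/3): factorize once (with partial pivoting)
x = lu_solve((lu, piv), b)  # O(n^2): forward + backward substitution
print(np.allclose(A @ x, b))
\end{lstlisting}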
@@ -117,7 +131,7 @@ The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can


\subsection{Cholesky factorization}
-Given a symmetric definite positive matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
+Given a symmetric positive definite matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
It is possible to decompose $\matr{A}$ as:
\[ \matr{A} = \matr{L}\matr{L}^T \]
where $\matr{L}$ is lower triangular.
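A minimal check of this factorization (a sketch assuming NumPy; the matrix is made positive definite by construction as $\matr{B}\matr{B}^T + \matr{I}$):

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
B = rng.standard_normal((4, 4))
A = B @ B.T + np.eye(4)       # symmetric positive definite by construction

L = np.linalg.cholesky(A)     # lower triangular factor
print(np.allclose(L @ L.T, A))
\end{lstlisting}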
@@ -183,7 +197,7 @@ This problem is independent from the algorithm and is estimated using exact arit
Given a system $\matr{A}\vec{x} = \vec{b}$, we perturb $\matr{A}$ and/or $\vec{b}$ and study the inherent error.
For instance, if we perturb $\vec{b}$, we obtain the following system:
\[ \matr{A}\tilde{\vec{x}} = (\vec{b} + \Delta\vec{b}) \]
-After finding $\tilde{\vec{x}}$, we can compute the inherited error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
+After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.

By comparing $\left\Vert \frac{\Delta\vec{x}}{\vec{x}} \right\Vert$ and $\left\Vert \frac{\Delta\vec{b}}{\vec{b}} \right\Vert$,
we can compute the error introduced by the perturbation.
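A small perturbation experiment of this kind (a sketch assuming NumPy; the Hilbert matrix is just a convenient ill-conditioned example, not from the notes):

\begin{lstlisting}[language=Python]
import numpy as np

n = 8
A = 1.0 / (np.arange(n)[:, None] + np.arange(n)[None, :] + 1)  # Hilbert matrix: ill-conditioned
x_true = np.ones(n)
b = A @ x_true

db = 1e-10 * np.random.default_rng(0).standard_normal(n)  # small perturbation of b
x_pert = np.linalg.solve(A, b + db)

rel_err_x = np.linalg.norm(x_pert - x_true) / np.linalg.norm(x_true)
rel_err_b = np.linalg.norm(db) / np.linalg.norm(b)
print(rel_err_x / rel_err_b, np.linalg.cond(A))  # error amplification vs condition number
\end{lstlisting}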
@@ -201,4 +215,28 @@ Otherwise it is \textbf{well-conditioned}. \marginnote{Well-conditioned}


\section{Linear least squares problem}
-See \Cref{sec:lls}.
+A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
+does not generally have a solution.
+\marginnote{Linear least squares}
+Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
+\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
+In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
+This problem is usually formulated as:
+\[
+\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
+\]
+It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
+\begin{descriptionlist}
+\item[$\text{rank}(\matr{A}) = n$]
+The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
+\marginnote{Normal equation}
+It is found by solving the normal equation:
+\[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
+$\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization.
+
+\item[$\text{rank}(\matr{A}) < n$]
+The system admits infinitely many solutions.
+Of all the solutions $S$, we are interested in the one with minimum norm:
+\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
+\end{descriptionlist}
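A sketch of the full-rank case (assuming NumPy/SciPy; the data is random and only meant to show that the normal-equation route and the library least-squares solver agree):

\begin{lstlisting}[language=Python]
import numpy as np
from scipy.linalg import cho_factor, cho_solve

rng = np.random.default_rng(0)
A = rng.standard_normal((20, 4))   # m > n, full rank with probability 1
b = rng.standard_normal(20)

# Normal equation A^T A x = A^T b solved via Cholesky
c = cho_factor(A.T @ A)
x_normal = cho_solve(c, A.T @ b)

x_lstsq = np.linalg.lstsq(A, b, rcond=None)[0]  # library solver (SVD-based)
print(np.allclose(x_normal, x_lstsq))
\end{lstlisting}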
@@ -98,8 +98,8 @@ The parameters are determined as the most likely to predict the correct label gi
\begin{description}
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
\sloppy
-Given a random variable $\bm{x}$, a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$
-and a predictor, the negative log-likelihood of $\bm{x}$ is:
+Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$,
+the negative log-likelihood of $\bm{x}$ is:
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
Note that:
\begin{itemize}

@@ -118,7 +118,7 @@ The parameters are determined as the most likely to predict the correct label gi
Moreover, as the dataset is identically distributed,
each $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same distribution.

-By applying the logarithm, we have that the negative log-likelihood of a i.i.d. dataset is define as:
+By applying the logarithm, we have that the negative log-likelihood of an i.i.d. dataset is defined as:
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
and to find good parameters $\vec{\uptheta}$, we solve the problem:
\[
@@ -173,7 +173,7 @@ The parameters are determined as the most likely to predict the correct label gi
\caption{When the parameters are bad, the label will be far from the mean}
\end{subfigure}

-\caption{Geometric interpretation of the Gaussian likelihood. (not sure if this is correct)}
+\caption{Geometric interpretation of the Gaussian likelihood}
\end{figure}
\end{description}

@@ -191,7 +191,7 @@ By applying the Bayes' theorem, the problem becomes:
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D}
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
-\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(Y \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
+\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
\end{split}
\]
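To make the likelihood machinery concrete, a small sketch (assuming NumPy/SciPy; the data and the two candidate parameter values are invented for illustration) that computes the i.i.d. negative log-likelihood under a Gaussian model:

\begin{lstlisting}[language=Python]
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
y = rng.normal(loc=2.0, scale=1.0, size=100)   # observations, true mean = 2

def nll(theta, y, sigma=1.0):
    # L(theta) = -sum_n log p_theta(y_n)
    return -np.sum(norm.logpdf(y, loc=theta, scale=sigma))

print(nll(2.0, y) < nll(0.0, y))  # the parameter closer to the truth has lower NLL
\end{lstlisting}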
@@ -10,6 +10,8 @@ then $\matr{A} \in \mathbb{R}^{n \times n}$ can be decomposed into:
where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and
$\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.

+Note that a symmetric matrix can always be decomposed (\Cref{th:spectral_theorem}).
+

\section{Singular value decomposition}

@@ -40,10 +42,10 @@ The singular value decomposition (SVD) of $\matr{A}$ is always possible and has
where:
\begin{itemize}
\item
-$\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix with columns $\vec{u}_i$ called left-singular vectors.
+$\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix whose columns $\vec{u}_i$ are called left-singular vectors.

\item
-$\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix with columns $\vec{v}_i$ called right-singular vectors.
+$\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix whose columns $\vec{v}_i$ are called right-singular vectors.

\item
$\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a matrix with $\matr{\Sigma}_{i,j} = 0$ (i.e. diagonal if it was a square matrix) and
@@ -79,8 +81,8 @@ For $\matr{A}^T\matr{A}$, we can compute:
\]
As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
\begin{itemize}
-\item The diagonal of $\matr{\Sigma}^2$ (i.e. the square of the singular values of $A$) are the eigenvalues of $\matr{A}^T\matr{A}$
-\item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$
+\item The diagonal of $\matr{\Sigma}^2$ (i.e. the square of the singular values of $A$) are the eigenvalues of $\matr{A}^T\matr{A}$.
+\item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
\end{itemize}

The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
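A numerical check of this relationship (a sketch assuming NumPy):

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((6, 4))

U, s, Vt = np.linalg.svd(A, full_matrices=False)
eigvals = np.linalg.eigvalsh(A.T @ A)        # eigenvalues of A^T A, ascending order

# Squared singular values = eigenvalues of A^T A
print(np.allclose(np.sort(s**2), eigvals))
\end{lstlisting}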
@@ -99,7 +101,8 @@ We can compute the 2-norm as:
\[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
\[
\Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} =
-\sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} = \sqrt{\max\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2}\}} = \frac{1}{\sigma_r}
+\sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} =
+\sqrt{\max \left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2} \right\}} = \frac{1}{\sigma_r}
\]
Furthermore, we can compute the condition number of $\matr{A}$ as:
\[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} \]
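The same quantities can be read off the singular values directly (a sketch assuming NumPy; the matrix is square and invertible so that $K(\matr{A})$ is defined):

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((5, 5))

s = np.linalg.svd(A, compute_uv=False)   # singular values, descending
norm2 = s[0]                             # ||A||_2 = sigma_1
cond = s[0] / s[-1]                      # K(A) = sigma_1 / sigma_r

print(np.isclose(norm2, np.linalg.norm(A, 2)), np.isclose(cond, np.linalg.cond(A, 2)))
\end{lstlisting}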
@@ -126,7 +129,7 @@ By considering only the first $k < r$ singular values, we can obtain a rank-$k$
\hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2
\]
\end{theorem}
-In other words, among all the possible projections, $\hat{\matr{A}}(k)$ is the closer one to $\matr{A}$.
+In other words, among all the possible projections, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$.
Moreover, the error of the rank-$k$ approximation is:
\[
\Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 =
@@ -152,32 +155,15 @@ Therefore, the compression factor is given by: \marginnote{Compression factor}


\subsection{Application: Linear least squares problem} \label{sec:lls}
-A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
-does not generally have a solution.
-\marginnote{Linear least squares}
-Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
-\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
-In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
-This problem is usually formulated as:
+Given a least squares problem:
\[
\tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
-It always admits a solution and, depending on $\text{rank}(\matr{A})$, there two possible cases:
-\begin{descriptionlist}
-\item[$\text{rank}(\matr{A}) = n$]
-The solution is unique for each $b \in \mathbb{R}^m$.
-\marginnote{Normal equation}
-It is found by solving the normal equation:
-\[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
-$\matr{A}^T\matr{A}$ is symmetric definite positive and the system can be solved using the Cholesky factorization.
-
-\item[$\text{rank}(\matr{A}) < n$] \marginnote{Least squares using SVD}
-The system admits infinite solutions.
-Of all the solutions $S$, we are interested in the one with minimum norm:
-\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
-This problem can be solved using SVD:
-\[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
-\end{descriptionlist}
+When $\text{rank}(\matr{A}) < n$, the system admits infinitely many solutions.
+Of all the solutions $S$, we are interested in the one with minimum norm:
+\[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
+This problem can be solved using SVD:
+\[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]


\subsection{Application: Polynomial interpolation}
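A numerical sketch of the SVD formula above for the rank-deficient case (assuming NumPy; $\matr{A}$ is made rank-deficient on purpose, and \texttt{np.linalg.lstsq} is used as the reference since it returns the minimum-norm solution):

\begin{lstlisting}[language=Python]
import numpy as np

rng = np.random.default_rng(0)
A = rng.standard_normal((8, 4))
A[:, 3] = A[:, 0] + A[:, 1]          # force rank(A) = 3 < n = 4
b = rng.standard_normal(8)

U, s, Vt = np.linalg.svd(A, full_matrices=False)
r = 3                                 # rank(A)
x_svd = sum((U[:, i] @ b) / s[i] * Vt[i] for i in range(r))

x_ref = np.linalg.lstsq(A, b, rcond=None)[0]  # minimum-norm least squares solution
print(np.allclose(x_svd, x_ref))
\end{lstlisting}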
@@ -3,9 +3,9 @@

\begin{description}
\item[Probability]
-model of a process where the underlying uncertainty is captured by random variables.
+Model of a process where the underlying uncertainty is captured by random variables.
\item[Statistics]
-determine the underlying process that explains an observation.
+Determines the underlying process that explains an observation.
\end{description}


@@ -23,7 +23,7 @@

\item[Probability] \marginnote{Probability}
Let $\mathcal{E}$ be the set of all the possible events (i.e. power set of $\Omega$).
-The probability is a function:
+The probability of an event is a function:
\[ \prob{A}: \mathcal{E} \rightarrow [0, 1] \]
\begin{example}
Let $\Omega$ be as above.
@@ -115,14 +115,14 @@
\begin{example}
A coin is tossed twice.

-The random variable is $X(\omega) = \{ \text{number of heads} \}$.
+Given the random variable $X(\omega) = \{ \text{number of heads} \}$.
We have that $\mathcal{T}_X = \{ 0, 1, 2 \}$, therefore $X$ is discrete.
\end{example}

\begin{example}
Roll a die until 6 comes out.

-The random variable is $Y(\omega) = \{ \text{number of rolls before 6} \}$.
+Given the random variable $Y(\omega) = \{ \text{number of rolls before 6} \}$.
We have that $\mathcal{T}_Y = \{ 1, 2, \dots \} = \mathbb{N} \smallsetminus \{0\}$,
therefore $Y$ is discrete as $\mathcal{T}_Y$ is a countable set.
\end{example}

@@ -143,7 +143,7 @@
\begin{example}
Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
Given a random variable $X = \{ \text{number of heads} \}$ with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
-The PMF is:
+Its PMF is:
\[
\begin{split}
p_X &= \prob{X = 0} = \frac{1}{4} \\
@@ -160,7 +160,7 @@
\begin{description}
\item[Continuous random variable] \marginnote{Continuous random variable}
A random variable $X$ is continuous if its target space $\mathcal{T}_X$ is uncountably infinite (i.e. a subset of $\mathbb{R}$).
-Usually, $\mathcal{T}_X$ is an interval or union of intervals.
+Usually, $\mathcal{T}_X$ is an interval or a union of intervals.

\begin{example}
Given a random variable $Z = \{ \text{Time before the arrival of a client} \}$.
@@ -210,25 +210,25 @@
\end{center}
We denote with:
\begin{itemize}
-\item $N$ the number of events
-\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p(x, y) = n_{ij}$)
-\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column
-\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row
+\item $N$ the number of events.
+\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p_{XY}(x, y) = n_{ij}$).
+\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column.
+\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row.
\end{itemize}

The marginal probabilities are:\\
\begin{minipage}{.48\linewidth}
\centering
-\[ p(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
+\[ p_X(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
-\[ p(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
+\[ p_Y(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
\end{minipage}

The conditional probabilities can be computed as:
-\[ \prob{Y = y_j \vert X = x_i} = \frac{p(x_i, y_i)}{p(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
-\[ \prob{X = x_i \vert Y = y_j} = \frac{p(x_i, y_i)}{p(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
+\[ \prob{Y = y_j \vert X = x_i} = \frac{p_{XY}(x_i, y_j)}{p_X(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
+\[ \prob{X = x_i \vert Y = y_j} = \frac{p_{XY}(x_i, y_j)}{p_Y(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
\end{example}
\end{description}
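A short sketch of these computations on a made-up count table (assuming NumPy; rows index $Y$ and columns index $X$, matching $n_{ij}$, $c_i$ and $r_j$ above):

\begin{lstlisting}[language=Python]
import numpy as np

# counts[j, i] = n_ij: number of events with Y = y_j (row) and X = x_i (column)
counts = np.array([[3, 1, 4, 2, 0],
                   [2, 5, 1, 0, 3],
                   [1, 2, 2, 4, 1]])
N = counts.sum()

p_X = counts.sum(axis=0) / N          # marginal p_X(x_i) = c_i / N
p_Y = counts.sum(axis=1) / N          # marginal p_Y(y_j) = r_j / N
p_Y_given_X = counts / counts.sum(axis=0, keepdims=True)  # P(Y=y_j | X=x_i) = n_ij / c_i

print(p_X.sum(), p_Y.sum())           # both marginals sum to 1
\end{lstlisting}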
@@ -240,18 +240,18 @@
\marginnote{Sum rule\\Marginalization property}
Given $X$ and $Y$ random variables. The sum rule states that:
\[
-p(\bm{x}) =
+p_X(\bm{x}) =
\begin{cases}
-\sum_{\bm{y} \in \mathcal{T}_Y} p(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
-\int_{\mathcal{T}_Y} p(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
+\sum_{\bm{y} \in \mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
+\int_{\mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
\end{cases}
\]

-The sum rule relates the joint distribution and a marginal distribution.
+The sum rule relates the joint distribution and the marginal distribution.
In fact, the sum rule can be applied to any subset of the random variables of a joint distribution.
Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$,
the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
-\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\backslash i} \]
+\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\smallsetminus i} \]

\subsection{Product rule}
\marginnote{Product rule}
@@ -302,7 +302,7 @@ Note: sometimes, instead of the full posterior, the maximum is considered (with

\begin{description}
\item[Statistic] \marginnote{Statistic}
-A statistic of a random variable is a deterministic function of it.
+A statistic of a random variable is a deterministic function defined on it.
\end{description}


@@ -447,7 +447,7 @@ Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
\marginnote{Inner product of random variables}
Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
\[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
-The covariance matrix is symmetric, positive definite.
+The covariance matrix is symmetric positive definite.

Moreover, we have that:
\begin{itemize}
@@ -465,7 +465,7 @@ Moreover, we have that:
\subsection{Discrete random variables}
\begin{descriptionlist}
\item[Uniform distribution] \marginnote{Uniform distribution}
-Given a discrete random variable $X$ with $\#(\mathcal{T}_X) = N$,
+Given a discrete random variable $X$ with $\vert \mathcal{T}_X \vert = N$,
$X$ has a uniform distribution if:
\[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
@@ -49,11 +49,11 @@
Let $f: \mathbb{R}^n \rightarrow \mathbb{R}$ and $\vec{g}$ a vector of $n$ functions $g_i: \mathbb{R}^m \rightarrow \mathbb{R}$:
\[
\frac{\partial}{\partial \vec{x}} (f \circ \vec{g})(\vec{x}) =
-\frac{\partial}{\partial \vec{x}} (f(\vec{g}(\vec{x}))) =
+\frac{\partial}{\partial \vec{x}} \Big( f(\vec{g}(\vec{x})) \Big) =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial \vec{x}}
\]

-More precisely, considering a $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables
+For instance, consider a $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables
$g_1(t), g_2(t): \mathbb{R} \rightarrow \mathbb{R}$ that are functions of $t$.
The gradient of $f$ with respect to $t$ is:
\[

@@ -71,7 +71,7 @@
the second matrix contains in the $i$-th row the gradient of $g_i$.

Therefore, if $g_i$ are in turn multivariate functions $g_1(s, t), g_2(s, t): \mathbb{R}^2 \rightarrow \mathbb{R}$,
-the chain rule can be applies as:
+the chain rule can be applied as follows:
\[
\frac{\text{d}f}{\text{d}(s, t)} =
\begin{pmatrix}
@@ -96,26 +96,26 @@
\end{example}

\begin{example}
-Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ g)(t)$ where:
-\[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(\vec{x}) = \exp(x_1 x_2^2) \]
+Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ \vec{g})(t) = f(\vec{g}(t))$ where:
+\[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(g_1, g_2) = \exp(g_1 g_2^2) \]
\[
-g: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as }
-\vec{g}(t) = \begin{pmatrix} x_1 \\ x_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
+\vec{g}: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as }
+\vec{g}(t) = \begin{pmatrix} g_1 \\ g_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
\]
The gradient of $h$ with respect to $t$ can be computed as:
\[
\frac{\text{d} h}{\text{d} t} =
\frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial t} =
\begin{pmatrix}
-\frac{\partial f}{\partial x_1} & \frac{\partial f}{\partial x_2}
+\frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
\end{pmatrix}
\begin{pmatrix}
-\frac{\partial x_1}{\partial t} \\ \frac{\partial x_2}{\partial t}
+\frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
\end{pmatrix}
\]
\[
=
-\begin{pmatrix} \exp(x_1 x_2^2)x_2^2 & 2\exp(x_1 x_2^2)x_1 x_2 \end{pmatrix}
+\begin{pmatrix} \exp(g_1 g_2^2)g_2^2 & 2\exp(g_1 g_2^2)g_1 g_2 \end{pmatrix}
\begin{pmatrix} \cos(t) + (-t\sin(t)) \\ \sin(t) + t\cos(t) \end{pmatrix}
\]
\end{example}
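A quick numerical check of this worked example (plain Python; the analytic expression is the matrix product above and the reference is a central finite difference):

\begin{lstlisting}[language=Python]
import math

def h(t):
    g1, g2 = t * math.cos(t), t * math.sin(t)
    return math.exp(g1 * g2**2)

def dh_dt(t):
    g1, g2 = t * math.cos(t), t * math.sin(t)
    df_dg1 = math.exp(g1 * g2**2) * g2**2          # partial f / partial g1
    df_dg2 = 2 * math.exp(g1 * g2**2) * g1 * g2    # partial f / partial g2
    dg1_dt = math.cos(t) - t * math.sin(t)
    dg2_dt = math.sin(t) + t * math.cos(t)
    return df_dg1 * dg1_dt + df_dg2 * dg2_dt       # chain rule

t, eps = 0.7, 1e-6
numeric = (h(t + eps) - h(t - eps)) / (2 * eps)
print(abs(dh_dt(t) - numeric) < 1e-6)
\end{lstlisting}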
@@ -210,7 +210,7 @@ We can more compactly denote a neural network with input $\vec{x}$ and $K$ layer
\vec{f}_i &= \sigma_i(\matr{A}_{i-1} \vec{f}_{i-1} + \vec{b}_{i-1}) \text{ } i=1, \dots, K
\end{split}
\]
-Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimizes the squared loss:
+Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimize the squared loss:
\[ L(\vec{\uptheta}) = \Vert \vec{y} - \vec{f}_K(\vec{\uptheta}, \vec{x}) \Vert^2 \]
where $\vec{\uptheta} = \{ \matr{A}_{0}, \vec{b}_{0}, \dots, \matr{A}_{K-1}, \vec{b}_{K-1} \}$ are the parameters of each layer.
This can be done by using the chain rule to compute the partial derivatives of $L$ with respect to the parameters $\vec{\uptheta}_j = \{ \matr{A}_j, \vec{b}_j \}$:
@@ -260,12 +260,12 @@ In other words, each intermediate variable is expressed as an elementary functio
The derivatives of $f$ can then be computed step-by-step going backwards as:
\[ \frac{\partial f}{\partial x_D} = 1 \text{, as by definition } f = x_D \]
\[
-\frac{\partial f}{\partial x_i} = \sum_{\forall x_j: x_i \in \text{Pa}(x_j)} \frac{\partial f}{\partial x_j} \frac{\partial x_j}{\partial x_i}
-= \sum_{\forall x_j: x_i \in \text{Pa}(x_j)} \frac{\partial f}{\partial x_j} \frac{\partial g_j}{\partial x_i}
+\frac{\partial f}{\partial x_i} = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial x_c}{\partial x_i}
+= \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial g_c}{\partial x_i}
\]
-where $\text{Pa}(x_j)$ is the set of parent nodes of $x_j$ in the graph.
+where $\text{Pa}(x_c)$ is the set of parent nodes of $x_c$ in the graph.
In other words, to compute the partial derivative of $f$ w.r.t. $x_i$,
-we apply the chain rule by first computing
+we apply the chain rule by computing
the partial derivative of $f$ w.r.t. the variables following $x_i$ in the graph (as the computation goes backwards).

Automatic differentiation is applicable to all functions that can be expressed as a computational graph and
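A toy reverse-mode implementation of this accumulation rule (a sketch, not from the notes; each node stores its parents together with the local partial derivatives, and \texttt{backward} sums $\frac{\partial f}{\partial x_c}\frac{\partial x_c}{\partial x_i}$ over the nodes $x_c$ that have $x_i$ as a parent, as in the formula above):

\begin{lstlisting}[language=Python]
import math

class Var:
    def __init__(self, value, parents=()):
        self.value = value
        self.parents = parents   # list of (parent, local partial d(self)/d(parent))
        self.grad = 0.0          # will hold df/d(self)

    def __add__(self, other):
        return Var(self.value + other.value, [(self, 1.0), (other, 1.0)])

    def __mul__(self, other):
        return Var(self.value * other.value, [(self, other.value), (other, self.value)])

def exp(x):
    e = math.exp(x.value)
    return Var(e, [(x, e)])

def backward(f):
    # Visit nodes in reverse topological order and accumulate
    # df/dx_i = sum over children x_c of (df/dx_c) * (dx_c/dx_i).
    order, seen = [], set()
    def visit(v):
        if id(v) not in seen:
            seen.add(id(v))
            for p, _ in v.parents:
                visit(p)
            order.append(v)
    visit(f)
    f.grad = 1.0                 # df/df = 1
    for v in reversed(order):
        for parent, local in v.parents:
            parent.grad += v.grad * local

# f(a, b) = exp(a * b) + a
a, b = Var(2.0), Var(3.0)
f = exp(a * b) + a
backward(f)
print(a.grad, b.grad)  # expected: b*exp(a*b) + 1 and a*exp(a*b)
\end{lstlisting}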
@@ -327,8 +327,8 @@ Note that backpropagation is a special case of automatic differentiation.
\begin{minipage}{.5\linewidth}
\[
\begin{split}
-\frac{\partial f}{\partial d} &= \text{ already known (previous step)} \\
-\frac{\partial f}{\partial e} &= \text{ already known (previous step)} \\
+\frac{\partial f}{\partial d} &= \text{ known (previous step)} \\
+\frac{\partial f}{\partial e} &= \text{ known (previous step)} \\
\frac{\partial f}{\partial c} &=
\frac{\partial f}{\partial d}\frac{\partial d}{\partial c} + \frac{\partial f}{\partial e}\frac{\partial e}{\partial c} \\
\end{split}