Add SMM MLE

This commit is contained in:
2023-10-16 21:12:13 +02:00
parent 6068ed5518
commit aaa1a0e2ca
4 changed files with 209 additions and 25 deletions


@ -63,7 +63,7 @@
\newtheorem*{definition}{Def}
\newcommand{\ubar}[1]{\text{\b{$#1$}}}
\renewcommand{\vec}[1]{{\mathbf{#1}}}
\renewcommand{\vec}[1]{{\bm{\mathbf{#1}}}}
\newcommand{\nullvec}[0]{\bar{\vec{0}}}
\newcommand{\matr}[1]{{\bm{#1}}}
\newcommand{\prob}[1]{{\mathcal{P}({#1})}}

Binary file not shown (added, 9.6 KiB)

Binary file not shown (added, 9.7 KiB)


@ -14,16 +14,20 @@
$\vec{\uptheta} = \begin{pmatrix} \uptheta_0, \dots, \uptheta_D \end{pmatrix}$ is the parameter vector.
\item[Probabilistic model] \marginnote{Probabilistic model}
The model is a multivariate probabilistic distribution.
The model is a multivariate probabilistic distribution that
is able to quantify uncertainty in noisy data.
\end{description}
\section{Learning}
\subsection{Empirical risk minimization}
\marginnote{Empirical risk minimization}
Used for function models.
The parameters of the predictor are obtained by solving an optimization problem that minimizes the distance
between the predictions and the ground truth.
Let $(\vec{x}_n, y_n)$ be a dataset of $N$ elements
where $\vec{x}_n \in \mathbb{R}^D$ are the examples and $y_n \in \mathbb{R}$ are the labels.
@ -41,14 +45,14 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
the dataset $(\vec{x}_n, y_n)$ is independent and identically distributed.
Therefore, the empirical mean is a good estimate of the population mean.
\begin{description}
\item[Empirical risk] \marginnote{Empirical risk}
Given the example matrix $\matr{X} = \begin{pmatrix} \vec{x}_1, \dots, \vec{x}_N \end{pmatrix} \in \mathbb{R}^{N \times D}$
and the label vector $\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix} \in \mathbb{R}^N$,
the empirical risk is given by the average loss:
\[ \textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}, \vec{y}) = \frac{1}{N} \sum_{n=1}^{N} \ell(y_n, \hat{y}_n) \]
\begin{example}[Least-squares loss] \marginnote{Least-squares loss}
\begin{description}
\item[Least-squares loss] \marginnote{Least-squares loss}
The least-squares loss is defined as:
\[ \ell(y_n, \hat{y}_n) = (y_n - \hat{y}_n)^2 \]
@ -58,7 +62,7 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - \vec{\uptheta}^T\vec{x}_n)^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\]
\end{example}
\end{description}
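A minimal NumPy sketch of these definitions on synthetic data (all names and numbers below are illustrative and not part of the notes):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D = 100, 3
X = rng.normal(size=(N, D))                         # example matrix (N x D)
theta_true = np.array([1.0, -2.0, 0.5])
y = X @ theta_true + rng.normal(scale=0.1, size=N)  # noisy labels

def empirical_risk(theta, X, y):
    """Average squared loss of the linear predictor x -> x^T theta."""
    return np.mean((y - X @ theta) ** 2)

# The least-squares estimate minimizes the empirical risk
theta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
print(empirical_risk(theta_hat, X, y))              # roughly the noise variance 0.01
\end{verbatim}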
\item[Expected risk] \marginnote{Expected risk}
The expected risk is defined as:
@ -66,6 +70,7 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
where the parameters $\vec{\uptheta}$ are fixed and the samples are taken from a test set.
\item[Overfitting] \marginnote{Overfitting}
\sloppy
A predictor $f_\vec{\uptheta}$ is overfitting when $\textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}_\text{train}, \vec{y}_\text{train})$
underestimates $\textbf{R}_\text{true}(f_\vec{\uptheta})$ (i.e. the loss on the training set is low, but the loss on the test set is high).
@ -73,12 +78,191 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
Method that introduces a penalty term to the loss that
helps to find a compromise between the accuracy and the complexity of the solution:
\[ \bar{\ell}(y_n, \hat{y}_n) = \ell(y_n, \hat{y}_n) + \lambda \mathcal{R}(\vec{\uptheta}) \]
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the penalty.
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the regularizer (penalty term).
\begin{description}
\item[Regularized least squares] \marginnote{Regularized least squares}
A simple regularization term for the least squares problem is $\Vert \vec{\uptheta} \Vert^2$.
The problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D}
\{ \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
\end{description}
\end{description}
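As a sketch of regularized least squares (again on illustrative synthetic data): setting the gradient of
$\frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2$ to zero
gives the linear system $(\matr{X}^T\matr{X} + N\lambda\matr{I})\vec{\uptheta} = \matr{X}^T\vec{y}$, which the code below solves directly.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, D, lam = 100, 3, 0.1                  # lam: illustrative regularization parameter
X = rng.normal(size=(N, D))
theta_true = np.array([1.0, -2.0, 0.5])
y = X @ theta_true + rng.normal(scale=0.1, size=N)

# Gradient of (1/N)||y - X theta||^2 + lam ||theta||^2 set to zero:
# (X^T X + N lam I) theta = X^T y
theta_ridge = np.linalg.solve(X.T @ X + N * lam * np.eye(D), X.T @ y)
theta_ls, *_ = np.linalg.lstsq(X, y, rcond=None)
print(np.linalg.norm(theta_ridge) < np.linalg.norm(theta_ls))  # True: the penalty shrinks theta
\end{verbatim}
The penalty shrinks the norm of the solution compared to the unregularized fit.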
\subsection{Maximum likelihood}
\marginnote{Maximum likelihood}
\subsection{Maximum likelihood estimation (MLE)}
% \marginnote{Maximum likelihood estimation (MLE)}
Used for probabilistic models.
The parameters are determined as those under which the observed labels are most likely given the inputs.
\begin{description}
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
\sloppy
Given a random variable $\bm{x}$, a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$
and a predictor, the negative log-likelihood of $\bm{x}$ is:
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
Note that:
\begin{itemize}
\item The minus sign converts the maximization of the likelihood into an equivalent minimization problem.
\item The logarithm improves numerical stability, as it turns a product of small probabilities into a sum (see the sketch below).
\end{itemize}
$\mathcal{L}_{\bm{x}}(\vec{\uptheta})$ indicates how likely it is to observe $\bm{x}$ with
$\vec{\uptheta}$ as the parameters of the predictor.
Given a dataset $(\bm{x}_n, y_n)$ of $N$ independent and identically distributed (i.i.d.) elements,
optimizing the likelihood allows us to find the parameters under which the dataset is most likely.
As the samples are independent, we have that:
\[ p_\vec{\uptheta}(\vec{y} \vert \matr{X}) = \prod_{n=1}^{N} p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
where $\matr{X} = \begin{pmatrix} \bm{x}_1, \dots, \bm{x}_N \end{pmatrix}$ and
$\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix}$.
Moreover, as the samples are identically distributed,
each factor $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same distribution.
By applying the logarithm, the negative log-likelihood of an i.i.d. dataset is defined as:
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
and to find good parameters $\vec{\uptheta}$, we solve the problem:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n)
\]
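A small NumPy sketch (with a standard Gaussian density chosen only for concreteness, and illustrative residuals) of why the product of likelihoods is replaced by a sum of log-likelihoods:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
r = rng.normal(size=10_000)                 # illustrative residuals y_n - x_n^T theta
p = np.exp(-r**2 / 2) / np.sqrt(2 * np.pi)  # Gaussian densities p_theta(y_n | x_n)

print(np.prod(p))             # the product of likelihoods underflows to 0.0
print(-np.sum(np.log(p)))     # the negative log-likelihood stays well behaved
\end{verbatim}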
\begin{description}
\item[Gaussian likelihood] \marginnote{Gaussian likelihood}
Using a linear model $\bm{x}^T\vec{\uptheta}$ as predictor and
assuming that the likelihood has a Gaussian distribution as follows:
\[ p_\vec{\uptheta}(y_n \,\vert\, \bm{x}_n) = \mathcal{N}(y_n \,\vert\, \bm{x}_n^T\vec{\uptheta}, \sigma^2) \]
where the Gaussian distribution has mean $\bm{x}_n^T\vec{\uptheta}$ (i.e. $f_\vec{\uptheta}(\bm{x}_n)$)
and variance $\sigma^2$ for the $n$-th data point.
The negative log-likelihood is:
\[
\begin{split}
\mathcal{L}(\vec{\uptheta}) &= -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \\
&= -\sum_{n=1}^{N} \log \mathcal{N}(y_n \vert \bm{x}_n^T\vec{\uptheta}, \sigma^2) \\
&= -\sum_{n=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) \right) \\
&= -\sum_{n=1}^{N} \log\exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}} \\
&= \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}
\end{split}
\]
The minimization problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) &=
\min_{\vec{\uptheta} \in \mathbb{R}^D}
\overbrace{\frac{1}{2\sigma^2}}^{\mathclap{\text{constant}}}
\sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 -
\overbrace{\sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}}^{\mathclap{\text{constant}}} \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\end{split}
\]
which corresponds to the least squares problem.
\end{description}
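A sketch on synthetic NumPy data (noise level, seed and sizes are arbitrary) checking numerically that the least-squares solution also minimizes the Gaussian negative log-likelihood:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(1)
N, D, sigma = 200, 3, 0.5                   # sigma: assumed known noise std
X = rng.normal(size=(N, D))
y = X @ rng.normal(size=D) + rng.normal(scale=sigma, size=N)

def nll(theta):
    """Gaussian negative log-likelihood of the dataset."""
    r = y - X @ theta
    return (np.sum(r**2) / (2 * sigma**2)
            + 0.5 * N * np.log(2 * np.pi * sigma**2))

theta_ls, *_ = np.linalg.lstsq(X, y, rcond=None)  # least-squares solution
print(nll(theta_ls) < nll(theta_ls + 0.1))        # True: the NLL increases away from it
\end{verbatim}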
\begin{figure}[ht]
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
\caption{When the parameters are good, the label will be near the mean (i.e. the prediction)}
\end{subfigure}
\hspace*{1em}
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_bad.png}
\caption{When the parameters are bad, the label will be far from the mean}
\end{subfigure}
\caption{Interpretation of the Gaussian likelihood: the likelihood of a label $y_n$ is the density of a Gaussian centered at the prediction $\bm{x}_n^T\vec{\uptheta}$.}
\end{figure}
\end{description}
\subsection{Maximum a posteriori estimation (MAP)}
\marginnote{Maximum a posteriori (MAP)}
Maximum a posteriori estimation reverses the roles of the parameters and the data with respect to MLE:
instead of the likelihood $p(\vec{y} \vert \matr{X}, \vec{\uptheta})$, it maximizes the posterior:
\[
\max_{\vec{\uptheta} \in \mathbb{R}^D} p(\vec{\uptheta} \vert \matr{X}, \vec{y}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{\uptheta} \vert \matr{X}, \vec{y})
\]
In other words, it maximizes the probability of a set of parameters $\vec{\uptheta}$ given the observation of the dataset $(\matr{X}, \vec{y})$.
By applying Bayes' theorem, the problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D}
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
\end{split}
\]
\begin{description}
\item[Gaussian posterior] \marginnote{Gaussian posterior}
By assuming that the conditional probability of the dataset follows a Gaussian distribution (as in MLE),
the problem becomes (up to additive constants):
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \} =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \left\{ \frac{1}{2\sigma^2} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 -\log p(\vec{\uptheta}) \right\}
\]
Moreover, assuming an isotropic Gaussian prior $p(\vec{\uptheta}) = \mathcal{N}(\nullvec, \matr{\Sigma})$ with $\matr{\Sigma} = b^2\matr{I}$,
we have, up to an additive constant:
\[ -\log p(\vec{\uptheta}) = \frac{1}{2b^2} \Vert \vec{\uptheta} \Vert^2 \]
Therefore, multiplying the objective by the constant $2\sigma^2$ and setting $\lambda = \frac{\sigma^2}{b^2}$, the problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
MAP can therefore be seen as a regularized version of MLE, where the prior acts as the regularizer.
\end{description}
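A sketch of the resulting MAP estimate on synthetic NumPy data (the prior standard deviation $b$, the noise level and the sizes are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(2)
N, D = 200, 3
sigma, b = 0.5, 1.0                   # noise std and prior std (illustrative)
X = rng.normal(size=(N, D))
y = X @ rng.normal(size=D) + rng.normal(scale=sigma, size=N)

lam = sigma**2 / b**2                 # regularization strength induced by the prior
# MAP estimate = solution of (X^T X + lam I) theta = X^T y
theta_map = np.linalg.solve(X.T @ X + lam * np.eye(D), X.T @ y)
print(theta_map)
\end{verbatim}
Up to a rescaling of $\lambda$ by $N$ (the regularized empirical risk averages over the dataset), this is the same linear system as in the regularized least-squares sketch above.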
\section{Linear regression}
\marginnote{Linear regression}
Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ is unknown and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is Gaussian noise,
we want to estimate the function $f$.
\begin{description}
\item[Model]
Because of the noise, we use a probabilistic model with likelihood:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
As the model, we use a linear predictor:
\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
\item[Parameter estimation]
To estimate $\vec{\uptheta}$, we can use MLE:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} -\log p_\vec{\uptheta}(\vec{y} \vert \matr{X}) \]
\end{description}
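A sketch of this setting on synthetic NumPy data (the true parameters and noise level are arbitrary): the labels are a linear function of the inputs plus Gaussian noise, and the MLE of $\vec{\uptheta}$ is the least-squares fit:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(3)
N, D, sigma = 500, 3, 0.3
X = rng.normal(size=(N, D))
theta_true = np.array([0.5, -1.0, 2.0])   # unknown parameters of f(x) = x^T theta
y = X @ theta_true + rng.normal(scale=sigma, size=N)  # y_n = f(x_n) + eps

theta_mle, *_ = np.linalg.lstsq(X, y, rcond=None)     # MLE of theta
print(theta_mle)                          # close to theta_true
print((y - X @ theta_mle).std())          # close to the noise std sigma = 0.3
\end{verbatim}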
\subsection{Maximum likelihood estimation with features}
\marginnote{MLE with features}
Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
Therefore, it is possible to apply any transformation to the inputs of $f$ such that:
\[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ and $\vec{\uptheta} \in \mathbb{R}^K$.
The likelihood becomes:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, (\phi(\vec{x}))^T\vec{\uptheta}, \sigma^2) \]
\begin{description}
\item[Polynomial regression] \marginnote{Polynomial regression}
The transformation function $\phi: \mathbb{R} \rightarrow \mathbb{R}^K$ is defined as:
\[
\phi(x) =
\begin{pmatrix}
\phi_0(x) \\ \phi_1(x) \\ \phi_2(x) \\ \vdots \\ \phi_{K-1}(x)
\end{pmatrix}
=
\begin{pmatrix}
1 \\ x \\ x^2 \\ \vdots \\ x^{K-1}
\end{pmatrix}
\]
The predictor is then defined as:
\[
\begin{split}
f(x) &= (\phi(x))^T \vec{\uptheta} \\
&= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i \\
&= \sum_{i=0}^{K-1} x^i \vartheta_i
\end{split}
\]
\end{description}
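A sketch of polynomial regression on synthetic NumPy data ($K$ and the coefficients are arbitrary): build the feature matrix with rows $(\phi(x_n))^T$ and solve the resulting linear least-squares problem:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(4)
N, K = 50, 4                              # K features: 1, x, x^2, x^3
x = rng.uniform(-1.0, 1.0, size=N)
y = 1 - 2 * x + 0.5 * x**3 + rng.normal(scale=0.05, size=N)

Phi = np.vander(x, K, increasing=True)    # row n is phi(x_n)^T = (1, x_n, ..., x_n^(K-1))
theta, *_ = np.linalg.lstsq(Phi, y, rcond=None)
print(theta)                              # approximately (1, -2, 0, 0.5)
\end{verbatim}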