Add SMM MLE

2023-10-16 21:12:13 +02:00
parent 6068ed5518
commit aaa1a0e2ca
4 changed files with 209 additions and 25 deletions


@@ -63,7 +63,7 @@
\newtheorem*{definition}{Def}
\newcommand{\ubar}[1]{\text{\b{$#1$}}}
\renewcommand{\vec}[1]{{\bm{\mathbf{#1}}}}
\newcommand{\nullvec}[0]{\bar{\vec{0}}}
\newcommand{\matr}[1]{{\bm{#1}}}
\newcommand{\prob}[1]{{\mathcal{P}({#1})}}

Binary file (image, 9.6 KiB) not shown

Binary file (image, 9.7 KiB) not shown


@@ -14,16 +14,20 @@
$\vec{\uptheta} = \begin{pmatrix} \uptheta_0, \dots, \uptheta_D \end{pmatrix}$ is the parameter vector.
\item[Probabilistic model] \marginnote{Probabilistic model}
The model is a multivariate probabilistic distribution that
is able to quantify uncertainty in noisy data.
\end{description}
\section{Learning}
\subsection{Empirical risk minimization}
\marginnote{Empirical risk minimization}
Used for function models.
The parameters of the predictor are obtained by solving an optimization problem that minimizes the distance
between the predictions and the ground truth.
Let $(\vec{x}_n, y_n)$ be a dataset of $N$ elements
where $\vec{x}_n \in \mathbb{R}^D$ are the examples and $y_n \in \mathbb{R}$ are the labels.
@@ -41,14 +45,14 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
the dataset $(\vec{x}_n, y_n)$ is independent and identically distributed.
Therefore, the empirical mean is a good estimate of the population mean.
\item[Empirical risk] \marginnote{Empirical risk}
Given the example matrix $\matr{X} = \begin{pmatrix} \vec{x}_1, \dots, \vec{x}_N \end{pmatrix} \in \mathbb{R}^{N \times D}$
and the label vector $\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix} \in \mathbb{R}^N$,
the empirical risk is given by the average loss:
\[ \textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}, \vec{y}) = \frac{1}{N} \sum_{n=1}^{N} \ell(y_n, \hat{y}_n) \]
\begin{description}
\item[Least-squares loss] \marginnote{Least-squares loss}
The least-squares loss is defined as:
\[ \ell(y_n, \hat{y}_n) = (y_n - \hat{y}_n)^2 \]
@@ -58,27 +62,207 @@ We denote the output of the estimator as $\hat{y}_n = f_\vec{\uptheta}(\vec{x}_n
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \sum_{n=1}^{N} (y_n - \vec{\uptheta}^T\vec{x}_n)^2 =
\min_{\vec{\uptheta} \in \mathbb{R}^D} \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\]
\end{description}
\item[Expected risk] \marginnote{Expected risk}
The expected risk is defined as:
\[ \textbf{R}_\text{true}(f_\vec{\uptheta}) = \mathbb{E}_{\vec{x}, y}[\ell(y, f_\vec{\uptheta}(\vec{x}_\text{test}))] \]
where the parameters $\vec{\uptheta}$ are fixed and the samples are taken from a test set.
\item[Overfitting] \marginnote{Overfitting}
\sloppy
A predictor $f_\vec{\uptheta}$ is overfitting when $\textbf{R}_\text{emp}(f_\vec{\uptheta}, \matr{X}_\text{train}, \vec{y}_\text{train})$
underestimates $\textbf{R}_\text{true}(f_\vec{\uptheta})$ (i.e. the loss on the training set is low, but on the test set it is high).
\item[Regularization] \marginnote{Regularization}
Method that introduces a penalty term into the loss
to find a compromise between the accuracy and the complexity of the solution:
\[ \bar{\ell}(y_n, \hat{y}_n) = \ell(y_n, \hat{y}_n) + \lambda \mathcal{R}(\vec{\uptheta}) \]
where $\lambda \in \mathbb{R}^+$ is the regularization parameter and $\mathcal{R}$ is the regularizer (penalty term).
\begin{description}
\item[Regularized least squares] \marginnote{Regularized least squares}
A simple regularization term for the least squares problem is $\Vert \vec{\uptheta} \Vert^2$.
The problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D}
\{ \frac{1}{N} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
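Setting the gradient with respect to $\vec{\uptheta}$ to zero gives the closed-form solution
(the factor $N$ comes from the $\frac{1}{N}$ in the data term; we assume the inverse exists):
\[ \vec{\uptheta} = (\matr{X}^T\matr{X} + N\lambda\matr{I})^{-1}\matr{X}^T\vec{y} \]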
\end{description}
\end{description}
\subsection{Maximum likelihood estimation (MLE)}
% \marginnote{Maximum likelihood estimation (MLE)}
Used for probabilistic models.
The parameters are chosen as those under which the observed labels are most likely given the inputs.
\begin{description}
\item[Negative log-likelihood] \marginnote{Negative log-likelihood}
\sloppy
Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by the parameters $\vec{\uptheta}$ of a predictor,
the negative log-likelihood of $\bm{x}$ is:
\[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
Note that:
\begin{itemize}
\item The minus sign converts the problem of maximizing the likelihood into an equivalent minimization problem.
\item The logarithm improves numerical stability (products of small probabilities become sums) and, being monotonic, does not change the optimum.
\end{itemize}
$\mathcal{L}_{\bm{x}}(\vec{\uptheta})$ indicates how likely it is to observe $\bm{x}$ with
$\vec{\uptheta}$ as the parameters of the predictor.
Given a dataset $(\bm{x}_n, y_n)$ of $N$ independent and identically distributed (i.i.d.) elements,
optimizing the likelihood finds the parameters under which the observed dataset is most likely.
As the elements of the dataset are independent, the likelihood factorizes as:
\[ p_\vec{\uptheta}(\vec{y} \vert \matr{X}) = \prod_{n=1}^{N} p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
where $\matr{X} = \begin{pmatrix} \bm{x}_1, \dots, \bm{x}_N \end{pmatrix}$ and
$\vec{y} = \begin{pmatrix} y_1, \dots, y_N \end{pmatrix}$.
Moreover, as the dataset is identically distributed,
each factor $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same distribution.
By applying the logarithm, the negative log-likelihood of an i.i.d. dataset is defined as:
\[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
and to find good parameters $\vec{\uptheta}$, we solve the problem:
\[
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) =
\min_{\vec{\uptheta} \in \mathbb{R}^D} -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n)
\]
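As a minimal illustration, assume the dataset consists only of outputs $y_1, \dots, y_N$ drawn i.i.d. from
$\mathcal{N}(\mu, \sigma^2)$ with unknown mean $\mu$ and known variance (a simplified setting with no inputs).
The negative log-likelihood is:
\[ \mathcal{L}(\mu) = \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n - \mu)^2 + \text{const} \]
and setting its derivative with respect to $\mu$ to zero yields the sample mean:
\[ \hat{\mu} = \frac{1}{N} \sum_{n=1}^{N} y_n \]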
\begin{description}
\item[Gaussian likelihood] \marginnote{Gaussian likelihood}
We use a linear model $\bm{x}^T\vec{\uptheta}$ as predictor and
assume that the likelihood follows a Gaussian distribution:
\[ p_\vec{\uptheta}(y_n \,\vert\, \bm{x}_n) = \mathcal{N}(y_n \,\vert\, \bm{x}_n^T\vec{\uptheta}, \sigma^2) \]
i.e. for the $n$-th data point, the Gaussian has mean $\bm{x}_n^T\vec{\uptheta}$ (the prediction $f_\vec{\uptheta}(\bm{x}_n)$)
and variance $\sigma^2$.
The negative log-likelihood is:
\[
\begin{split}
\mathcal{L}(\vec{\uptheta}) &= -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \\
&= -\sum_{n=1}^{N} \log \mathcal{N}(y_n \vert \bm{x}_n^T\vec{\uptheta}, \sigma^2) \\
&= -\sum_{n=1}^{N} \log \left( \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) \right) \\
&= -\sum_{n=1}^{N} \log\exp\left(-\frac{(y_n-\bm{x}_n^T\vec{\uptheta})^2}{2\sigma^2}\right) - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}} \\
&= \frac{1}{2\sigma^2} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 - \sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}
\end{split}
\]
The minimization problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D} \mathcal{L}(\vec{\uptheta}) &=
\min_{\vec{\uptheta} \in \mathbb{R}^D}
\overbrace{\frac{1}{2\sigma^2}}^{\mathclap{\text{constant}}}
\sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 -
\overbrace{\sum_{n=1}^{N} \log\frac{1}{\sqrt{2\pi\sigma^2}}}^{\mathclap{\text{constant}}} \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \sum_{n=1}^{N} (y_n-\bm{x}_n^T\vec{\uptheta})^2 \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2
\end{split}
\]
which corresponds to the least squares problem.
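Setting the gradient of the objective to zero yields the closed-form maximum likelihood estimate
(assuming $\matr{X}^T\matr{X}$ is invertible):
\[ \vec{\uptheta}_\text{ML} = (\matr{X}^T\matr{X})^{-1}\matr{X}^T\vec{y} \]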
\end{description}
\begin{figure}[ht]
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
\caption{When the parameters are good, the label will be near the mean (i.e. the prediction)}
\end{subfigure}
\hspace*{1em}
\begin{subfigure}{.45\textwidth}
\centering
\includegraphics[width=.75\linewidth]{img/gaussian_mle_bad.png}
\caption{When the parameters are bad, the label will be far from the mean}
\end{subfigure}
\caption{Geometric interpretation of the Gaussian likelihood.}
\end{figure}
\end{description}
\subsection{Maximum a posteriori estimation (MAP)}
\marginnote{Maximum a posteriori (MAP)}
Maximum a posteriori estimation works with the reverse conditional distribution with respect to MLE: it maximizes the posterior
\[
\max_{\vec{\uptheta} \in \mathbb{R}^D} p(\vec{\uptheta} \vert \matr{X}, \vec{y})
\]
which is equivalent to minimizing $-p(\vec{\uptheta} \vert \matr{X}, \vec{y})$.
In other words, it maximizes the probability of a set of parameters $\vec{\uptheta}$ given the observation of the dataset $(\matr{X}, \vec{y})$.
By applying Bayes' theorem and then taking the logarithm, the problem becomes:
\[
\begin{split}
\min_{\vec{\uptheta} \in \mathbb{R}^D}
-\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
\min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
&= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
\end{split}
\]
\begin{description}
\item[Gaussian posteriori] \marginnote{Gaussian posteriori}
By assuming that the likelihood of the dataset follows a Gaussian distribution (as in MLE), we have that:
\[ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) = \frac{1}{2\sigma^2} \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \text{const} \]
Moreover, assuming a Gaussian prior $p(\vec{\uptheta}) = \mathcal{N}(\vec{0}, b^2\matr{I})$, we have that:
\[ -\log p(\vec{\uptheta}) = \frac{1}{2b^2} \Vert \vec{\uptheta} \Vert^2 + \text{const} \]
Therefore, dropping the additive constants and rescaling, the problem becomes:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ \Vert \vec{y} - \matr{X}\vec{\uptheta} \Vert^2 + \lambda \Vert \vec{\uptheta} \Vert^2 \} \]
where $\lambda = \frac{\sigma^2}{b^2}$.
MAP can therefore be seen as a regularized version of MLE.
\end{description}
\section{Linear regression}
\marginnote{Linear regression}
Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is a Gaussian noise.
We want to estimate the function $f$.
\begin{description}
\item[Model]
Because of the noise, we use a probabilistic model with likelihood:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
As model, we use a linear predictor:
\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
\item[Parameter estimation]
To estimate $\vec{\uptheta}$, we can use MLE:
\[ \min_{\vec{\uptheta} \in \mathbb{R}^D} -p_\vec{\uptheta}(\vec{y} \vert \matr{X}) \]
\end{description}
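For instance, with $D = 1$ and no bias term (a simplified special case), the model reduces to:
\[ y_n = \vartheta x_n + \varepsilon_n, \qquad \varepsilon_n \sim \mathcal{N}(0, \sigma^2) \]
i.e. the labels are scattered with variance $\sigma^2$ around the line $\vartheta x$,
and estimating $f$ amounts to estimating the slope $\vartheta$.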
\subsection{Maximum likelihood estimation with features}
\marginnote{MLE with features}
Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
Therefore, it is possible to apply an arbitrary (possibly non-linear) transformation $\phi$ to the inputs such that:
\[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ and $\vec{\uptheta} \in \mathbb{R}^K$.
The likelihood becomes:
\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, (\phi(\vec{x}))^T\vec{\uptheta}, \sigma^2) \]
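Collecting the transformed inputs into a feature matrix $\matr{\Phi} \in \mathbb{R}^{N \times K}$,
whose $n$-th row is $(\phi(\vec{x}_n))^T$, the maximum likelihood problem again reduces to least squares,
with closed-form solution (assuming $\matr{\Phi}^T\matr{\Phi}$ is invertible):
\[ \vec{\uptheta}_\text{ML} = (\matr{\Phi}^T\matr{\Phi})^{-1}\matr{\Phi}^T\vec{y} \]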
\begin{description}
\item[Polynomial regression] \marginnote{Polynomial regression}
The transformation function $\phi: \mathbb{R} \rightarrow \mathbb{R}^K$ is defined as:
\[
\phi(x) =
\begin{pmatrix}
\phi_0(x) \\ \phi_1(x) \\ \phi_2(x) \\ \vdots \\ \phi_{K-1}(x)
\end{pmatrix}
=
\begin{pmatrix}
1 \\ x \\ x^2 \\ \vdots \\ x^{K-1}
\end{pmatrix}
\]
The predictor is then defined as:
\[
\begin{split}
f(x) &= (\phi(x))^T \vec{\uptheta} \\
&= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i \\
&= \sum_{i=0}^{K-1} x^i \vartheta_i
\end{split}
\]
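For instance, with $K = 3$ the predictor is the parabola:
\[ f(x) = \vartheta_0 + \vartheta_1 x + \vartheta_2 x^2 \]
and the $n$-th row of the corresponding feature matrix is $(\phi(x_n))^T = \begin{pmatrix} 1 & x_n & x_n^2 \end{pmatrix}$.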
\end{description}