diff --git a/src/ainotes.cls b/src/ainotes.cls
index 61a807c..f91e37f 100644
--- a/src/ainotes.cls
+++ b/src/ainotes.cls
@@ -97,4 +97,7 @@
 \restoregeometry
 \newpage
 \pagenumbering{arabic}
-}
\ No newline at end of file
+}
+
+
+\newcommand{\eoc}[0]{\begin{flushright}\texttt{\raggedleft\small }\end{flushright}}
\ No newline at end of file
diff --git a/src/statistical-and-mathematical-methods-for-ai/main.tex b/src/statistical-and-mathematical-methods-for-ai/main.tex
index 7fc2ae9..c8f3386 100644
--- a/src/statistical-and-mathematical-methods-for-ai/main.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/main.tex
@@ -15,5 +15,6 @@
 \input{sections/_gradient_methods.tex}
 \input{sections/_probability.tex}
 \input{sections/_machine_learning.tex}
-
+ \eoc
+
 \end{document}
\ No newline at end of file
diff --git a/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex b/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
index ccd400d..de01bbd 100644
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
@@ -218,15 +218,15 @@ By applying the Bayes' theorem, the problem becomes:
 
 \section{Linear regression} \marginnote{Linear regression}
 Given a dataset of inputs $\vec{x}_n \in \mathbb{R}^D$ with corresponding labels $y_n = f(\vec{x}_n) + \varepsilon$,
-where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is a Gaussian noise.
-We want to estimate the function $f$.
+where $f: \mathbb{R}^D \rightarrow \mathbb{R}$ and $\varepsilon \sim \mathcal{N}(0, \sigma^2)$ is Gaussian noise,
+we want to estimate the function $f$.
 
 \begin{description}
 	\item[Model]
+		We use the linear predictor:
+		\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
 		Because of the noise, we use a probabilistic model with likelihood:
 		\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, f(\vec{x}), \sigma^2) \]
-		As model, we use a linear predictor:
-		\[ f(\vec{x}) = \vec{x}^T \vec{\uptheta} \]
 
 	\item[Parameter estimation]
 		To estimate $\vec{\uptheta}$, we can use MLE:
@@ -237,12 +237,51 @@ We want to estimate the function $f$.
 
 \subsection{Maximum likelihood estimation with features} \marginnote{MLE with features}
 Linear regression is linear only with respect to the parameters $\vec{\uptheta}$.
-Therefore, it is possible to apply any transformation to the inputs of $f$ such that:
+Therefore, it is possible to apply any transformation to the inputs of the predictor $f$ such that:
 \[ f(\vec{x}_n) = (\phi(\vec{x}_n))^T \vec{\uptheta} \]
-where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ and $\vec{\uptheta} \in \mathbb{R}^K$.
+where $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$ is a transformation and
+$\vec{\uptheta} \in \mathbb{R}^K$ are the parameters.
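+% Added example: the particular feature map below is only an illustration, not a prescribed choice.
+For instance, for $D = 2$ and $\vec{x} = (x_1, x_2)^T$, one possible transformation with $K = 6$ is:
+\[ \phi(\vec{x}) = (1, x_1, x_2, x_1^2, x_2^2, x_1 x_2)^T \]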
-The likelihood becomes:
-\[ p_\vec{\uptheta}(y \,\vert\, \vec{x}) = \mathcal{N}(y \,\vert\, (\phi(\vec{x}))^T\vec{\uptheta}, \sigma^2) \]
+Given a dataset of $N$ entries $\vec{x}_n \in \mathbb{R}^D$ with labels $y_n \in \mathbb{R}$
+and a transformation function $\phi: \mathbb{R}^D \rightarrow \mathbb{R}^K$,
+the transformed features can be expressed through a feature matrix $\matr{\Phi} \in \mathbb{R}^{N \times K}$:
+\[
+	\matr{\Phi} =
+	\begin{pmatrix}
+		(\phi(\vec{x}_1))^T \\ \vdots \\ (\phi(\vec{x}_N))^T
+	\end{pmatrix}
+	=
+	\begin{pmatrix}
+		\phi_0(\vec{x}_1) & \cdots & \phi_{K-1}(\vec{x}_1) \\
+		\vdots & \ddots & \vdots \\
+		\phi_0(\vec{x}_N) & \cdots & \phi_{K-1}(\vec{x}_N) \\
+	\end{pmatrix}
+\]
+
+The negative log-likelihood can then be written as:
+\[
+	-\log p_\vec{\uptheta}(\vec{y} \,\vert\, \matr{X}) =
+	\frac{1}{2\sigma^2} (\vec{y} - \matr{\Phi}\vec{\uptheta})^T (\vec{y} - \matr{\Phi}\vec{\uptheta}) + \text{constant}
+\]
+As the objective is convex and $\matr{\Phi}$ (usually) has full rank, the problem can be solved directly using the normal equations:
+\[
+	\matr{\Phi}^T \matr{\Phi} \vec{\uptheta} = \matr{\Phi}^T \vec{y} \iff
+	\vec{\uptheta} = (\matr{\Phi}^T \matr{\Phi})^{-1} \matr{\Phi}^T \vec{y}
+\]
+Alternatively, the negative log-likelihood can also be minimized using a gradient method.
+
+\begin{description}
+	\item[Root mean square error (RMSE)] \marginnote{Root mean square error (RMSE)}
+		The RMSE is computed as:
+		\[
+			\sqrt{ \frac{1}{N} \Vert \vec{y} - \matr{\Phi}\vec{\uptheta} \Vert^2 } =
+			\sqrt{ \frac{1}{N} \sum_{n=1}^{N}(y_n - (\phi(\vec{x}_n))^T\vec{\uptheta})^2 }
+		\]
+		Unlike the MSE, the RMSE allows comparing errors across datasets of different sizes
+		and is on the same scale as the labels.
+
+		By comparing the RMSE on the training and test sets, it is possible to check whether a model is overfitting.
+\end{description}
 
 
 \begin{description}
 	\item[Polynomial regression] \marginnote{Polynomial regression}
@@ -261,8 +300,7 @@ The likelihood becomes:
 		\[
 			\begin{split}
 				f(x) &= (\phi(x))^T \vec{\uptheta} \\
-				     &= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i \\
-				     &= \sum_{i=0}^{K-1} x^i \vartheta_i
+				     &= \sum_{i=0}^{K-1} \phi_i(x)\vartheta_i = \sum_{i=0}^{K-1} x^i \vartheta_i
 			\end{split}
 		\]
 \end{description}
\ No newline at end of file
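+
+% Added worked example: the data points below are arbitrary and only illustrate the normal equations and the RMSE defined above.
+As a small worked example with arbitrary points, fitting a degree-one polynomial ($K = 2$) to $(x_n, y_n) \in \{(0, 1), (1, 2), (2, 3)\}$ gives:
+\[
+	\matr{\Phi} =
+	\begin{pmatrix}
+		1 & 0 \\ 1 & 1 \\ 1 & 2
+	\end{pmatrix}
+	\qquad
+	\matr{\Phi}^T \matr{\Phi} =
+	\begin{pmatrix}
+		3 & 3 \\ 3 & 5
+	\end{pmatrix}
+	\qquad
+	\matr{\Phi}^T \vec{y} =
+	\begin{pmatrix}
+		6 \\ 8
+	\end{pmatrix}
+\]
+so the normal equations give $\vec{\uptheta} = (1, 1)^T$, i.e. $f(x) = 1 + x$, and the RMSE on these points is $0$.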