diff --git a/src/ainotes.cls b/src/ainotes.cls index 0323a80..b41a4ae 100644 --- a/src/ainotes.cls +++ b/src/ainotes.cls @@ -39,7 +39,8 @@ showspaces = false, showstringspaces = true, showtabs = false, - tabsize = 3 + tabsize = 3, + belowskip = -0.8\baselineskip } \lstset{style=mystyle} \lstset{language=Python} diff --git a/src/statistical-and-mathematical-methods-for-ai/img/_gradient_contour.pdf b/src/statistical-and-mathematical-methods-for-ai/img/_gradient_contour.pdf new file mode 100644 index 0000000..08403c7 Binary files /dev/null and b/src/statistical-and-mathematical-methods-for-ai/img/_gradient_contour.pdf differ diff --git a/src/statistical-and-mathematical-methods-for-ai/main.tex b/src/statistical-and-mathematical-methods-for-ai/main.tex index b7c882b..2356a8b 100644 --- a/src/statistical-and-mathematical-methods-for-ai/main.tex +++ b/src/statistical-and-mathematical-methods-for-ai/main.tex @@ -12,5 +12,6 @@ \input{sections/_linear_systems.tex} \input{sections/_matrix_decomp.tex} \input{sections/_vector_calculus.tex} + \input{sections/_gradient_methods.tex} \end{document} \ No newline at end of file diff --git a/src/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex b/src/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex new file mode 100644 index 0000000..0f1de01 --- /dev/null +++ b/src/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex @@ -0,0 +1,139 @@ +\chapter{Gradient methods} + + +\section{Minimum of a function} + +Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$. 
+\begin{descriptionlist} + \item[Stationary point] \marginnote{Stationary point} + $\vec{x}^*$ is a stationary point of $f$ iff: + \[ \nabla f(\vec{x}^*) = \nullvec \] + + \item[Local minimum] \marginnote{Local minimum} + $\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff, for some $\varepsilon > 0$: + \[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \] + + \item[Strict local minimum] \marginnote{Strict local minimum} + $\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff, for some $\varepsilon > 0$: + \[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \] + + \item[Global minimum] \marginnote{Global minimum} + $\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff: + \[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \] + + \item[Strict global minimum] \marginnote{Strict global minimum} + $\vec{x}^* \in \mathbb{R}^N$ is a strict global minimum of $f$ iff: + \[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \] +\end{descriptionlist} + + +\subsection{Optimality conditions} + +\begin{description} + \item[First order condition] \marginnote{First order condition} + Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in $\mathbb{R}^N$. + \[ \text{If } \vec{x}^* \text{ local minimum of } f \Rightarrow \nabla f(\vec{x}^*) = \nullvec \] + + \item[Second order condition] \marginnote{Second order condition} + Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and twice differentiable. + \[ + \text{If } \nabla f(\vec{x}^*) = \nullvec \text{ and } \nabla^2 f(\vec{x}^*) \text{ positive definite} \Rightarrow + \vec{x}^* \text{ strict local minimum of } f + \] +\end{description} + +As the second order condition requires computing the Hessian matrix, which is expensive, in practice only the first order condition is checked. 
+ + + +\section{Descent methods} + +\marginnote{Descent methods} +Descent methods are iterative methods that have the property: +\[ f(\vec{x}_{k+1}) < f(\vec{x}_k) \] + +The iteration is defined as: +\[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \] +where $\vec{p}_{k-1} \in \mathbb{R}^N$ is the search direction and \marginnote{Search direction\\Step length} +$\alpha_{k-1} \in \mathbb{R}$ is the step length. + +Note: descent methods usually converge to a local minimum. + +\begin{figure} + \centering + \includegraphics[width=0.5\linewidth]{img/_gradient_contour.pdf} + \caption{Descent method steps in $\mathbb{R}^2$ (i.e. moving across contour lines)} +\end{figure} + + +\subsection{Choice of the search direction} + +\begin{description} + \item[Descent direction] \marginnote{Descent direction} + $\vec{p} \in \mathbb{R}^N$ is a descent direction of $f$ in $\vec{x}$ if: + \[ \exists \bar{\alpha} > 0, \forall \alpha \in (0, \bar{\alpha}]: f(\vec{x} + \alpha \vec{p}) < f(\vec{x}) \] +\end{description} + +\begin{theorem} + Let $\vec{p} \in \mathbb{R}^N$, $\vec{p} \neq \nullvec$. + \[ \text{If } \vec{p}^T \nabla f(\vec{x}) < 0 \Rightarrow \vec{p} \text{ descent direction of } f \text{ in } \vec{x} \] +\end{theorem} + +\begin{theorem} + For all $\vec{x}$ with $\nabla f(\vec{x}) \neq \nullvec$, $\vec{p} = -\nabla f(\vec{x})$ is a descent direction of $f$ in $\vec{x}$. +\end{theorem} +\begin{proof} + \[ + \begin{split} + \vec{p}^T \nabla f(\vec{x}) < 0 &\iff -(\nabla f(\vec{x}))^T \nabla f(\vec{x}) < 0 \\ + &\iff - \Vert \nabla f(\vec{x}) \Vert_2^2 < 0 + \end{split} + \] + This holds as the norm of a non-zero vector is strictly positive. +\end{proof} + +\begin{description} + \item[Gradient-like methods] \marginnote{Gradient-like methods} + Gradient-like methods are descent methods that use $-\nabla f$ as search direction. 
+\end{description} + + +\subsection{Choice of the step length} +\begin{description} + \item[Constant] + In machine learning, it is common to set a constant value for the step (learning rate), + but it can be proved that this does not guarantee convergence. + + \item[Backtracking procedure] \marginnote{Backtracking procedure} + $\alpha_k$ is chosen such that it respects the sufficient decrease (Armijo) condition, i.e.\ the first Wolfe condition\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}: + \begin{lstlisting}[mathescape=true] + def backtracking($\tau$, $c_1$): + $\alpha_k$ = 1 # Initial guess + while $f(\vec{x}_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ - $c_1 \alpha_k \nabla f(\vec{x}_k)^T \nabla f(\vec{x}_k)$: + $\alpha_k$ = $\alpha_k$ / $\tau$ + return $\alpha_k$ + \end{lstlisting} + It can be proved that, by using the backtracking procedure, gradient methods converge to a local minimum. +\end{description} + + +\subsection{Stopping condition} +\marginnote{Stopping condition} +We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, $\nabla f(\vec{x}_k) \approx \nullvec$. +We can verify this by checking the norm of the gradient against a tolerance $\tau$: +\begin{descriptionlist} + \item[Absolute condition] $\Vert \nabla f(\vec{x}_k) \Vert_2 < \tau$ + \item[Relative condition] $\frac{\Vert \nabla f(\vec{x}_k) \Vert_2}{\Vert \nabla f(\vec{x}_0) \Vert_2} < \tau$ +\end{descriptionlist} + +A generic gradient-like method can then be defined as: +\begin{lstlisting}[mathescape=true] + def gradientMethod($f$, $\vec{x}_0$): + $k$ = 0 + while not stoppingCondition($f$, $\vec{x}_k$, $\vec{x}_0$): + $\vec{p}_k$ = $-\nabla f(\vec{x}_k)$ + $\alpha_k$ = backtracking($\dots$) + $\vec{x}_{k+1}$ = $\vec{x}_k$ + $\alpha_k \vec{p}_k$ + $k$ = $k$ + 1 + return $\vec{x}_k$ +\end{lstlisting} \ No newline at end of file