mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Add SMM gradient descent
This commit is contained in:
@ -17,6 +17,7 @@
|
||||
\usepackage{scrhack, algorithm, listings}
|
||||
\usepackage{array, makecell}
|
||||
\usepackage{acro}
|
||||
\usepackage{subcaption}
|
||||
|
||||
\geometry{ margin=3cm, lmargin=1.5cm, rmargin=4.5cm, marginparwidth=3cm }
|
||||
\hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black, linktoc=all }
|
||||
@ -55,6 +56,7 @@
|
||||
|
||||
\theoremstyle{definition}
|
||||
\newtheorem{theorem}{Theorem}[section]
|
||||
\newtheorem{corollary}{Corollary}[theorem]
|
||||
\newtheorem*{example}{Example}
|
||||
\theoremstyle{definition}
|
||||
\newtheorem*{definition}{Def}
|
||||
|
||||
Binary file not shown.
BIN
src/statistical-and-mathematical-methods-for-ai/img/cliff.png
Normal file
BIN
src/statistical-and-mathematical-methods-for-ai/img/cliff.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 73 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 31 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 9.3 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 9.4 KiB |
BIN
src/statistical-and-mathematical-methods-for-ai/img/valley.png
Normal file
BIN
src/statistical-and-mathematical-methods-for-ai/img/valley.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 133 KiB |
@ -26,6 +26,8 @@ Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in
|
||||
\[ f(\vec{x}^*) < f(\vec{x}) \quad \forall \vec{x} \in \mathbb{R}^N \setminus \{\vec{x}^*\} \]
|
||||
\end{descriptionlist}
|
||||
|
||||
Note that $\max f(x) = \min -f(x)$.
|
||||
|
||||
|
||||
\subsection{Optimality conditions}
|
||||
|
||||
@ -109,7 +111,7 @@ Note: descent methods usually converge to a local minimum.
|
||||
\begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
|
||||
def backtracking($\tau$, $c_1$):
|
||||
$\alpha_k$ = 1 # Initial guess
|
||||
while $f(x_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ + $c_1 \alpha_k \nabla f(\vec{x}_k)^T \nabla f(\vec{x}_k)$:
|
||||
while $f(\vec{x}_k - \alpha_k \nabla f(\vec{x}_k))$ > $f(\vec{x}_k)$ - $c_1 \alpha_k \nabla f(\vec{x}_k)^T \nabla f(\vec{x}_k)$:
|
||||
$\alpha_k$ = $\alpha_k$ / $\tau$
|
||||
return $\alpha_k$
|
||||
\end{lstlisting}
|
||||
@ -137,3 +139,202 @@ A generic gradient-like method can then be defined as:
|
||||
$k$ = $k$ + 1
|
||||
return $x_k$
|
||||
\end{lstlisting}
|
||||
|
||||
|
||||
\subsection{Problems}
|
||||
|
||||
\begin{description}
|
||||
\item[Choice of the initialization point] \marginnote{Initialization point}
|
||||
The starting point of an iterative method is a user-defined parameter.
|
||||
For simple problems, it is usually chosen randomly in $[-1, +1]$.
|
||||
|
||||
For complex problems, the choice of the initialization point is critical as
|
||||
it may cause numerical instabilities or bad results.
|
||||
Heuristics can be used to select an adequate starting point.
|
||||
|
||||
\item[Flat regions and local optima] \marginnote{Flat regions and local optima}
|
||||
Flat regions slow down the learning speed,
|
||||
while a local optimum causes the method to converge to a poor solution.
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.9\textwidth]{img/_descent_local_flat.pdf}
|
||||
\caption{Flat regions and local minima}
|
||||
\end{figure}
|
||||
|
||||
\item[Differential curvature]
|
||||
Different magnitudes of the partial derivatives may cause the problem of
|
||||
vanishing and exploding gradient. \marginnote{Vanishing gradient\\Exploding gradient}
|
||||
This causes the learning process to require more iterations to correct the direction.
|
||||
|
||||
In practice, as the gradient of complex functions is only an instantaneous direction of best decrease and
|
||||
does not represent the direction to the minimum in the long term,
|
||||
many updates are required for a gradient method to converge.
|
||||
|
||||
A method to mitigate this issue is to use feature normalization techniques.
|
||||
|
||||
\item[Non-differentiable objective function]
|
||||
If the objective function has a small number of non-differentiable points,
|
||||
the gradient descent method can be applied with minor modifications.
|
||||
|
||||
If lots of points are non-differentiable, the gradients will not be informative enough
|
||||
to determine a decrease direction.
|
||||
|
||||
\item[Difficult topologies]
|
||||
\marginnote{Cliff}
|
||||
A cliff in the objective function causes problems when evaluating the gradient at the edge.
|
||||
With a small step size, there is a slowdown in convergence.
|
||||
With a large step size, there is an overshoot that may cause the algorithm to diverge.
|
||||
% a slow down when evaluating
|
||||
% the gradient at the edge using a small step size and
|
||||
% an overshoot when the step is too large.
|
||||
|
||||
\marginnote{Valley}
|
||||
A valley in the objective function causes a gradient method to bounce between the sides
|
||||
to a point where no significant progress can be made.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.30\linewidth]{img/cliff.png}
|
||||
\caption{Cliff region}
|
||||
\end{subfigure}%
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.30\linewidth]{img/valley.png}
|
||||
\caption{Ping pong tournament in a valley}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Convex functions}
|
||||
|
||||
\begin{description}
|
||||
\item[Convex set] \marginnote{Convex set}
|
||||
Informally, a set is convex if, for any two points of the set,
|
||||
the points lying on the segment connecting them are also part of the set.
|
||||
|
||||
\begin{figure}[ht]
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.25\linewidth]{img/convex_set.png}
|
||||
\caption{Convex set}
|
||||
\end{subfigure}%
|
||||
\begin{subfigure}{.5\textwidth}
|
||||
\centering
|
||||
\includegraphics[width=.25\linewidth]{img/non_convex_set.png}
|
||||
\caption{Non-convex set}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\item[Convex function] \marginnote{Convex function}
|
||||
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
|
||||
$f$ is convex if:
|
||||
\[
|
||||
\forall \vec{x}_1, \vec{x}_2 \in \Omega, \forall t \in [0, 1]:
|
||||
f(t\vec{x}_1 + (1-t)\vec{x}_2) \leq t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
|
||||
\]
|
||||
|
||||
In other words, the segment connecting any two points of the function lies above the graph.
|
||||
\begin{figure}[ht]
|
||||
\centering
|
||||
\includegraphics[width=0.55\textwidth]{img/convex_function.png}
|
||||
\caption{Convex function}
|
||||
\end{figure}
|
||||
|
||||
\item[Strictly convex function] \marginnote{Strictly convex function}
|
||||
Let $\Omega \subseteq \mathbb{R}^n$ be a convex set and $f: \Omega \rightarrow \mathbb{R}$.
|
||||
$f$ is strictly convex if:
|
||||
\[
|
||||
\forall \vec{x}_1, \vec{x}_2 \in \Omega \text{ with } \vec{x}_1 \neq \vec{x}_2, \forall t \in (0, 1):
|
||||
f(t\vec{x}_1 + (1-t)\vec{x}_2) < t f(\vec{x}_1) + (1-t) f(\vec{x}_2)
|
||||
\]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Properties}
|
||||
\marginnote{Convex properties}
|
||||
\begin{itemize}
|
||||
\item $\text{if } f \text{ convex} \Rightarrow \text{ any local minimum of } f \text{ is also global}$
|
||||
\item $\text{if } f \text{ strictly convex} \Rightarrow \text{ the global minimum of } f \text{ is unique}$
|
||||
\item $\text{if } f \text{ convex and differentiable} \Rightarrow \text{ any stationary point of } f \text{ is a global minimum}$
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\subsection{Quadratic functions}
|
||||
\marginnote{Quadratic function}
|
||||
A quadratic function has form:
|
||||
\[ f(\vec{x}) = \frac{1}{2}\vec{x}^T\matr{A}\vec{x} - \vec{x}^T\vec{b} + c \]
|
||||
where $\matr{A} \in \mathbb{R}^{n \times n}$, $\vec{b} \in \mathbb{R}^n$ and $c \in \mathbb{R}$.
|
||||
|
||||
\begin{theorem}
|
||||
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive semidefinite,
|
||||
then $f$ is convex.
|
||||
\end{theorem}
|
||||
|
||||
\begin{theorem}
|
||||
If $f$ is a quadratic form with $\matr{A} \in \mathbb{R}^{n \times n}$ symmetric positive definite,
|
||||
then $f$ is strictly convex.
|
||||
\end{theorem}
|
||||
|
||||
\begin{theorem}
|
||||
\marginnote{Least squares quadratic function}
|
||||
The least squares problem $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is a quadratic function.
|
||||
\end{theorem}
|
||||
\begin{proof}
|
||||
\[
|
||||
\begin{split}
|
||||
(\matr{A}\vec{x} - \vec{b})^T(\matr{A}\vec{x} - \vec{b}) &= (\vec{x}^T\matr{A}^T - \vec{b}^T)(\matr{A}\vec{x} - \vec{b}) \\
|
||||
&= \vec{x}^T\matr{A}^T\matr{A}\vec{x} - \vec{b}^T\matr{A}\vec{x} - \vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \\
|
||||
\end{split}
|
||||
\]
|
||||
As $\vec{b}^T\matr{A}\vec{x} = \vec{x}^T\matr{A}^T\vec{b}$, we have:
|
||||
\[ \vec{x}^T\matr{A}^T\matr{A}\vec{x} - 2\vec{x}^T\matr{A}^T\vec{b} + \vec{b}^T\vec{b} \]
|
||||
|
||||
Let $\matr{B} = \matr{A}^T\matr{A}$, $\vec{q} = \matr{A}^T\vec{b}$ and $c = \vec{b}^T\vec{b}$,
|
||||
we have the quadratic form:
|
||||
\[ \vec{x}^T\matr{B}\vec{x} - 2\vec{x}^T\vec{q} + c \]
|
||||
|
||||
$\matr{B}$ is symmetric positive semidefinite (i.e. $\Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2$ is convex).
|
||||
Moreover, when $\matr{A}$ is full-rank, $\matr{B}$ is symmetric positive definite (i.e. strictly convex).
|
||||
\end{proof}
|
||||
|
||||
|
||||
|
||||
\section{Gradient descent with momentum}
|
||||
\marginnote{Momentum}
|
||||
The momentum is an additional term to keep track of previous iterations:
|
||||
\[
|
||||
\Delta \vec{x}_k = \vec{x}_k - \vec{x}_{k-1} = \gamma \Delta \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1})
|
||||
\]
|
||||
where $\gamma \in [0, 1]$. An iteration is therefore defined as:
|
||||
\[
|
||||
\vec{x}_k = \vec{x}_{k-1} - \alpha_{k-1}\nabla f(\vec{x}_{k-1}) + \gamma \Delta\vec{x}_{k-1}
|
||||
\]
|
||||
|
||||
|
||||
|
||||
\section{Stochastic gradient descent (SGD)}
|
||||
\marginnote{Stochastic gradient descent}
|
||||
SGD is a stochastic variant of gradient descent that replaces the exact gradient with a cheaper estimate computed on a subset of the data.
|
||||
Given $N$ data points, the loss can be defined as the sum of the individual losses:
|
||||
\[ L(\vec{x}) = \sum_{n=1}^{N} L_n(\vec{x}) \]
|
||||
where $\vec{x}$ is the vector of parameters.
|
||||
The corresponding gradient can be computed as:
|
||||
\[ \nabla L(\vec{x}) = \sum_{n=1}^{N} \nabla L_n(\vec{x}) \]
|
||||
|
||||
\marginnote{Mini-batch}
|
||||
SGD reduces the amount of computation by approximating the gradient with a subset (mini-batch) $B$ of $\nabla L_n$:
|
||||
\[ \nabla L(\vec{x}) \approx \sum_{i \in B} \nabla L_i(\vec{x}) \]
|
||||
|
||||
\begin{theorem}
|
||||
Under some assumptions and with an appropriate decrease in learning rate,
|
||||
SGD is guaranteed to converge to a local minimum.
|
||||
\end{theorem}
|
||||
|
||||
Different sizes of the mini-batch result in different behavior:
|
||||
\begin{descriptionlist}
|
||||
\item[Large mini-batches] provide accurate estimates of the gradient.
|
||||
\item[Small mini-batches] allow faster computation.
|
||||
\end{descriptionlist}
|
||||
Reference in New Issue
Block a user