diff --git a/src/year2/machine-learning-for-computer-vision/ainotes.cls b/src/year2/machine-learning-for-computer-vision/ainotes.cls
new file mode 120000
index 0000000..146fd3c
--- /dev/null
+++ b/src/year2/machine-learning-for-computer-vision/ainotes.cls
@@ -0,0 +1 @@
+../../ainotes.cls
\ No newline at end of file
diff --git a/src/year2/machine-learning-for-computer-vision/img/2order_1step.png b/src/year2/machine-learning-for-computer-vision/img/2order_1step.png
new file mode 100644
index 0000000..ddc4d84
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/2order_1step.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/_2order_optimizer.pdf b/src/year2/machine-learning-for-computer-vision/img/_2order_optimizer.pdf
new file mode 100644
index 0000000..c371bf6
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_2order_optimizer.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/adagrad.png b/src/year2/machine-learning-for-computer-vision/img/adagrad.png
new file mode 100644
index 0000000..c8be7f3
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/adagrad.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/adaptive_lr.png b/src/year2/machine-learning-for-computer-vision/img/adaptive_lr.png
new file mode 100644
index 0000000..0b4f21d
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/adaptive_lr.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/momentum.png b/src/year2/machine-learning-for-computer-vision/img/momentum.png
new file mode 100644
index 0000000..aa063b1
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/momentum.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/nesterov_comparison.png b/src/year2/machine-learning-for-computer-vision/img/nesterov_comparison.png
new file mode 100644
index 0000000..f55202e
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/nesterov_comparison.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/nesterov_momentum.png b/src/year2/machine-learning-for-computer-vision/img/nesterov_momentum.png
new file mode 100644
index 0000000..fc4ee57
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/nesterov_momentum.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/rmsprop.png b/src/year2/machine-learning-for-computer-vision/img/rmsprop.png
new file mode 100644
index 0000000..c0eb4bd
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/rmsprop.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/sgd_canyon.png b/src/year2/machine-learning-for-computer-vision/img/sgd_canyon.png
new file mode 100644
index 0000000..dac3ed3
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/sgd_canyon.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/sgd_local_minima.png b/src/year2/machine-learning-for-computer-vision/img/sgd_local_minima.png
new file mode 100644
index 0000000..91f110b
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/sgd_local_minima.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/sgd_sphere.png b/src/year2/machine-learning-for-computer-vision/img/sgd_sphere.png
new file mode 100644
index 0000000..11383e8
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/sgd_sphere.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/metadata.json b/src/year2/machine-learning-for-computer-vision/metadata.json
new file mode 100644
index 0000000..5c3bebf
--- /dev/null
+++ b/src/year2/machine-learning-for-computer-vision/metadata.json
@@ -0,0 +1,11 @@
+{
+    "name": "Machine Learning for Computer Vision",
+    "year": 2,
+    "semester": 1,
+    "pdfs": [
+        {
+            "name": null,
+            "path": "ml4cv.pdf"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex b/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex
new file mode 100644
index 0000000..7b9be4c
--- /dev/null
+++ b/src/year2/machine-learning-for-computer-vision/sections/_optimizers.tex
@@ -0,0 +1,235 @@
+\chapter{Optimizers}
+
+
+
+\section{Stochastic gradient descent with mini-batches}
+
+\begin{description}
+    \item[Stochastic gradient descent (SGD)] \marginnote{SGD}
+    Gradient descent based on a noisy approximation of the gradient, computed on mini-batches of $B$ data samples.
+
+    An epoch $e$ of SGD with mini-batches of size $B$ does the following:
+    \begin{enumerate}
+        \item Shuffle the training data $\matr{D}^\text{train}$.
+        \item For $u = 0, \dots, U-1$, with $U = \lceil \frac{N}{B} \rceil$ and $N$ the number of training samples:
+        \begin{enumerate}
+            \item Classify the examples $\matr{X}^{(u)} = \{ \vec{x}^{(Bu)}, \dots, \vec{x}^{(B(u+1)-1)} \}$
+                  to obtain the predictions $\hat{Y}^{(u)} = f(\matr{X}^{(u)}; \matr{\theta}^{(eU+u)})$
+                  and the loss $\mathcal{L}\big( \matr{\theta}^{(eU+u)}, (\matr{X}^{(u)}, \matr{Y}^{(u)}) \big)$, where $\matr{Y}^{(u)}$ are the corresponding ground-truth labels.
+            \item Compute the gradient $\nabla \mathcal{L} = \frac{\partial\mathcal{L}}{\partial \matr{\theta}}\big( \matr{\theta}^{(eU+u)}, (\matr{X}^{(u)}, \matr{Y}^{(u)}) \big)$.
+            \item Update the parameters $\matr{\theta}^{(eU+u+1)} = \matr{\theta}^{(eU+u)} - \texttt{lr} \cdot \nabla \mathcal{L}$.
+        \end{enumerate}
+    \end{enumerate}
+
+    A small numerical example of the number of updates per epoch is given at the end of this section.
+\end{description}
+
+\begin{remark}[Spheres] \marginnote{SGD spheres}
+    GD/SGD works well on convex, sphere-like functions (e.g., isotropic paraboloids), as the curvature is the same along every axis and there is no preferred direction to reach the minimum. Moreover, faster convergence can be obtained by using a higher learning rate.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.35\linewidth]{./img/sgd_sphere.png}
+    \end{figure}
+\end{remark}
+
+\begin{remark}[Canyons] \marginnote{SGD canyons}
+    A function has a canyon shape if it grows much faster in some directions than in others.
+
+    The trajectory of SGD oscillates across a canyon (along the steep directions), and a smaller learning rate is required to reach convergence. Note that, even though the trajectory oscillates, the loss still decreases, so the loss curve alone does not reveal the oscillating behavior.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/sgd_canyon.png}
+    \end{figure}
+\end{remark}

+\begin{remark}[Local minima] \marginnote{SGD local minima}
+    GD/SGD converges to a critical point. Therefore, it might end up in a saddle point or a local minimum.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.35\linewidth]{./img/sgd_local_minima.png}
+    \end{figure}
+\end{remark}
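+
+\begin{remark}
+    As a small worked example (with assumed values, not from the notes): with $N = 50\,000$ training samples and mini-batches of size $B = 128$, one epoch performs
+    \[ U = \left\lceil \frac{50\,000}{128} \right\rceil = \lceil 390.625 \rceil = 391 \]
+    parameter updates, each based on a gradient computed from (at most) $128$ samples.
+\end{remark}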
+
+
+
+\section{Second-order methods}
+
+Methods that also consider second-order derivatives when determining the update step.
+
+\begin{description}
+    \item[Newton's method] \marginnote{Newton's method}
+    Second-order method for the 1D case, based on the Taylor expansion:
+    \[ f(x_t + \Delta x) \approx f(x_t) + f'(x_t)\Delta x + \frac{1}{2}f''(x_t)\Delta x^2 \]
+    which can be seen as a parabola in the variable $\Delta x$.
+
+    Given a function $f$ and a point $x_t$, the method fits a parabola at $x_t$ with the same slope and curvature as $f$. The update is the step required to reach the minimum of this parabola from $x_t$: setting the derivative of the quadratic approximation to zero, $f'(x_t) + f''(x_t)\Delta x = 0$, gives the step $\Delta x = -\frac{f'(x_t)}{f''(x_t)}$.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/_2order_optimizer.pdf}
+    \end{figure}
+\end{description}
+
+\begin{remark}
+    For quadratic functions, second-order methods converge in one step, as the second-order Taylor expansion is exact and the update jumps directly to the stationary point.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.35\linewidth]{./img/2order_1step.png}
+    \end{figure}
+\end{remark}
+
+\begin{description}
+    \item[General second-order method] \marginnote{General second-order method}
+    For a generic multivariate non-quadratic function, the update is:
+    \[ -\texttt{lr} \cdot \matr{H}_f^{-1}(\vec{x}_t) \nabla f(\vec{x}_t) \]
+    where $\matr{H}_f$ is the Hessian matrix.
+
+    \begin{remark}
+        Given $k$ variables, $\matr{H}_f$ requires $O(k^2)$ memory. Moreover, inverting a matrix has time complexity $O(k^3)$. Therefore, in practice, second-order methods are not applicable to large models.
+    \end{remark}
+\end{description}
+
+
+
+\section{Momentum}
+
+\begin{description}
+    \item[Standard momentum] \marginnote{Standard momentum}
+    Add a velocity term $v^{(t)}$ to account for past gradient updates:
+    \[
+        \begin{split}
+            v^{(t+1)} &= \mu v^{(t)} - \texttt{lr} \nabla \mathcal{L}(\matr{\theta}^{(t)}) \\
+            \matr{\theta}^{(t+1)} &= \matr{\theta}^{(t)} + v^{(t+1)}
+        \end{split}
+    \]
+    where $\mu \in [0, 1[$ is the momentum coefficient.
+
+    In other words, $v^{(t+1)}$ is a geometrically weighted combination of the update steps performed up to time $t$ (see the unrolled form at the end of this section).
+
+    \begin{remark}
+        Momentum helps to counteract the poor conditioning of the Hessian matrix when working with canyons.
+    \end{remark}
+    \begin{remark}
+        Momentum helps to reduce the effect of the variance of the approximated gradients (i.e., it acts as a low-pass filter).
+    \end{remark}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/momentum.png}
+        \caption{
+            \parbox[t]{0.7\linewidth}{
+                Plain SGD vs momentum SGD on a sphere and in a canyon. In both cases, momentum converges faster than plain SGD.
+            }
+        }
+    \end{figure}
+
+    \item[Nesterov momentum] \marginnote{Nesterov momentum}
+    Variation of standard momentum that evaluates the gradient at the point reached after applying the velocity term:
+    \[
+        \begin{split}
+            v^{(t+1)} &= \mu v^{(t)} - \texttt{lr} \nabla \mathcal{L}(\matr{\theta}^{(t)} + \mu v^{(t)}) \\
+            \matr{\theta}^{(t+1)} &= \matr{\theta}^{(t)} + v^{(t+1)}
+        \end{split}
+    \]
+
+    \begin{remark}
+        The key idea is that, once $\mu v^{(t)}$ is added to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is already obsolete, as the parameters have been partially updated.
+    \end{remark}
+
+    \begin{remark}
+        In practice, Nesterov momentum can be reformulated (by a change of variables) so that the gradient does not need to be computed at the shifted point $\matr{\theta}^{(t)} + \mu v^{(t)}$; a sketch of one such reformulation is given at the end of this section.
+    \end{remark}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.35\linewidth]{./img/nesterov_momentum.png}
+        \caption{Visualization of the step in Nesterov momentum}
+    \end{figure}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.75\linewidth]{./img/nesterov_comparison.png}
+        \caption{Plain SGD vs standard momentum vs Nesterov momentum}
+    \end{figure}
+\end{description}
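+
+\begin{remark}
+    For standard momentum, unrolling the velocity recursion (assuming $v^{(0)} = 0$, which is not specified above) makes the weighting of past gradients explicit:
+    \[ v^{(t+1)} = -\texttt{lr} \sum_{k=0}^{t} \mu^{t-k} \nabla \mathcal{L}(\matr{\theta}^{(k)}) \]
+    i.e., older gradients are geometrically down-weighted by powers of $\mu$.
+\end{remark}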
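+
+\begin{remark}
+    As a sketch of one such reformulation of Nesterov momentum (a known change of variables, not derived in these notes): tracking the look-ahead variable $\tilde{\matr{\theta}}^{(t)} = \matr{\theta}^{(t)} + \mu v^{(t)}$ instead of $\matr{\theta}^{(t)}$, the updates become
+    \[
+        \begin{split}
+            v^{(t+1)} &= \mu v^{(t)} - \texttt{lr} \nabla \mathcal{L}(\tilde{\matr{\theta}}^{(t)}) \\
+            \tilde{\matr{\theta}}^{(t+1)} &= \tilde{\matr{\theta}}^{(t)} + \mu v^{(t+1)} - \texttt{lr} \nabla \mathcal{L}(\tilde{\matr{\theta}}^{(t)})
+        \end{split}
+    \]
+    so the gradient is always evaluated at the variable that is actually stored, with no separate look-ahead point.
+\end{remark}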
+
+
+
+\section{Adaptive learning rate methods}
+
+\begin{description}
+    \item[Adaptive learning rates] \marginnote{Adaptive learning rates}
+    Methods to define per-parameter adaptive learning rates.
+
+    Ideally, assuming that the directions of different curvature of the loss are axis-aligned (e.g., in a canyon), it is reasonable to expect faster convergence by:
+    \begin{itemize}
+        \item Reducing the learning rate along the dimensions where the gradient is large.
+        \item Increasing the learning rate along the dimensions where the gradient is small.
+    \end{itemize}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.35\linewidth]{./img/adaptive_lr.png}
+        \caption{
+            \parbox[t]{0.5\linewidth}{Loss where the $w_1$ parameter has a larger gradient, while $w_2$ has a smaller gradient}
+        }
+    \end{figure}
+
+    \begin{remark}
+        As the landscape of a high-dimensional loss cannot be inspected directly, automatic methods to adjust the per-parameter learning rates are needed.
+    \end{remark}
+\end{description}
+
+
+\subsection{AdaGrad}
+
+\begin{description}
+    \item[Adaptive gradient (AdaGrad)] \marginnote{Adaptive gradient (AdaGrad)}
+    Each entry of the gradient is rescaled by the inverse square root of the accumulated sum of its past squared values:
+    \[
+        \begin{split}
+            \mathbb{R}^{P \times 1} \ni \vec{s}^{(t+1)} &= \vec{s}^{(t)} + \nabla\mathcal{L}(\vec{\theta}^{(t)}) \odot \nabla\mathcal{L}(\vec{\theta}^{(t)}) \\
+            \mathbb{R}^{P \times 1} \ni \vec{\theta}^{(t+1)} &= \vec{\theta}^{(t)} - \frac{\texttt{lr}}{\sqrt{\vec{s}^{(t+1)}} + \varepsilon} \odot \nabla\mathcal{L}(\vec{\theta}^{(t)})
+        \end{split}
+    \]
+    where:
+    \begin{itemize}
+        \item $P$ is the number of parameters.
+        \item $\odot$ is the element-wise product.
+        \item Division and square root are element-wise.
+        \item $\varepsilon$ is a small constant that avoids division by zero.
+    \end{itemize}
+
+    \begin{remark}
+        By construction, $\vec{s}^{(t)}$ is monotonically non-decreasing, which might shrink the effective learning rate too early, when the minimum is still far away.
+    \end{remark}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.4\linewidth]{./img/adagrad.png}
+        \caption{
+            \parbox[t]{0.45\linewidth}{SGD vs AdaGrad. AdaGrad stops before getting close to the minimum.}
+        }
+    \end{figure}
+\end{description}
+
+
+\subsection{RMSProp}
+
+\begin{description}
+    \item[RMSProp] \marginnote{RMSProp}
+    Modified version of AdaGrad that down-weights the gradient history $\vec{s}^{(t)}$:
+    \[
+        \begin{split}
+            \vec{s}^{(t+1)} &= \beta \vec{s}^{(t)} + (1-\beta) \nabla\mathcal{L}(\vec{\theta}^{(t)}) \odot \nabla\mathcal{L}(\vec{\theta}^{(t)}) \\
+            \vec{\theta}^{(t+1)} &= \vec{\theta}^{(t)} - \frac{\texttt{lr}}{\sqrt{\vec{s}^{(t+1)}} + \varepsilon} \odot \nabla\mathcal{L}(\vec{\theta}^{(t)})
+        \end{split}
+    \]
+    where $\beta \in [0, 1[$ (typically $0.9$ or higher) makes $\vec{s}^{(t)}$ an exponential moving average of the squared gradients.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.4\linewidth]{./img/rmsprop.png}
+        \caption{SGD vs AdaGrad vs RMSProp}
+    \end{figure}
+\end{description}
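+
+\begin{remark}
+    As an illustrative sketch (with an assumed setting, not from the notes): consider a single coordinate whose gradient keeps a roughly constant magnitude $g$ over time. Under AdaGrad, the corresponding entry of the accumulator grows as $s^{(t)} \approx t g^2$, so the effective step shrinks as
+    \[ \frac{\texttt{lr}}{\sqrt{t g^2} + \varepsilon} \cdot g \approx \frac{\texttt{lr}}{\sqrt{t}} \]
+    even if the minimum has not been reached yet. Under RMSProp, the same entry converges to $s^{(t)} \approx g^2$, so the effective step stays close to $\texttt{lr}$.
+\end{remark}
\ No newline at end of file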