From 5e8b070fe50b9470bb5cabadc73fb9f955d633cf Mon Sep 17 00:00:00 2001 From: NotXia <35894453+NotXia@users.noreply.github.com> Date: Mon, 11 Mar 2024 20:06:24 +0100 Subject: [PATCH] Add DL backpropagation --- src/deep-learning/dl.tex | 1 + src/deep-learning/sections/_training.tex | 186 +++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 src/deep-learning/sections/_training.tex diff --git a/src/deep-learning/dl.tex b/src/deep-learning/dl.tex index 9d5d923..8d6cc83 100644 --- a/src/deep-learning/dl.tex +++ b/src/deep-learning/dl.tex @@ -8,5 +8,6 @@ \makenotesfront \input{./sections/_expressivity.tex} + \input{./sections/_training.tex} \end{document} \ No newline at end of file diff --git a/src/deep-learning/sections/_training.tex b/src/deep-learning/sections/_training.tex new file mode 100644 index 0000000..cb658df --- /dev/null +++ b/src/deep-learning/sections/_training.tex @@ -0,0 +1,186 @@ +\chapter{Training} + + +\section{Gradient descent} + +\begin{enumerate} + \item + \marginnote{Gradient descent} + Start from a random set of weights $w$. + \item Compute the gradient $\nabla \mathcal{L}$ of the loss function. + \item Make a small step in the direction of $-\nabla \mathcal{L}(w)$. + \item Go to 2., until convergence. +\end{enumerate} + +\begin{description} + \item[Learning rate] \marginnote{Learning rate} + Size of the step. Usually denoted with $\mu$. + \[ w = w - \mu \nabla \mathcal{L}(w) \] + + \item[Optimizer] \marginnote{Optimizer} + Algorithm that tunes the learning rate during training. + + \item[Stochastic gradient descent] \marginnote{Stochastic gradient descent} + Use a subset of the training data to compute the gradient. + \begin{description} + \item[Full-batch] Use the entire dataset. + \item[Mini-batch] Use a subset of the training data. + \item[Online] Use a single sample. + \end{description} + + \begin{remark} + SGD with mini-batch converges to the same result obtained using a full-batch approach. 
+ \end{remark} + + \item[Momentum] \marginnote{Momentum} + Correct the update $v_t$ at time $t$ considering the update $v_{t-1}$ of time $t-1$. + \[ + \begin{split} + w_{t+1} &= w_t - v_t\\ + v_t &= \mu \nabla \mathcal{L}(w_t) + \alpha v_{t-1} + \end{split} + \] + + \begin{description} + \item[Nesterov momentum] \marginnote{Nesterov momentum} + Apply the momentum before computing the gradient. + \end{description} +\end{description} + + + +\section{Backpropagation} + +\begin{description} + \item[Chain rule] \marginnote{Chain rule} + Refer to \href{\gitSMM{}}{\texttt{SMM for AI (Section 5.1.1)}}. + + \item[Backpropagation] \marginnote{Backpropagation} + Algorithm to compute the gradient at each layer of a neural network. + + The output of the $i$-th neuron in the layer $l$ of a neural network can be defined as: + \[ a_{l,i} = \sigma_{l,i}( \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i} ) = \sigma_{l,i}(z_{l,i})\] + where: + \begin{itemize} + \item $a_{l,i} \in \mathbb{R}$ is the output of the neuron. + \item $\vec{w}_{l,i} \in \mathbb{R}^{n_{l-1}}$ is the vector of weights. + \item $\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$ is the vector of the outputs of the previous layer. + \item $b_{l,i} \in \mathbb{R}$ is the bias. + \item $\sigma_{l,i}: \mathbb{R} \rightarrow \mathbb{R}$ is the activation function\footnote{Even if it is possible to have a different activation function in each neuron, in practice, each layer has the same activation function.}. + \item $z_{l,i}(\vec{w}_{l,i}, b_{l,i} | \vec{a}_{l-1}) = \vec{w}_{l,i}^T \vec{a}_{l-1} + b_{l,i}$ is the argument of the activation function and is parametrized on $\vec{w}_{l,i}$ and $b_{l,i}$. 
+ \end{itemize} + + Hence, the outputs of the $l$-th layer can be defined as: + \[ \vec{a}_l = \sigma_l( \matr{W}_l \vec{a}_{l-1} + \vec{b}_l ) = \sigma_l( \vec{z}_l(\matr{W}_l, \vec{b}_l | \vec{a}_{l-1}) ) \] + where: + \begin{itemize} + \item $\sigma_l: \mathbb{R}^{n_l} \rightarrow \mathbb{R}^{n_l}$ is the element-wise activation function. + \item + $\matr{W}_l \in \mathbb{R}^{n_l \times n_{l-1}}$, + $\vec{a}_{l-1} \in \mathbb{R}^{n_{l-1}}$, + $\vec{b}_l \in \mathbb{R}^{n_l}$, + $\vec{a}_l \in \mathbb{R}^{n_l}$. + \end{itemize} + + Finally, a neural network with input $\vec{x}$ can be expressed as: + \[ + \begin{split} + \vec{a}_0 &= \vec{x} \\ + \vec{a}_i &= \sigma_i( \vec{z}_i(\matr{W}_i, \vec{b}_i | \vec{a}_{i-1}) ) + \end{split} + \] + + Given a neural network with $K$ layers and a loss function $\mathcal{L}$, + we want to compute the derivative of $\mathcal{L}$ w.r.t. the weights of each layer to tune the parameters. + + First, we highlight the parameters of each of the functions involved: + \begin{descriptionlist} + \item[Loss] $\mathcal{L}(\vec{a}_K) = \mathcal{L}(\sigma_K)$ takes as input the output of the network (i.e. the output of the last activation function). + \item[Activation function] $\sigma_i(\vec{z}_i)$ takes as input the value of the neurons at the $i$-th layer. + \item[Neurons] $\vec{z}_i(\matr{W}_i, \vec{b}_i)$ takes as input the weights and biases at the $i$-th layer. + \end{descriptionlist} + + Let $\odot$ be the Hadamard product. + By exploiting the chain rule, we can compute the derivatives w.r.t. 
the weights going backward: + \[ + \frac{\partial\mathcal{L}}{\partial\matr{W}_K} = + \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\matr{W}_K} = + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\mathcal{L}(\vec{a}_K)} \odot + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\sigma_K(\vec{z}_K)} \cdot + \underset{\mathbb{R}^{1 \times n_{K-1}}}{\vec{a}_{K-1}^T} + \in \mathbb{R}^{n_K \times n_{K-1}} + \] + \[ + \begin{split} + \frac{\partial\mathcal{L}}{\partial\matr{W}_{K-1}} &= + \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}} + \frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\matr{W}_{K-1}} \\ + &= \bigl( ( + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\mathcal{L}(\vec{a}_K)} \odot + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\sigma_K(\vec{z}_K)} + )^T \cdot + \underset{\mathbb{R}^{n_K \times n_{K-1}}}{\matr{W}_K} \bigr)^T \odot + \underset{\mathbb{R}^{n_{K-1} \times 1}}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot + \underset{\mathbb{R}^{1 \times n_{K-2}}}{\vec{a}_{K-2}^T} + \in \mathbb{R}^{n_{K-1} \times n_{K-2}} + \end{split} + \] + \[ \vdots \] + In the same way, we can compute the derivatives w.r.t. 
the biases: + \[ + \frac{\partial\mathcal{L}}{\partial\vec{b}_K} = + \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} \frac{\partial\vec{z}_K}{\partial\vec{b}_K} = + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\mathcal{L}(\vec{a}_K)} \odot + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\sigma_K(\vec{z}_K)} \cdot + 1 + \in \mathbb{R}^{n_K} + \] + \[ + \begin{split} + \frac{\partial\mathcal{L}}{\partial\vec{b}_{K-1}} &= + \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_{K}} \frac{\partial\vec{z}_{K}}{\partial\sigma_{K-1}} + \frac{\partial\sigma_{K-1}}{\partial\vec{z}_{K-1}} \frac{\partial\vec{z}_{K-1}}{\partial\vec{b}_{K-1}} \\ + &= \bigl( ( + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\mathcal{L}(\vec{a}_K)} \odot + \underset{\mathbb{R}^{n_K \times 1}}{\nabla\sigma_K(\vec{z}_K)} + )^T \cdot + \underset{\mathbb{R}^{n_K \times n_{K-1}}}{\matr{W}_K} \bigr)^T \odot + \underset{\mathbb{R}^{n_{K-1} \times 1}}{\nabla\sigma_{K-1}(\vec{z}_{K-1})} \cdot + 1 + \in \mathbb{R}^{n_{K-1}} + \end{split} + \] + \[ \vdots \] + + It can be noticed that many terms are repeated from one layer to another. 
+ By exploiting this, we can store the following intermediate values: + \[ + \begin{split} + \delta_K &= \frac{\partial\mathcal{L}}{\partial\vec{z}_K} = \frac{\partial\mathcal{L}}{\partial\sigma_K} \frac{\partial\sigma_K}{\partial\vec{z}_K} = + \nabla\mathcal{L}(\vec{a}_K) \odot \nabla\sigma_K(\vec{z}_K) \\ + \delta_l &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l} = ( \delta_{l+1}^T \cdot \matr{W}_{l+1} )^T \odot \nabla\sigma_l(\vec{z}_l) + \end{split} + \] + and reuse them to compute the derivatives as follows: + \[ + \begin{split} + \frac{\partial\mathcal{L}}{\partial\matr{W}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\matr{W}_l} = + \delta_l \cdot \vec{a}_{l-1}^T \\ + \frac{\partial\mathcal{L}}{\partial\vec{b}_l} &= \frac{\partial\mathcal{L}}{\partial\vec{z}_l}\frac{\partial\vec{z}_l}{\partial\vec{b}_l} = + \delta_l \cdot 1 + \end{split} + \] +\end{description} + + +\begin{description} + \item[Vanishing gradient] \marginnote{Vanishing gradient} + As backpropagation consists of a chain of products, + when a component is small (i.e. $< 1$), it will gradually cancel out the gradient when backtracking, + causing the first layers to learn much slower than the last layers. + + \begin{remark} + This is an issue of the sigmoid function. + ReLU was designed to solve this problem. + \end{remark} +\end{description} \ No newline at end of file