mirror of https://github.com/NotXia/unibo-ai-notes.git

Fix typos <noupdate>
@@ -292,7 +292,7 @@ Network with bottleneck-block-inspired inception modules.
Global average pooling to obtain a channel-wise vector.

\item[Excitation]
-Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them.
+Feed-forward network that first compresses the input channels by a ratio $r$ (typically $16$) and then restores them. A final sigmoid gives the channel weights.
\end{descriptionlist}
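
A minimal PyTorch-style sketch of such a squeeze-and-excitation block (class, layer, and variable names are illustrative, not taken from the notes):

import torch
import torch.nn as nn

class SqueezeExcitation(nn.Module):
    """Squeeze: global average pooling. Excitation: compress channels by r, restore them, sigmoid."""
    def __init__(self, channels: int, r: int = 16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // r)   # compress by ratio r
        self.fc2 = nn.Linear(channels // r, channels)   # restore the original number of channels

    def forward(self, x: torch.Tensor) -> torch.Tensor:         # x: (B, C, H, W)
        s = x.mean(dim=(2, 3))                                   # squeeze: channel-wise vector (B, C)
        w = torch.sigmoid(self.fc2(torch.relu(self.fc1(s))))     # excitation: channel weights in [0, 1]
        return x * w[:, :, None, None]                           # re-weight each channel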
@@ -341,7 +341,7 @@ Network with bottleneck-block-inspired inception modules.

\begin{description}
\item[Inverted residual block] \marginnote{Inverted residual block}
-Modified bottleneck block defined as follows:
+Modified bottleneck block composed of:
\begin{enumerate}
\item A $1 \times 1$ convolution to expand the input channels by a factor of $t$.
\item A $3 \times 3$ depth-wise convolution.
@@ -370,25 +370,25 @@ Network with bottleneck-block-inspired inception modules.

\begin{table}[H]
\centering
-\caption{\parbox[t]{0.6\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
+\caption{\parbox[t]{0.77\linewidth}{Architecture of MobileNetV2 with expansion factor ($t$), number of channels ($c$), number of times a block is repeated ($n$), and stride ($s$).}}
\small
\begin{tabular}{cccccc}
\toprule
\textbf{Input} & \textbf{Operator} & $t$ & $c$ & $n$ & $s$ \\
\midrule
-$2242 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
+$224^2 \times 3$ & \texttt{conv2d} & - & 32 & 1 & 2 \\
\midrule
-$1122 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
-$1122 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
-$562 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
-$282 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
-$142 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
-$142 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
-$72 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
+$112^2 \times 32$ & \texttt{bottleneck} & 1 & 16 & 1 & 1 \\
+$112^2 \times 16$ & \texttt{bottleneck} & 6 & 24 & 2 & 2 \\
+$56^2 \times 24$ & \texttt{bottleneck} & 6 & 32 & 3 & 2 \\
+$28^2 \times 32$ & \texttt{bottleneck} & 6 & 64 & 4 & 2 \\
+$14^2 \times 64$ & \texttt{bottleneck} & 6 & 96 & 3 & 1 \\
+$14^2 \times 96$ & \texttt{bottleneck} & 6 & 160 & 3 & 2 \\
+$7^2 \times 160$ & \texttt{bottleneck} & 6 & 320 & 1 & 1 \\
\midrule
-$72 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
+$7^2 \times 320$ & \texttt{conv2d $1\times1$} & - & 1280 & 1 & 1 \\
\midrule
-$72 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
+$7^2 \times 1280$ & \texttt{avgpool $7\times7$} & - & - & 1 & - \\
$1 \times 1 \times 1280$ & \texttt{conv2d $1\times1$} & - & k & - & 1 \\
\bottomrule
\end{tabular}
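
A rough PyTorch-style sketch of the inverted residual (bottleneck) block used in the table above, assuming the usual MobileNetV2 choices (batch normalization, ReLU6, linear $1 \times 1$ projection, skip connection only when input and output shapes match); names are illustrative:

import torch
import torch.nn as nn

class InvertedResidual(nn.Module):
    """1x1 expansion by t -> 3x3 depth-wise convolution -> 1x1 linear projection."""
    def __init__(self, c_in: int, c_out: int, t: int = 6, stride: int = 1):
        super().__init__()
        c_mid = c_in * t
        self.block = nn.Sequential(
            nn.Conv2d(c_in, c_mid, kernel_size=1, bias=False),            # expand channels by t
            nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_mid, kernel_size=3, stride=stride,
                      padding=1, groups=c_mid, bias=False),               # 3x3 depth-wise convolution
            nn.BatchNorm2d(c_mid), nn.ReLU6(inplace=True),
            nn.Conv2d(c_mid, c_out, kernel_size=1, bias=False),           # linear 1x1 projection
            nn.BatchNorm2d(c_out),
        )
        self.use_skip = (stride == 1 and c_in == c_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.block(x)
        return x + out if self.use_skip else out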
@@ -401,7 +401,7 @@ Network with bottleneck-block-inspired inception modules.

\begin{description}
\item[Single dimension scaling]
-Scaling a baseline model by width, depth, or resolution. It generally always improve the accuracy.
+Scaling a baseline model by width, depth, or resolution generally improves the accuracy.

\begin{description}
\item[Width scaling] \marginnote{Width scaling}
@@ -443,7 +443,7 @@ Network with bottleneck-block-inspired inception modules.
In practice, $\alpha$, $\beta$, and $\gamma$ are determined through grid search.

\begin{remark}
-The constraint is formulated in this way as FLOPS scales linearly with depth but quadratically with width and resolution.
+The constraint is formulated in this way as FLOPS scales linearly by depth but quadratically by width and resolution.
\end{remark}
\end{description}
\end{description}
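
A small numeric illustration of the remark: since FLOPS grow linearly with depth and quadratically with width and resolution, scaling a baseline by $(\alpha^\phi, \beta^\phi, \gamma^\phi)$ multiplies its FLOPS by roughly $(\alpha \beta^2 \gamma^2)^\phi$. The coefficient values below are the ones reported by the EfficientNet grid search and are assumed here only for illustration:

def flops_multiplier(alpha: float, beta: float, gamma: float, phi: int) -> float:
    # FLOPS scale linearly with depth (alpha) and quadratically with width (beta) and resolution (gamma).
    return (alpha * beta ** 2 * gamma ** 2) ** phi

alpha, beta, gamma = 1.2, 1.1, 1.15                  # assumed grid-search values
print(alpha * beta ** 2 * gamma ** 2)                # ~1.92, close to the constraint value of 2
print(flops_multiplier(alpha, beta, gamma, phi=3))   # ~7.1, i.e., roughly 2^3 times the baseline FLOPS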
@@ -105,7 +105,7 @@ Methods that also consider the second-order derivatives when determining the ste
\]
where $\mu \in [0, 1[$ is the momentum coefficient.

-In other words, $v^{(t+1)}$ represents a weighted average of the updates steps done up until time $t$.
+In other words, $v^{(t+1)}$ represents a weighted average of the update steps done up until time $t$.

\begin{remark}
Momentum helps to counteract a poor conditioning of the Hessian matrix when working with canyons.
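
A minimal NumPy sketch of one momentum update in this spirit (one common sign convention; the exact formulation used in the notes may differ slightly):

import numpy as np

def momentum_step(theta, v, grad, lr=0.01, mu=0.9):
    v = mu * v - lr * grad(theta)        # weighted average of past update steps plus the current one
    return theta + v, v

grad = lambda th: 2 * th                 # gradient of f(theta) = theta^2
theta, v = np.array([5.0]), np.zeros(1)
for _ in range(100):
    theta, v = momentum_step(theta, v, grad)   # converges towards the minimum at 0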
@@ -134,7 +134,7 @@ Methods that also consider the second-order derivatives when determining the ste
\]

\begin{remark}
-The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as $\matr{\theta}^{(t)}$ has been partially updated.
+The key idea is that, once $\mu v^{(t)}$ is summed to $\matr{\theta}^{(t)}$, the gradient computed at $\matr{\theta}^{(t)}$ is obsolete as it has been partially updated.
\end{remark}

\begin{remark}
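
A sketch of the corresponding Nesterov-style update, where the gradient is evaluated at the look-ahead point $\matr{\theta}^{(t)} + \mu v^{(t)}$ (again one common formulation, assumed here for illustration):

def nesterov_step(theta, v, grad, lr=0.01, mu=0.9):
    v = mu * v - lr * grad(theta + mu * v)   # gradient at the partially updated (look-ahead) parameters
    return theta + v, v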
@@ -228,7 +228,7 @@ Methods that also consider the second-order derivatives when determining the ste
where $\beta \in [0, 1]$ (typically $0.9$ or higher) makes $s^{(t)}$ an exponential moving average.

\begin{remark}
-RMSProp is faster than SGD at the beginning and slows down reaching similar performances as SGD.
+RMSProp is faster than SGD at the beginning before slowing down and reaching performance similar to that of SGD.
\end{remark}

\begin{figure}[H]
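
A minimal NumPy sketch of an RMSProp step using the exponential moving average $s^{(t)}$ described above (hyperparameter values are illustrative):

import numpy as np

def rmsprop_step(theta, s, grad, lr=0.001, beta=0.9, eps=1e-8):
    g = grad(theta)
    s = beta * s + (1 - beta) * g ** 2               # moving average of squared gradients
    return theta - lr * g / (np.sqrt(s) + eps), s    # step rescaled component-wise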
@@ -254,11 +254,11 @@ Methods that also consider the second-order derivatives when determining the ste

Moreover, as $\vec{g}^{(0)} = 0$, $\vec{s}^{(0)} = 0$, and $\beta_1$, $\beta_2$ are typically large (i.e., past history weighs more), Adam starts by taking small steps (e.g., $\vec{g}^{(1)} = (1-\beta_1) \nabla\mathcal{L}(\vec{\theta}^{(0)})$ is simply rescaling the gradient for no reason). To cope with this, a debiased formulation of $\vec{g}$ and $\vec{s}$ is used:
\[
-\vec{g}^{(t)}_{\text{debiased}} = \frac{g^{(t+1)}}{1-\beta_1^{t+1}}
+\vec{g}^{(t)}_{\text{debiased}} = \frac{\vec{g}^{(t+1)}}{1-\beta_1^{t+1}}
\qquad
-\vec{s}^{(t)}_{\text{debiased}} = \frac{s^{(t+1)}}{1-\beta_2^{t+1}}
+\vec{s}^{(t)}_{\text{debiased}} = \frac{\vec{s}^{(t+1)}}{1-\beta_2^{t+1}}
\]
-where the denominator $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.
+where the denominators $(1-\beta_i^{t+1}) \rightarrow 1$ for increasing values of $t$.

Finally, the update is defined as:
\[
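
A NumPy sketch of a full Adam step with the debiasing above; the last line applies the standard Adam update rule, which is assumed here since the hunk cuts off before the notes' own definition:

import numpy as np

def adam_step(theta, g_avg, s_avg, t, grad, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    g = grad(theta)
    g_avg = beta1 * g_avg + (1 - beta1) * g          # momentum-like average of gradients
    s_avg = beta2 * s_avg + (1 - beta2) * g ** 2     # RMSProp-like average of squared gradients
    g_hat = g_avg / (1 - beta1 ** (t + 1))           # debiased: counteracts the zero initialization
    s_hat = s_avg / (1 - beta2 ** (t + 1))
    return theta - lr * g_hat / (np.sqrt(s_hat) + eps), g_avg, s_avg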
@@ -296,7 +296,7 @@ Methods that also consider the second-order derivatives when determining the ste
\end{remark}

\begin{remark}
-Momentum based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape" small basins.
+Momentum based approaches tend to prefer large basins. Intuitively, by accumulating momentum, it is possible to ``escape'' smaller ones.

\begin{figure}[H]
\centering
@@ -70,7 +70,7 @@
With the projection $\matr{W}_K \in \mathbb{R}^{d_Y \times d_K}$ such that $\mathbb{R}^{M \times d_K} \ni \matr{K} = \matr{Y} \matr{W}_K$, where $d_K$ is the dimension of the keys.

\item[Query]
-With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \matr{Y} \matr{W}_X$, where $d_X$ is the length of $\vec{x}_1$ that is no longer required to be $d_Y$ as there is a projection.
+With the projection $\matr{W}_Q \in \mathbb{R}^{d_X \times d_K}$ such that $\mathbb{R}^{d_K} \ni \vec{q}_1 = \vec{x}_1 \matr{W}_Q$, where $d_X$ is the length of $\vec{x}_1$ that is no longer required to be $d_Y$ as there is a projection.

\item[Values]
With the projection $\matr{W}_V \in \mathbb{R}^{d_Y \times d_V}$ such that $\mathbb{R}^{M \times d_V} \ni \matr{V} = \matr{Y} \matr{W}_V$, where $d_V$ is the dimension of the values.
@@ -88,7 +88,7 @@
% \caption{Steps of scaled dot-product attention}
\end{figure}

-Finally, due to the linear projections, instead of a single vector there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.
+Finally, due to the linear projections, instead of a single vector, there can be an arbitrary number $N$ of inputs $\matr{X} \in \mathbb{R}^{N \times d_X}$ to compute the queries $\mathbb{R}^{N \times d_K} \ni \matr{Q} = \matr{X} \matr{W}_Q$. This change affects the similarity scores $\matr{Q}\matr{K}^T \in \mathbb{R}^{N \times M}$ and the output activations $\matr{A} \in \mathbb{R}^{N \times d_V}$.

The overall attention mechanism can be defined as:
\[ \matr{A} = \texttt{softmax}_\texttt{row-wise}\left( \frac{\matr{Q}\matr{K}^T}{\sqrt{d_K}} \right) \matr{V} \]
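
A compact NumPy sketch of this mechanism, with $\matr{Q} = \matr{X}\matr{W}_Q$, $\matr{K} = \matr{Y}\matr{W}_K$, and $\matr{V} = \matr{Y}\matr{W}_V$ (function and argument names are illustrative):

import numpy as np

def attention(X, Y, W_Q, W_K, W_V):
    Q, K, V = X @ W_Q, Y @ W_K, Y @ W_V              # (N, d_K), (M, d_K), (M, d_V)
    scores = Q @ K.T / np.sqrt(K.shape[-1])          # similarity scores (N, M)
    scores -= scores.max(axis=-1, keepdims=True)     # for numerical stability
    weights = np.exp(scores)
    weights /= weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ V                               # output activations (N, d_V)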
@@ -166,7 +166,7 @@
\end{remark}

\begin{remark}
-Layer normalization is easier to distribute on multiple computation units and has the same behavior at both train and inference time.
+Layer normalization is easier to distribute on multiple computation units and has the same behavior at both training and inference time.
\end{remark}

\begin{figure}[H]
@@ -176,7 +176,7 @@
\end{figure}

\item[Feed-forward network (\texttt{FFN})] \marginnote{Feed-forward network}
-MLP with one hidden layer applied to each token independently. ReLU or one of its variants are used as activation function:
+MLP with one hidden layer applied to each token independently. ReLU or one of its variants is used as the activation function:
\[ \texttt{FFN}(\vec{x}) = \texttt{relu}(\vec{x}\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \]

\begin{remark}
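
A minimal PyTorch-style sketch of such a position-wise feed-forward network (dimension names are illustrative):

import torch
import torch.nn as nn

class FFN(nn.Module):
    """FFN(x) = relu(x W1 + b1) W2 + b2, applied to each token independently."""
    def __init__(self, d_model: int, d_hidden: int):
        super().__init__()
        self.lin1 = nn.Linear(d_model, d_hidden)
        self.lin2 = nn.Linear(d_hidden, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (..., tokens, d_model)
        return self.lin2(torch.relu(self.lin1(x)))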
@@ -301,7 +301,7 @@

\begin{description}
\item[Masked self-attention] \marginnote{Masked self-attention}
-Modification to self-attention to prevent tokens to attend at future positions (i.e., at their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., make the attention weights a triangular matrix).
+Modification to self-attention to prevent tokens from attending to future positions (i.e., at their right). This can be done by either setting the similarity scores with future tokens to $-\infty$ or directly setting the corresponding attention weights to $0$ (i.e., making the attention weights a triangular matrix).

\begin{figure}[H]
\centering
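
A small NumPy sketch of the first option: setting the similarity scores of future positions to $-\infty$, so that the row-wise softmax assigns them zero weight and the attention weights become a (lower) triangular matrix:

import numpy as np

def causal_mask(scores):
    # scores: (N, N) self-attention similarity scores.
    future = np.triu(np.ones(scores.shape, dtype=bool), k=1)   # positions to the right of each token
    return np.where(future, -np.inf, scores)                   # softmax later maps -inf to weight 0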
@@ -318,7 +318,7 @@

\begin{figure}[H]
\centering
-\includegraphics[width=0.8\linewidth]{./img/_self_attention_permutation.jpg}
+\includegraphics[width=0.7\linewidth]{./img/_self_attention_permutation.jpg}
\end{figure}
\end{remark}
@@ -372,7 +372,7 @@
\end{remark}

\begin{remark}
-Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them together is not strictly necessary.
+Compared to text, image pixels are more redundant and less semantically rich. Therefore, processing all of them individually is not strictly necessary.
\end{remark}

\begin{description}