        Pooling layer with kernel size and stride chosen in such a way that consecutive windows overlap,
        i.e. some pixels considered at a given step have already been considered at the previous one (e.g. $3 \times 3$ kernel with stride $2$).
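        \begin{example}
            On a $55 \times 55$ input (e.g. the output of AlexNet's first convolution), a $3 \times 3$ pooling with stride $2$ produces an output of side
            $\left\lfloor \frac{55 - 3}{2} \right\rfloor + 1 = 27$, and adjacent windows share one row/column of pixels.
        \end{example}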

    \item[$1 \times 1$ convolution] \marginnote{$1 \times 1$ convolution}
        Convolution used to change the depth of the activation while maintaining its spatial dimensions.
        It can be seen as a linear fully connected layer applied at each spatial location.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.55\linewidth]{./img/_1conv.pdf}
        \end{figure}

        \begin{remark}
            Stacking multiple $1 \times 1$ convolutions (with non-linearities in between) is equivalent to applying a multi-layer perceptron
            (i.e. a universal function approximator) at each spatial location.
        \end{remark}
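
        \begin{example}
            For instance, a $1 \times 1$ convolution mapping a $13 \times 13 \times 384$ activation to $64$ channels uses
            $384 \cdot 64 + 64 = 24640$ parameters (weights plus biases) and leaves the $13 \times 13$ spatial grid untouched:
            it is the same computation as a fully connected layer with $384$ inputs and $64$ outputs applied independently at each of the $13 \cdot 13$ positions.
        \end{example}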

    \item[Parameters computation] \marginnote{Parameters computation}
        \phantom{}
        \begin{description}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\linewidth]{./img/lenet5.png}
    \caption{LeNet-5 architecture}
\end{figure}


    \item 3 feed-forward layers.
\end{itemize}

\begin{remark}
    Some layers are normalized using local response normalization: neurons with larger activations are enhanced, while neighboring neurons (across channels) are inhibited.
\end{remark}
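
For reference, the normalization proposed in the AlexNet paper acts across adjacent channels at the same spatial position:
\[
    b^{i}_{x,y} = a^{i}_{x,y} \Bigg/ \left( k + \alpha \sum_{j=\max(0,\, i - n/2)}^{\min(N-1,\, i + n/2)} \left( a^{j}_{x,y} \right)^2 \right)^{\beta}
\]
where $a^{i}_{x,y}$ is the activation of channel $i$ at position $(x, y)$, $N$ is the number of channels, and $k, n, \alpha, \beta$ are hyper-parameters (the paper uses $k = 2$, $n = 5$, $\alpha = 10^{-4}$, $\beta = 0.75$).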

\begin{figure}[H]
    \centering
    \includegraphics[width=0.75\linewidth]{./img/alexnet.png}
    \caption{AlexNet architecture}
\end{figure}


\end{description}

\begin{remark}
    At the time, training took 5-6 days on two NVIDIA GTX 580 GPUs.
\end{remark}

\begin{table}[H]
    \centering
    \caption{Parameters of AlexNet (batch size of 128)}
    \scriptsize
    \begin{tabular}{cccccccccccc}
        \toprule
        \multirow{2}[20]{*}{\textbf{Layer}}
        & \multicolumn{4}{c}{\textbf{Convolution}}
        & \multicolumn{3}{c}{\textbf{Single activation}}
        & \multicolumn{2}{c}{\textbf{Batch requirements}}
        & \multicolumn{2}{c}{\textbf{Parameters}} \\
        \cmidrule(lr){2-5} \cmidrule(lr){6-8} \cmidrule(lr){9-10} \cmidrule(lr){11-12}
        & \rot{\textbf{Channels}} & \rot{\textbf{H/W}} & \rot{\textbf{Stride}} & \rot{\textbf{Padding}}
        & \rot{\textbf{H/W}} & \rot{\textbf{Channels}} & \rot{\texttt{\#activations}}
        & \rot{\textbf{MFLOPs}} & \rot{\textbf{Activ. mem.}} & \rot{\textbf{Amount}} & \rot{\textbf{Memory}} \\
        \midrule
        \texttt{input} & -- & -- & -- & -- & \num{227} & \num{3} & \num{154587} & -- & \num{75.5} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv1} & \num{96} & \num{11} & \num{4} & \num{0} & \num{55} & \num{96} & \num{290400} & \num{26986.3} & \num{283.6} {\tiny MB} & \num{35} {\tiny K} & \num{0.4} {\tiny MB} \\
        \texttt{pool1} & \num{1} & \num{3} & \num{2} & \num{0} & \num{27} & \num{96} & \num{69984} & \num{80.6} & \num{68.3} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv2} & \num{256} & \num{5} & \num{1} & \num{2} & \num{27} & \num{256} & \num{186624} & \num{114661.8} & \num{182.3} {\tiny MB} & \num{615} {\tiny K} & \num{7.0} {\tiny MB} \\
        \texttt{pool2} & \num{1} & \num{3} & \num{2} & \num{0} & \num{13} & \num{256} & \num{43264} & \num{49.8} & \num{42.3} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv3} & \num{384} & \num{3} & \num{1} & \num{1} & \num{13} & \num{384} & \num{64896} & \num{38277.2} & \num{63.4} {\tiny MB} & \num{885} {\tiny K} & \num{10.1} {\tiny MB} \\
        \cmidrule(lr){1-12}
        \texttt{conv4} & \num{384} & \num{3} & \num{1} & \num{1} & \num{13} & \num{384} & \num{64896} & \num{57415.8} & \num{63.4} {\tiny MB} & \num{1327} {\tiny K} & \num{15.2} {\tiny MB} \\
        \cmidrule(lr){1-12}
        \texttt{conv5} & \num{256} & \num{3} & \num{1} & \num{1} & \num{13} & \num{256} & \num{43264} & \num{38277.2} & \num{42.3} {\tiny MB} & \num{885} {\tiny K} & \num{10.1} {\tiny MB} \\
        \texttt{pool3} & \num{1} & \num{3} & \num{2} & \num{0} & \num{6} & \num{256} & \num{9216} & \num{10.6} & \num{9.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{flatten} & \num{0} & \num{0} & \num{0} & \num{0} & \num{1} & \num{9216} & \num{9216} & \num{0.0} & \num{0.0} & \num{0} & \num{0.0} \\
        \texttt{fc6} & \num{4096} & \num{1} & \num{1} & \num{0} & \num{1} & \num{4096} & \num{4096} & \num{9663.7} & \num{4.0} {\tiny MB} & \num{37758} {\tiny K} & \num{432.0} {\tiny MB} \\
        \texttt{fc7} & \num{4096} & \num{1} & \num{1} & \num{0} & \num{1} & \num{4096} & \num{4096} & \num{4295.0} & \num{4.0} {\tiny MB} & \num{16781} {\tiny K} & \num{192.0} {\tiny MB} \\
        \texttt{fc8} & \num{1000} & \num{1} & \num{1} & \num{0} & \num{1} & \num{1000} & \num{1000} & \num{1048.6} & \num{1.0} {\tiny MB} & \num{4097} {\tiny K} & \num{46.9} {\tiny MB} \\
        \midrule
        &&&&&&& \textbf{Total} & \num{290851} & \num{1406} {\tiny MB} & \num{62378} {\tiny K} & \num{714} {\tiny MB} \\
        \bottomrule
    \end{tabular}
\end{table}
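
As a sanity check, the \texttt{conv1} row can be derived by hand (the table appears to count two FLOPs per multiply-accumulate):
\begin{example}
    \texttt{conv1} applies $96$ kernels of shape $11 \times 11 \times 3$ with stride $4$ and no padding to a $227 \times 227 \times 3$ input:
    \begin{itemize}
        \item Output side: $\left\lfloor \frac{227 - 11}{4} \right\rfloor + 1 = 55$, hence $55 \cdot 55 \cdot 96 = 290400$ activations.
        \item Parameters: $(11 \cdot 11 \cdot 3 + 1) \cdot 96 = 34944 \approx 35$ K.
        \item FLOPs: $2 \cdot (11 \cdot 11 \cdot 3) \cdot 290400 \approx 211$ MFLOPs per image, i.e. $\approx 26986$ MFLOPs for a batch of $128$.
    \end{itemize}
\end{example}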


\section{ZFNet/Clarifai}
\marginnote{ZFNet/Clarifai}

The aggressive stem layer of AlexNet (large kernel and stride) causes dead neurons that do not specialize in recognizing anything.

Ablation and visualization studies found that the first stem layer works better if split into two layers, with a $7 \times 7$ and a $5 \times 5$ kernel respectively, both with stride $2$.

\begin{figure}[H]
    \centering
    \includegraphics[width=0.8\linewidth]{./img/_zfnet.pdf}
    \caption{ZFNet architecture}
\end{figure}

\begin{figure}[H]
    \centering
    \begin{subfigure}{0.4\linewidth}
        \centering
        \includegraphics[width=0.55\linewidth]{./img/alexnet_stem.png}
        \caption{AlexNet}
    \end{subfigure}
    \begin{subfigure}{0.4\linewidth}
        \centering
        \includegraphics[width=0.55\linewidth]{./img/zfnet_stem.png}
        \caption{ZFNet}
    \end{subfigure}
    \caption{First layer activations comparison}
\end{figure}


\section{VGG}
\marginnote{VGG}


\subsection{Architecture}

A network with greater depth built from smaller components.
The authors constrained the layers to:
\begin{itemize}
    \item $3 \times 3$ convolutions with stride $1$ and padding $1$.
    \item $2 \times 2$ max-pooling with stride $2$ and padding $0$.
    \item Number of channels that doubles after each pool.
\end{itemize}
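
\begin{remark}
    One way to read these constraints: the convolutions preserve the spatial resolution and only the poolings halve it, while the number of channels doubles at the same time.
    As a consequence, a $3 \times 3$ convolution with $C$ input and $C$ output channels on an $H \times W$ activation costs a number of FLOPs proportional to $9 C^2 H W$, a quantity that is unchanged when $C$ doubles while $H$ and $W$ halve:
    \[
        9 \cdot (2C)^2 \cdot \frac{H}{2} \cdot \frac{W}{2} = 9 C^2 H W
    \]
    (e.g. \texttt{conv2}, \texttt{conv4}, \texttt{conv6} and \texttt{conv9} in the table of the next subsection all require the same MFLOPs).
\end{remark}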

\begin{remark}
    It has been found that deeper networks work better.
\end{remark}

\begin{description}
    \item[Stage] \marginnote{Stage}
        Fixed combination of layers that processes inputs of the same spatial resolution.

        VGG stages are:
        \begin{itemize}
            \item $\texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{pool}$.
            \item $\texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{pool}$.
            \item $\texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{conv} \mapsto \texttt{pool}$.
        \end{itemize}

        \begin{remark}
            One stage has the same receptive field as a single larger convolution (each stacked $3 \times 3$ convolution with stride $1$ enlarges the receptive field by $2$, so two of them cover a $5 \times 5$ patch) but
            has fewer parameters, requires less computation and adds more non-linearity.

            On the other hand, multiple activation maps are computed and all of them need to be stored for backpropagation.

            \begin{example}
                \phantom{}
                \begin{center}
                    \footnotesize
                    \begin{tabular}{cccc}
                        \toprule
                        \textbf{Convolutional layer} & \textbf{Parameters} & \textbf{FLOPs} & \textbf{Activations} \\
                        \midrule
                        $C \times C \times 5 \times 5$, $S=1$, $P=2$ & $25C^2 + C$ & $50C^2 \cdot W_\text{in} \cdot H_\text{in}$ & $C \cdot W_\text{in} \cdot H_\text{in}$ \\
                        Two stacked $C \times C \times 3 \times 3$, $S=1$, $P=1$ & $18C^2 + 2C$ & $36C^2 \cdot W_\text{in} \cdot H_\text{in}$ & $2 \cdot C \cdot W_\text{in} \cdot H_\text{in}$ \\
                        \bottomrule
                    \end{tabular}
                \end{center}
            \end{example}
        \end{remark}
\end{description}

\begin{remark}
    Local response normalization was experimented with and then dropped.
    As batch normalization had not been invented yet, deeper configurations were initialized using the weights of trained shallower ones.
\end{remark}

\begin{table}[H]
    \centering
    \caption{Architecture of various versions of VGG}
    \scriptsize
    \begin{tabular}{c|c|c|c|c|c}
        \toprule
        \makecell{\textbf{A}\\\tiny(11 weight layers)} &
        \makecell{\textbf{B}\\\tiny(11 weight layers)} &
        \makecell{\textbf{C}\\\tiny(13 weight layers)} &
        \makecell{\textbf{D}\\\tiny(16 weight layers)} &
        \makecell{\textbf{VGG-16}\\\tiny(16 weight layers)} &
        \makecell{\textbf{VGG-19}\\\tiny(19 weight layers)} \\
        \bottomrule
        \toprule
        \multicolumn{6}{c}{Input ($224 \times 224$ RGB image)} \\
        \midrule
        conv3-64 & conv3-64 & conv3-64 & conv3-64 & conv3-64 & conv3-64 \\
        & LRN & conv3-64 & conv3-64 & conv3-64 & conv3-64 \\
        \midrule
        \multicolumn{6}{c}{max-pool} \\
        \midrule
        conv3-128 & conv3-128 & conv3-128 & conv3-128 & conv3-128 & conv3-128 \\
        & & conv3-128 & conv3-128 & conv3-128 & conv3-128 \\
        \midrule
        \multicolumn{6}{c}{max-pool} \\
        \midrule
        conv3-256 & conv3-256 & conv3-256 & conv3-256 & conv3-256 & conv3-256 \\
        conv3-256 & conv3-256 & conv3-256 & conv3-256 & conv3-256 & conv3-256 \\
        & & & conv1-256 & conv3-256 & conv3-256 \\
        & & & & & conv3-256 \\
        \midrule
        \multicolumn{6}{c}{max-pool} \\
        \midrule
        conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 \\
        conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 \\
        & & & conv1-512 & conv3-512 & conv3-512 \\
        & & & & & conv3-512 \\
        \midrule
        \multicolumn{6}{c}{max-pool} \\
        \midrule
        conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 \\
        conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 & conv3-512 \\
        & & & conv1-512 & conv3-512 & conv3-512 \\
        & & & & & conv3-512 \\
        \midrule
        \multicolumn{6}{c}{max-pool} \\
        \midrule
        \multicolumn{6}{c}{FC-4096} \\
        \multicolumn{6}{c}{FC-4096} \\
        \multicolumn{6}{c}{FC-1000} \\
        \multicolumn{6}{c}{\texttt{softmax}} \\
        \bottomrule
    \end{tabular}
\end{table}


\subsection{Properties}

VGG-16 has the following trends:
\begin{itemize}
    \item Most of the parameters are concentrated in the fully connected layers.
    \item Most of the computation is required by the convolutions.
    \item Most of the memory is required to store the activations: as there are no stem layers, the early layers work at full resolution.
    \item Training was done on 4 GPUs with data parallelism and took 2-3 weeks.
\end{itemize}
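
\begin{example}
    Reading from the table below: the three fully connected layers hold about $124$ M of the $\sim 138$ M parameters ($\approx 90\%$) while accounting for less than $1\%$ of the FLOPs, whereas \texttt{conv1} and \texttt{conv2} alone produce about $6.3$ GB of activations per batch.
\end{example}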

\begin{table}[H]
    \centering
    \caption{Parameters of VGG-16 (batch size of 128)}
    \scriptsize
    \begin{tabular}{cccccccccccc}
        \toprule
        \multirow{2}[20]{*}{\textbf{Layer}}
        & \multicolumn{4}{c}{\textbf{Convolution}}
        & \multicolumn{3}{c}{\textbf{Single activation}}
        & \multicolumn{2}{c}{\textbf{Batch requirements}}
        & \multicolumn{2}{c}{\textbf{Parameters}} \\
        \cmidrule(lr){2-5} \cmidrule(lr){6-8} \cmidrule(lr){9-10} \cmidrule(lr){11-12}
        & \rot{\textbf{Channels}} & \rot{\textbf{H/W}} & \rot{\textbf{Stride}} & \rot{\textbf{Padding}}
        & \rot{\textbf{H/W}} & \rot{\textbf{Channels}} & \rot{\texttt{\#activations}}
        & \rot{\textbf{MFLOPs}} & \rot{\textbf{Activ. mem.}} & \rot{\textbf{Amount}} & \rot{\textbf{Memory}} \\
        \midrule
        \texttt{input} & -- & -- & -- & -- & 224 & \num{3} & \num{150528} & -- & \num{73.5} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv1} & 64 & 3 & 1 & 1 & 224 & \num{64} & \num{3211264} & \num{22196.3} & \num{3136.0} {\tiny MB} & \num{2} {\tiny K} & \num{0.0} \\
        \texttt{conv2} & 64 & 3 & 1 & 1 & 224 & \num{64} & \num{3211264} & \num{473520.1} & \num{3136.0} {\tiny MB} & \num{37} {\tiny K} & \num{0.4} {\tiny MB} \\
        \texttt{pool1} & 1 & 2 & 2 & 0 & 112 & \num{64} & \num{802816} & \num{411.0} & \num{784.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv3} & 128 & 3 & 1 & 1 & 112 & \num{128} & \num{1605632} & \num{236760.1} & \num{1568.0} {\tiny MB} & \num{74} {\tiny K} & \num{0.8} {\tiny MB} \\
        \texttt{conv4} & 128 & 3 & 1 & 1 & 112 & \num{128} & \num{1605632} & \num{473520.1} & \num{1568.0} {\tiny MB} & \num{148} {\tiny K} & \num{1.7} {\tiny MB} \\
        \texttt{pool2} & 1 & 2 & 2 & 0 & 56 & \num{128} & \num{401408} & \num{205.5} & \num{392.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv5} & 256 & 3 & 1 & 1 & 56 & \num{256} & \num{802816} & \num{236760.1} & \num{784.0} {\tiny MB} & \num{295} {\tiny K} & \num{3.4} {\tiny MB} \\
        \texttt{conv6} & 256 & 3 & 1 & 1 & 56 & \num{256} & \num{802816} & \num{473520.1} & \num{784.0} {\tiny MB} & \num{590} {\tiny K} & \num{6.8} {\tiny MB} \\
        \texttt{conv7} & 256 & 3 & 1 & 1 & 56 & \num{256} & \num{802816} & \num{473520.1} & \num{784.0} {\tiny MB} & \num{590} {\tiny K} & \num{6.8} {\tiny MB} \\
        \texttt{pool3} & 1 & 2 & 2 & 0 & 28 & \num{256} & \num{200704} & \num{102.8} & \num{196.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv8} & 512 & 3 & 1 & 1 & 28 & \num{512} & \num{401408} & \num{236760.1} & \num{392.0} {\tiny MB} & \num{1180} {\tiny K} & \num{13.5} {\tiny MB} \\
        \texttt{conv9} & 512 & 3 & 1 & 1 & 28 & \num{512} & \num{401408} & \num{473520.1} & \num{392.0} {\tiny MB} & \num{2360} {\tiny K} & \num{27.0} {\tiny MB} \\
        \texttt{conv10} & 512 & 3 & 1 & 1 & 28 & \num{512} & \num{401408} & \num{473520.1} & \num{392.0} {\tiny MB} & \num{2360} {\tiny K} & \num{27.0} {\tiny MB} \\
        \texttt{pool4} & 1 & 2 & 2 & 0 & 14 & \num{512} & \num{100352} & \num{51.4} & \num{98.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{conv11} & 512 & 3 & 1 & 1 & 14 & \num{512} & \num{100352} & \num{118380.0} & \num{98.0} {\tiny MB} & \num{2360} {\tiny K} & \num{27.0} {\tiny MB} \\
        \texttt{conv12} & 512 & 3 & 1 & 1 & 14 & \num{512} & \num{100352} & \num{118380.0} & \num{98.0} {\tiny MB} & \num{2360} {\tiny K} & \num{27.0} {\tiny MB} \\
        \texttt{conv13} & 512 & 3 & 1 & 1 & 14 & \num{512} & \num{100352} & \num{118380.0} & \num{98.0} {\tiny MB} & \num{2360} {\tiny K} & \num{27.0} {\tiny MB} \\
        \texttt{pool5} & 1 & 2 & 2 & 0 & 7 & \num{512} & \num{25088} & \num{12.8} & \num{24.5} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-12}
        \texttt{flatten} & 1 & 1 & 1 & 0 & 1 & \num{25088} & \num{25088} & \num{0.0} & \num{0.0} & \num{0} & \num{0.0} \\
        \texttt{fc14} & 4096 & 1 & 1 & 0 & 1 & \num{4096} & \num{4096} & \num{26306.7} & \num{4.0} {\tiny MB} & \num{102786} {\tiny K} & \num{1176.3} {\tiny MB} \\
        \texttt{fc15} & 4096 & 1 & 1 & 0 & 1 & \num{4096} & \num{4096} & \num{4295.0} & \num{4.0} {\tiny MB} & \num{16781} {\tiny K} & \num{192.0} {\tiny MB} \\
        \texttt{fc16} & 1000 & 1 & 1 & 0 & 1 & \num{1000} & \num{1000} & \num{1048.6} & \num{1.0} {\tiny MB} & \num{4100} {\tiny K} & \num{46.9} {\tiny MB} \\
        \midrule
        &&&&&&& \textbf{Total} & \num{3961171} & \num{14733} {\tiny MB} & \num{138382} {\tiny K} & \num{1584} {\tiny MB} \\
        \bottomrule
    \end{tabular}
\end{table}


\section{Inception v1 (GoogLeNet)}
\marginnote{Inception v1 (GoogLeNet)}

Network designed to make efficient use of computing resources.


\subsection{Architecture}

\begin{description}
    \item[Stem layers]
        Down-sample the input from a spatial resolution of $224 \times 224$ to $28 \times 28$.
        As in ZFNet, multiple layers are used (5 in total) and the largest convolution has a $7 \times 7$ kernel with stride $2$.
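
        \begin{example}
            Following the table at the end of the section, the spatial resolution evolves as
            $224 \xrightarrow{\texttt{conv1}} 112 \xrightarrow{\texttt{pool1}} 56 \xrightarrow{\texttt{conv2}} 56 \xrightarrow{\texttt{conv3}} 56 \xrightarrow{\texttt{pool2}} 28$
            (e.g. for \texttt{conv1}: $\left\lfloor \frac{224 - 7 + 2 \cdot 3}{2} \right\rfloor + 1 = 112$).
        \end{example}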

    \item[Inception module] \marginnote{Inception module}
        Main component of Inception v1, which computes multiple convolutions of the same input in parallel.

        \begin{description}
            \item[Naive approach]
                Given the input, the output is the channel-wise concatenation of:
                \begin{itemize}
                    \item A $5 \times 5$ convolution with stride $1$ and padding $2$.
                    \item A $3 \times 3$ convolution with stride $1$ and padding $1$.
                    \item A $1 \times 1$ convolution with stride $1$ and padding $0$.
                    \item A $3 \times 3$ max-pooling with stride $1$ and padding $1$.
                \end{itemize}

                With this approach, two problems arise:
                \begin{itemize}
                    \item The max-pooling layer outputs a large number of channels (the same as its input), which inflates the depth of the concatenated output.
                    \item The convolutions are computationally expensive due to the large number of input channels.
                \end{itemize}

                \begin{figure}[H]
                    \centering
                    \includegraphics[width=0.7\linewidth]{./img/_naive_inception.pdf}
                    \caption{Naive inception module on the output of the stem layers}
                \end{figure}

            \item[Actual approach]
                Same as the naive approach, but the max-pooling and the $5 \times 5$ and $3 \times 3$ convolutions
                are preceded by $1 \times 1$ convolutions that reduce the number of channels.

                \begin{remark}
                    For max-pooling, the $1 \times 1$ convolution can equivalently be placed before or after the pooling.
                \end{remark}

                \begin{figure}[H]
                    \centering
                    \includegraphics[width=0.7\linewidth]{./img/_actual_inception.pdf}
                    \caption{Actual inception module on the output of the stem layers}
                \end{figure}
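
                As a rough estimate of the savings, consider the $5 \times 5$ branch of the first inception module
                ($28 \times 28$ input with $192$ channels, $32$ output channels, $16$-channel reduction, as in the table below):
                \begin{example}
                    \phantom{}
                    \begin{itemize}
                        \item Naive $5 \times 5$ branch: $28 \cdot 28 \cdot (5 \cdot 5 \cdot 192 \cdot 32) \approx 120$ M multiply-accumulates.
                        \item With the $1 \times 1$ reduction: $28 \cdot 28 \cdot (192 \cdot 16) + 28 \cdot 28 \cdot (5 \cdot 5 \cdot 16 \cdot 32) \approx 12$ M multiply-accumulates.
                    \end{itemize}
                \end{example}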
        \end{description}

        \begin{remark}
            The multiple parallel convolutions of an inception module can be seen as decision components: the module lets the network choose among different filter sizes.
        \end{remark}

    \item[Auxiliary \texttt{softmax}]
        Intermediate \texttt{softmax} classifiers are used to ensure that the hidden features are good enough.
        They also act as regularizers.

        During inference, they are discarded.

    \item[Global average pooling classifier] \marginnote{Global average pooling classifier}
        Instead of flattening between the convolutional and fully connected layers,
        global average pooling is used to reduce the number of parameters.

        \begin{remark}
            If the kernel size of the pooling layer is computed by the layer itself (e.g. \texttt{AdaptiveAvgPool2d}),
            the network is able to process inputs of any size (although this does not guarantee good classification quality for every input shape).
        \end{remark}
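
        \begin{example}
            The last inception module of Inception v1 outputs a $7 \times 7 \times 1024$ activation.
            Global average pooling reduces it to $1024$ values, so the final classifier needs $1024 \cdot 1000 + 1000 \approx 1$ M parameters;
            flattening would instead produce $7 \cdot 7 \cdot 1024 = 50176$ values, requiring $\approx 50$ M parameters for the same classifier.
        \end{example}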

\end{description}

\begin{figure}[H]
    \centering
    \includegraphics[angle=-90, width=0.85\linewidth]{./img/_inception_v1.pdf}
    \caption{Architecture of Inception v1}
\end{figure}


\subsection{Properties}

\begin{itemize}
    \item The fully connected layer has a relatively small number of parameters and a negligible number of FLOPs.
    \item Metrics were measured using test-time augmentation:
        multiple crops of the input image are processed by the network separately and the outputs are averaged, as in an ensemble of models.
        Strictly speaking, this makes the results difficult to compare with models that only do a single forward pass.
\end{itemize}

\begin{table}[H]
    \centering
    \caption{Parameters of Inception v1 (batch size of 128)}
    \scriptsize
    \setlength{\tabcolsep}{2pt}
    \begin{tabular}{cccccccccccccccccccc}
        \toprule
        \multirow{2}[20]{*}{\textbf{Layer}}
        & \multicolumn{4}{c}{\makecell{ \textbf{Incep. $1 \times 1$}\\\textbf{Other conv.} }}
        & \multicolumn{3}{c}{\textbf{Incep. $3 \times 3$}}
        & \multicolumn{3}{c}{\textbf{Incep. $5 \times 5$}}
        & \multicolumn{2}{c}{\textbf{Max-pool}}
        & \multicolumn{3}{c}{\textbf{Single activ.}}
        & \multicolumn{2}{c}{\textbf{Batch requir.}}
        & \multicolumn{2}{c}{\textbf{Parameters}} \\
        \cmidrule(lr){2-5} \cmidrule(lr){6-8} \cmidrule(lr){9-11} \cmidrule(lr){12-13} \cmidrule(lr){14-16} \cmidrule(lr){17-18} \cmidrule(lr){19-20}
        & \rot{\textbf{Channels}} & \rot{\textbf{H/W}} & \rot{\textbf{Stride}} & \rot{\textbf{Padding}}
        & \rot{\textbf{Channels}} & \rot{\textbf{$1 \times 1$ ch.s}} & \rot{\textbf{H/W}}
        & \rot{\textbf{Channels}} & \rot{\textbf{$1 \times 1$ ch.s}} & \rot{\textbf{H/W}}
        & \rot{\textbf{$1 \times 1$ ch.s}} & \rot{\textbf{H/W}}
        & \rot{\textbf{H/W}} & \rot{\textbf{Channels}} & \rot{\texttt{\#activations}}
        & \rot{\textbf{MFLOPs}} & \rot{\textbf{Activ. mem.}} & \rot{\textbf{Amount}} & \rot{\textbf{Memory}} \\
        \midrule
        \texttt{input} & -- & -- & -- & -- & -- & -- & -- & -- & -- & -- & -- & -- & 224 & \num{3} & \num{150528} & -- & \num{73.5} {\tiny MB} & \num{0} & \num{0.0} \\
        \cmidrule(lr){1-20}
        \texttt{conv1} & 64 & 7 & 2 & 3 & -- & -- & -- & -- & -- & -- & -- & -- & 112 & \num{64} & \num{802816} & \num{30211.6} & \num{784.0} {\tiny MB} & \num{9} {\tiny K} & \num{0.1} {\tiny MB} \\
        \texttt{pool1} & 1 & 3 & 2 & 1 & -- & -- & -- & -- & -- & -- & -- & -- & 56 & \num{64} & \num{200704} & \num{231.2} & \num{196.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \texttt{conv2} & 64 & 1 & 1 & 0 & -- & -- & -- & -- & -- & -- & -- & -- & 56 & \num{64} & \num{200704} & \num{3288.3} & \num{196.0} {\tiny MB} & \num{4} {\tiny K} & \num{0.0} \\
        \texttt{conv3} & 192 & 3 & 1 & 1 & -- & -- & -- & -- & -- & -- & -- & -- & 56 & \num{192} & \num{602112} & \num{88785.0} & \num{588.0} {\tiny MB} & \num{111} {\tiny K} & \num{1.3} {\tiny MB} \\
        \cmidrule(lr){1-20}
        \texttt{pool2} & 1 & 3 & 2 & 1 & -- & -- & -- & -- & -- & -- & -- & -- & 28 & \num{192} & \num{150528} & \num{173.4} & \num{147.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \texttt{incep1} & 64 & 1 & 1 & 0 & 128 & 96 & 3 & 32 & 16 & 5 & 32 & 3 & 28 & \num{256} & \num{200704} & \num{31380.5} & \num{196.0} {\tiny MB} & \num{163} {\tiny K} & \num{1.9} {\tiny MB} \\
        \texttt{incep2} & 128 & 1 & 1 & 0 & 192 & 128 & 3 & 96 & 32 & 5 & 64 & 3 & 28 & \num{480} & \num{376320} & \num{75683.1} & \num{367.5} {\tiny MB} & \num{388} {\tiny K} & \num{4.4} {\tiny MB} \\
        \texttt{pool3} & 1 & 3 & 2 & 1 & -- & -- & -- & -- & -- & -- & -- & -- & 14 & \num{480} & \num{94080} & \num{108.4} & \num{91.9} {\tiny MB} & \num{0} & \num{0.0} \\
        \texttt{incep3} & 192 & 1 & 1 & 0 & 208 & 96 & 3 & 48 & 16 & 5 & 64 & 3 & 14 & \num{512} & \num{100352} & \num{17403.4} & \num{98.0} {\tiny MB} & \num{376} {\tiny K} & \num{4.3} {\tiny MB} \\
        \texttt{incep4} & 160 & 1 & 1 & 0 & 224 & 112 & 3 & 64 & 24 & 5 & 64 & 3 & 14 & \num{512} & \num{100352} & \num{20577.8} & \num{98.0} {\tiny MB} & \num{449} {\tiny K} & \num{5.1} {\tiny MB} \\
        \texttt{incep5} & 128 & 1 & 1 & 0 & 256 & 128 & 3 & 64 & 24 & 5 & 64 & 3 & 14 & \num{512} & \num{100352} & \num{23609.2} & \num{98.0} {\tiny MB} & \num{509} {\tiny K} & \num{5.8} {\tiny MB} \\
        \texttt{incep6} & 112 & 1 & 1 & 0 & 288 & 144 & 3 & 64 & 32 & 5 & 64 & 3 & 14 & \num{528} & \num{103488} & \num{28233.4} & \num{101.1} {\tiny MB} & \num{605} {\tiny K} & \num{6.9} {\tiny MB} \\
        \texttt{incep7} & 256 & 1 & 1 & 0 & 320 & 160 & 3 & 128 & 32 & 5 & 128 & 3 & 14 & \num{832} & \num{163072} & \num{41445.4} & \num{159.3} {\tiny MB} & \num{867} {\tiny K} & \num{9.9} {\tiny MB} \\
        \texttt{pool4} & 1 & 3 & 2 & 1 & -- & -- & -- & -- & -- & -- & -- & -- & 7 & \num{832} & \num{40768} & \num{47.0} & \num{39.8} {\tiny MB} & \num{0} & \num{0.0} \\
        \texttt{incep8} & 256 & 1 & 1 & 0 & 320 & 160 & 3 & 128 & 32 & 5 & 128 & 3 & 7 & \num{832} & \num{40768} & \num{11860.0} & \num{39.8} {\tiny MB} & \num{1042} {\tiny K} & \num{11.9} {\tiny MB} \\
        \texttt{incep9} & 384 & 1 & 1 & 0 & 384 & 192 & 3 & 128 & 48 & 5 & 128 & 3 & 7 & \num{1024} & \num{50176} & \num{16689.7} & \num{49.0} {\tiny MB} & \num{1443} {\tiny K} & \num{16.5} {\tiny MB} \\
        \cmidrule(lr){1-20}
        \texttt{avgpool} & 1 & 1 & 1 & 0 & -- & -- & -- & -- & -- & -- & -- & -- & 1 & \num{1024} & \num{1024} & \num{6.4} & \num{1.0} {\tiny MB} & \num{0} & \num{0.0} \\
        \texttt{fc1} & 1000 & 1 & 1 & 0 & -- & -- & -- & -- & -- & -- & -- & -- & 1 & \num{1000} & \num{1000} & \num{262.1} & \num{1.0} {\tiny MB} & \num{1025} {\tiny K} & \num{11.7} {\tiny MB} \\
        \midrule
        &&&&&&&&&&&&&&& \textbf{Total} & \num{389996} & \num{3251} {\tiny MB} & \num{6992} {\tiny K} & \num{80} {\tiny MB} \\
        \bottomrule
    \end{tabular}
\end{table}
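
As a check of the table, the parameter count of the first inception module can be reproduced branch by branch (ignoring biases, which add a negligible amount):
\begin{example}
    For \texttt{incep1} (input $28 \times 28 \times 192$):
    \[
        \underbrace{192 \cdot 64}_{1 \times 1}
        + \underbrace{192 \cdot 96 + 3 \cdot 3 \cdot 96 \cdot 128}_{3 \times 3 \text{ branch}}
        + \underbrace{192 \cdot 16 + 5 \cdot 5 \cdot 16 \cdot 32}_{5 \times 5 \text{ branch}}
        + \underbrace{192 \cdot 32}_{\text{pool branch}}
        = 163328 \approx 163 \text{ K}
    \]
\end{example}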