Add image compression

commit 00381a20d0
parent c5f6b1bf80
Date: 2024-12-04 20:42:59 +01:00
186 changed files with 196 additions and 173 deletions
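Only the LaTeX side of the change is visible below: every \includegraphics reference is switched from a vector or lossless source (.pdf/.png) to a compressed .jpg, and most of the 186 changed files are presumably the figure binaries themselves. The conversion tooling is not part of the commit, so the following is a minimal sketch of how the step could be reproduced, assuming Pillow and pdf2image (which requires poppler) are installed; the script and every name in it are illustrative, not the author's actual pipeline:

    from pathlib import Path
    import re

    from PIL import Image
    from pdf2image import convert_from_path

    # Matches \includegraphics[...]{<path>.pdf} or {<path>.png} and captures
    # the prefix, the path stem, and the extension.
    INCLUDE = re.compile(r"(\\includegraphics(?:\[[^\]]*\])?\{)([^}]+)\.(pdf|png)\}")

    def to_jpg(src: Path, quality: int = 85) -> None:
        """Rasterize a .pdf or flatten a .png into a same-named .jpg."""
        dst = src.with_suffix(".jpg")
        if src.suffix.lower() == ".pdf":
            page = convert_from_path(str(src), dpi=200)[0]  # single-page figure
            page.convert("RGB").save(dst, "JPEG", quality=quality)
        else:
            # JPEG has no alpha channel: composite the PNG onto white first.
            img = Image.open(src).convert("RGBA")
            background = Image.new("RGB", img.size, (255, 255, 255))
            background.paste(img, mask=img.getchannel("A"))
            background.save(dst, "JPEG", quality=quality)

    for tex in Path(".").rglob("*.tex"):
        text = tex.read_text()
        new_text = INCLUDE.sub(r"\g<1>\g<2>.jpg}", text)
        if new_text != text:
            # Convert every figure this file references, then rewrite the paths.
            for m in INCLUDE.finditer(text):
                src = tex.parent / f"{m.group(2)}.{m.group(3)}"
                if src.exists() and not src.with_suffix(".jpg").exists():
                    to_jpg(src)
            tex.write_text(new_text)

Rasterizing the PDFs at a fixed DPI and flattening PNG transparency onto white trades some figure fidelity for a much smaller repository, which is the point of the commit.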

View File

@@ -176,7 +176,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_grouped_conv.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_grouped_conv.jpg}
 \end{figure}
 By processing the input in smaller chunks, there are the following gains:
@@ -195,7 +195,7 @@ Network with bottleneck-block-inspired inception modules.
 Given the number of branches $G$ and the number of intermediate channels $d$, a ResNeXt block decomposes a bottleneck residual block into $G$ parallel branches that are summed together at the end.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/_resnext_block.pdf}
+\includegraphics[width=0.35\linewidth]{./img/_resnext_block.jpg}
 \end{figure}
 \begin{remark}
@@ -245,7 +245,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l3.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l3.jpg}
 \end{figure}
 \item[First $1 \times 1$ convolution]
@@ -253,7 +253,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l1.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_resnext_to_resnet_l1.jpg}
 \end{figure}
 \item[$3 \times 3$ convolution]
@@ -261,7 +261,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_resnext_to_resnet_l2.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_resnext_to_resnet_l2.jpg}
 \end{figure}
 \end{descriptionlist}
@@ -301,7 +301,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/se_resnet.png}
+\includegraphics[width=0.4\linewidth]{./img/se_resnet.jpg}
 \caption{SE-ResNet module}
 \end{figure}
 \end{description}
@@ -331,7 +331,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_depthwise_conv.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_depthwise_conv.jpg}
 \end{figure}
 \end{description}
@@ -351,7 +351,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_inverted_residual.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_inverted_residual.jpg}
 \end{figure}
 \end{description}
@@ -414,7 +414,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.85\linewidth]{./img/single_model_scaling.png}
+\includegraphics[width=0.85\linewidth]{./img/single_model_scaling.jpg}
 \caption{\parbox[t]{0.7\linewidth}{Top-1 accuracy variation with width, depth, and resolution scaling on EfficientNet}}
 \end{figure}
@@ -423,7 +423,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/compound_scaling.png}
+\includegraphics[width=0.45\linewidth]{./img/compound_scaling.jpg}
 \caption{Width scaling for different fixed depths and resolutions}
 \end{figure}
@@ -450,7 +450,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.95\linewidth]{./img/_model_scaling.pdf}
+\includegraphics[width=0.95\linewidth]{./img/_model_scaling.jpg}
 \caption{Model scaling approaches}
 \end{figure}
@@ -463,7 +463,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/wide_resnet.png}
+\includegraphics[width=0.5\linewidth]{./img/wide_resnet.jpg}
 \end{figure}
 \begin{remark}
@@ -481,7 +481,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/neural_architecture_search.png}
+\includegraphics[width=0.45\linewidth]{./img/neural_architecture_search.jpg}
 \end{figure}
 \begin{remark}
@@ -494,7 +494,7 @@ Network with bottleneck-block-inspired inception modules.
 Scaling the baseline model (B0) made it possible to obtain high accuracy with a controlled number of FLOPs.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/efficientnet_scaling.png}
+\includegraphics[width=0.45\linewidth]{./img/efficientnet_scaling.jpg}
 \end{figure}
 \end{description}
@@ -521,7 +521,7 @@ Network with bottleneck-block-inspired inception modules.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.95\linewidth]{./img/regnet.png}
+\includegraphics[width=0.95\linewidth]{./img/regnet.jpg}
 \end{figure}
 In other words, RegNet defines a $16$-dimensional design space. To evaluate the architectures, the following is done:
@@ -532,7 +532,7 @@ Network with bottleneck-block-inspired inception modules.
 \item Evaluate the design space by plotting $F$.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.25\linewidth]{./img/edf.png}
+\includegraphics[width=0.25\linewidth]{./img/edf.jpg}
 \caption{Example of cumulative distribution}
 \end{figure}

View File

@@ -14,7 +14,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/stereo_correspondence.png}
+\includegraphics[width=0.65\linewidth]{./img/stereo_correspondence.jpg}
 \end{figure}
 \end{description}
@@ -56,7 +56,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_stereo_pipeline_naive.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_stereo_pipeline_naive.jpg}
 \end{figure}
 \item[Reconstruction approach]
@@ -64,7 +64,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_stereo_pipeline_reconstruction.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_stereo_pipeline_reconstruction.jpg}
 \end{figure}
 \end{description}
 \end{description}
@@ -79,7 +79,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_monodepth_naive.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_monodepth_naive.jpg}
 \caption{Naive training flow}
 \end{figure}
@@ -97,7 +97,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_monodepth_train_naive.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_monodepth_train_naive.jpg}
 \caption{Backward reconstruction from the right image}
 \end{figure}
 \end{description}
@@ -107,13 +107,13 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_monodepth_train_correct.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_monodepth_train_correct.jpg}
 \caption{Backward reconstruction from the left image}
 \end{figure}
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_monodepth_correct.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_monodepth_correct.jpg}
 \caption{Actual training flow}
 \end{figure}
@@ -141,7 +141,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/monodepth_no_lr_results.png}
+\includegraphics[width=0.9\linewidth]{./img/monodepth_no_lr_results.jpg}
 \end{figure}
 \end{remark}
@@ -170,7 +170,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_monodepth_lr.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_monodepth_lr.jpg}
 \end{figure}
 \item[Inference]
@@ -189,7 +189,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/monodepth_lr_results.png}
+\includegraphics[width=0.7\linewidth]{./img/monodepth_lr_results.jpg}
 \caption{Comparison of Monodepth with and without left-right processing}
 \end{figure}
 \end{description}
@@ -212,7 +212,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_sfmlearner.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_sfmlearner.jpg}
 \caption{SfMLearner with two nearby images}
 \end{figure}
 \end{description}

View File

@@ -8,7 +8,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/generative_task.png}
+\includegraphics[width=0.4\linewidth]{./img/generative_task.jpg}
 \end{figure}
 \begin{remark}
@@ -16,7 +16,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/image_manifold.png}
+\includegraphics[width=0.6\linewidth]{./img/image_manifold.jpg}
 \end{figure}
 \end{remark}
@@ -33,7 +33,7 @@
 Model that takes as input a latent representation and maps it into an output image.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/latent_for_generation.png}
+\includegraphics[width=0.7\linewidth]{./img/latent_for_generation.jpg}
 \end{figure}
 \begin{remark}
@@ -192,7 +192,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_inception_score.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_inception_score.jpg}
 \end{figure}
 \end{description}
@@ -205,7 +205,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/earth_mover.png}
+\includegraphics[width=0.4\linewidth]{./img/earth_mover.jpg}
 \caption{
 \parbox[t]{0.8\linewidth}{
 Three cases of density-function distances. The distributions in the first case are closer than those in the second. In the third case, they are mostly overlapping.
@@ -228,7 +228,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/earth_mover_plan.png}
+\includegraphics[width=0.6\linewidth]{./img/earth_mover_plan.jpg}
 \end{figure}
@@ -277,7 +277,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/manifold_precision_recall.png}
+\includegraphics[width=0.8\linewidth]{./img/manifold_precision_recall.jpg}
 \end{figure}
@@ -296,7 +296,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_gan_flow.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_gan_flow.jpg}
 \end{figure}
@@ -405,7 +405,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/dcgan.png}
+\includegraphics[width=0.6\linewidth]{./img/dcgan.jpg}
 \end{figure}
 \end{description}
@@ -414,7 +414,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/gan_latent_interpolation.png}
+\includegraphics[width=0.5\linewidth]{./img/gan_latent_interpolation.jpg}
 \end{figure}
 \end{remark}
@@ -442,7 +442,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/gan_disjoint.png}
+\includegraphics[width=0.7\linewidth]{./img/gan_disjoint.jpg}
 \end{figure}
 \indenttbox
@@ -480,7 +480,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_progan.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_progan.jpg}
 \end{figure}
 \begin{description}
@@ -494,7 +494,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_progan_fadein.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_progan_fadein.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 ProGAN fade-in. (a) is the starting resolution. (b) depicts the fade-in process. (c) represents the network at the end of the training process for this resolution (i.e., with $\alpha=1$)
@@ -531,7 +531,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_stylegan.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_stylegan.jpg}
 \end{figure}
 \begin{remark}

View File

@@ -47,7 +47,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_cnn_knn_face_recognition.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_cnn_knn_face_recognition.jpg}
 \end{figure}
 \begin{remark}
@@ -57,7 +57,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_mnist_embeddings.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_mnist_embeddings.jpg}
 \caption{MNIST embeddings in 2D}
 \end{figure}
 \end{remark}
@@ -89,7 +89,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_siamese_network.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_siamese_network.jpg}
 \end{figure}
 \item[Contrastive loss] \marginnote{Contrastive loss}
@@ -140,7 +140,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/deepid2.png}
+\includegraphics[width=0.7\linewidth]{./img/deepid2.jpg}
 \caption{DeepID2 on a single crop}
 \end{figure}
 \end{description}
@@ -183,7 +183,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_triplet_loss.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_triplet_loss.jpg}
 \end{figure}
 \begin{remark}
@@ -240,7 +240,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_embedding_l2_norm_effect.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_embedding_l2_norm_effect.jpg}
 \end{figure}
 \end{remark}
@@ -253,7 +253,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_template_matching.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_template_matching.jpg}
 \end{figure}
 \end{remark}
@@ -267,12 +267,12 @@
 \centering
 \begin{subfigure}{0.48\linewidth}
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_arcface_softmax.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_arcface_softmax.jpg}
 \caption{Softmax}
 \end{subfigure}
 \begin{subfigure}{0.48\linewidth}
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_arcface_cluster.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_arcface_cluster.jpg}
 \caption{ArcFace}
 \end{subfigure}
 \caption{
@@ -288,13 +288,13 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_arcface_penalty.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_arcface_penalty.jpg}
 \caption{Penalty application with \texttt{Curie} as the correct class}
 \end{figure}
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.95\linewidth]{./img/_arcface_flow.png}
+\includegraphics[width=0.95\linewidth]{./img/_arcface_flow.jpg}
 \caption{Overall ArcFace flow}
 \end{figure}
 \end{description}
@@ -416,7 +416,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_clip_training.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_clip_training.jpg}
 \caption{
 \parbox[t]{0.6\linewidth}{CLIP training flow. NT-Xent loss is applied column- or row-wise in the dot product matrix.}
 }
@@ -438,7 +438,7 @@
 Given an image to classify, it is embedded and compared with the embeddings of prompts referencing the classes (e.g., \texttt{a photo of a [object]}). The closest one is taken as the predicted class.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_clip_inference.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_clip_inference.jpg}
 \end{figure}
 \end{description}
@@ -454,12 +454,12 @@
 \centering
 \begin{subfigure}{0.35\linewidth}
 \centering
-\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.pdf}
+\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.jpg}
 \end{subfigure}
 \hfill
 \begin{subfigure}{0.6\linewidth}
 \centering
-\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.pdf}
+\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.jpg}
 \end{subfigure}
 \end{figure}
 \end{remark}
@@ -476,6 +476,6 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.jpg}
 \end{figure}
 \end{remark}

View File

@@ -12,7 +12,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_object_detection_example.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_object_detection_example.jpg}
 \end{figure}
 \begin{remark}
@@ -55,7 +55,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/obj_det_recall_precision.png}
+\includegraphics[width=0.7\linewidth]{./img/obj_det_recall_precision.jpg}
 \caption{
 Recall and precision in different scenarios
 }
@@ -68,7 +68,7 @@
 Consider the following image and the bounding boxes found by a detector:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve1.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve1.jpg}
 \caption{
 \parbox[t]{0.6\linewidth}{
 Ground-truth (yellow boxes) and predictions (orange boxes) with their confidence score
@@ -79,7 +79,7 @@
 By sorting the confidence scores, it is possible to plot the precision-recall curve by varying the threshold $\rho_\text{min}$:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve2.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve2.jpg}
 \end{figure}
 \indenttbox
@@ -164,7 +164,7 @@
 The training samples and their initial weights are the following:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.3\linewidth]{./img/_adaboost_example1.pdf}
+\includegraphics[width=0.3\linewidth]{./img/_adaboost_example1.jpg}
 \end{figure}
 We want to train an ensemble of $3$ decision stumps $\texttt{WL}_{j}$.
@@ -176,7 +176,7 @@
 The new reweighed and normalized samples are:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_adaboost_example2.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_adaboost_example2.jpg}
 \end{figure}
 Now, assume that the second classifier learns $x_1 > 10$. The error rate and reweigh factor are:
@@ -185,7 +185,7 @@
 The new reweighed and normalized samples are:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_adaboost_example3.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_adaboost_example3.jpg}
 \end{figure}
 Finally, the third classifier learns $x_2 > 20$. The error rate and reweigh factor are:
@@ -220,13 +220,13 @@
 \centering
 \begin{subfigure}{0.6\linewidth}
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_haar_like_example.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_haar_like_example.jpg}
 \caption{Filter applied on a patch}
 \end{subfigure}
 \hfill
 \begin{subfigure}{0.35\linewidth}
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_haar_like_filters_example.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_haar_like_filters_example.jpg}
 \caption{Other possible filters}
 \end{subfigure}
 \caption{Example of filters}
@@ -247,7 +247,7 @@
 In other words, the value at coordinates $(i, j)$ in the integral image is the sum of all the pixels of the original image within the rectangle whose top-left corner is the image origin and whose bottom-right corner is the pixel at $(i, j)$.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_integral_image.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_integral_image.jpg}
 \caption{Example of integral image}
 \end{figure}
@@ -262,7 +262,7 @@
 where $A$, $B$, $C$, and $D$ are coordinates defined as in \Cref{fig:integral_image_features}.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_integral_image_feature.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_integral_image_feature.jpg}
 \caption{Summation of the pixels in the blue area}
 \label{fig:integral_image_features}
 \end{figure}
@@ -290,7 +290,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_viola_jones_cascade.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_viola_jones_cascade.jpg}
 \end{figure}
 \end{description}
@@ -341,7 +341,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_cnn_object_localization.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_cnn_object_localization.jpg}
 \caption{Localizer with AlexNet as feature extractor and 1000 classes}
 \end{figure}
@@ -383,7 +383,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/selective_search.png}
+\includegraphics[width=0.45\linewidth]{./img/selective_search.jpg}
 \caption{Example of some iterations of selective search}
 \end{figure}
@@ -404,7 +404,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_r_cnn.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_r_cnn.jpg}
 \caption{Example of R-CNN using AlexNet}
 \end{figure}
@@ -471,7 +471,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_fast_r_cnn.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_fast_r_cnn.jpg}
 \caption{Example of fast R-CNN using AlexNet}
 \end{figure}
@@ -486,7 +486,7 @@
 \end{remark}
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.85\linewidth]{./img/_roipool_snap.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_roipool_snap.jpg}
 \caption{Project and snap operations}
 \end{figure}
 \item Apply max pooling with kernel of approximately size $\left\lceil \frac{H_r}{H_O} \right\rceil \times \left\lceil \frac{W_r}{W_O} \right\rceil$ and stride approximately $\left\lfloor \frac{H_r}{H_O} \right\rfloor \times \left\lfloor \frac{W_r}{W_O} \right\rfloor$.
@@ -495,7 +495,7 @@
 \end{remark}
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.85\linewidth]{./img/_roipool_maxpool.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_roipool_maxpool.jpg}
 \caption{Pooling operation with varying kernel size}
 \end{figure}
 \end{enumerate}
@@ -552,7 +552,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_faster_r_cnn.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_faster_r_cnn.jpg}
 \caption{Example of faster R-CNN using AlexNet}
 \end{figure}
@@ -586,7 +586,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.8\linewidth]{./img/_rpn_anchor.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_rpn_anchor.jpg}
 \caption{Example of an iteration of a 1-anchor RPN}
 \end{figure}
@@ -606,7 +606,7 @@
 \end{enumerate}
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.7\linewidth]{./img/_rpn_architecture.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_rpn_architecture.jpg}
 \end{figure}
 \end{description}
@@ -656,7 +656,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_image_pyramid_multi_scale.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_image_pyramid_multi_scale.jpg}
 \end{figure}
 \item[CNN pyramid multi-scale detection] \marginnote{CNN pyramid multi-scale detection}
@@ -668,7 +668,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_cnn_pyramid_multi_scale.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_cnn_pyramid_multi_scale.jpg}
 \end{figure}
 \item[Feature pyramid network (FPN)] \marginnote{Feature pyramid network (FPN)}
@@ -676,7 +676,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_fpn_flow.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_fpn_flow.jpg}
 \caption{General FPN flow}
 \end{figure}
@@ -691,7 +691,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_fpn_top_down.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_fpn_top_down.jpg}
 \caption{FPN top-down flow}
 \end{figure}
 \end{description}
@@ -701,7 +701,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_faster_r_cnn_fpn.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_faster_r_cnn_fpn.jpg}
 \caption{Example of faster R-CNN with FPN}
 \end{figure}
@@ -733,7 +733,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.95\linewidth]{./img/_one_stage_detector.pdf}
+\includegraphics[width=0.95\linewidth]{./img/_one_stage_detector.jpg}
 \end{figure}
 \item[Multi-label classification] \marginnote{Multi-label classification}
@@ -763,7 +763,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_darknet.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_darknet.jpg}
 \end{figure}
 \item[Learned anchors] \marginnote{Learned anchors}
@@ -798,7 +798,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_retinanet.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_retinanet.jpg}
 \end{figure}
 \begin{remark}
@@ -827,7 +827,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_focal_loss.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_focal_loss.jpg}
 \caption{Focal loss for varying $\gamma$}
 \end{figure}
@@ -875,12 +875,12 @@
 \centering
 \begin{subfigure}{0.49\linewidth}
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_focal_cdf_foreground.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_focal_cdf_foreground.jpg}
 \end{subfigure}
 \hfill
 \begin{subfigure}{0.49\linewidth}
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_focal_cdf_background.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_focal_cdf_background.jpg}
 \end{subfigure}
 \caption{
 \parbox[t]{0.7\linewidth}{
@@ -940,7 +940,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/centernet_outputs.png}
+\includegraphics[width=0.5\linewidth]{./img/centernet_outputs.jpg}
 \end{figure}
 \begin{description}
@@ -994,7 +994,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.55\linewidth]{./img/_centernet_other_tasks.png}
+\includegraphics[width=0.55\linewidth]{./img/_centernet_other_tasks.jpg}
 \end{figure}
 \end{remark}
@@ -1012,7 +1012,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_object_detection_map_speed_plot.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_object_detection_map_speed_plot.jpg}
 \caption{
 mAP -- speed comparison of the various object detection approaches
 }
@@ -1055,7 +1055,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_detr_architecture.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_detr_architecture.jpg}
 \end{figure}
 \item[Hungarian loss] \marginnote{Hungarian loss}
@@ -1085,7 +1085,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/hungarian_loss.png}
+\includegraphics[width=0.5\linewidth]{./img/hungarian_loss.jpg}
 \caption{
 Possible permutations and optimal permutation (in orange).
 }
@@ -1102,7 +1102,7 @@
 \item[Encoder] The encoder tends to solve a segmentation problem (i.e., determine what the object is).
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/detr_encoder.png}
+\includegraphics[width=0.8\linewidth]{./img/detr_encoder.jpg}
 \caption{
 \parbox[t]{0.75\linewidth}{Self-attention map of some pixels at the last encoder. Yellow tiles indicate that the analyzed pixel attends to that patch.}
 }
@@ -1111,7 +1111,7 @@
 \item[Decoder] The decoder tends to attend to object boundaries (i.e., determine where the object is).
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/detr_decoder.png}
+\includegraphics[width=0.8\linewidth]{./img/detr_decoder.jpg}
 \caption{
 \parbox[t]{0.75\linewidth}{Decoder attention. Highlighted areas have a higher attention weight.}
 }
@@ -1120,7 +1120,7 @@
 \item[Object query] Each object query tends to specialize in recognizing objects in specific areas.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/detr_object_query.png}
+\includegraphics[width=0.8\linewidth]{./img/detr_object_query.jpg}
 \caption{
 \parbox[t]{0.75\linewidth}{Position of the predictions of each object query. Green dots represent small boxes, red large horizontal boxes, and blue large vertical boxes.}
 }
@@ -1161,7 +1161,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/multiscale_comparison.png}
+\includegraphics[width=0.65\linewidth]{./img/multiscale_comparison.jpg}
 \end{figure}
 \end{remark}
@@ -1187,7 +1187,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.15\linewidth]{./img/bifpn.png}
+\includegraphics[width=0.15\linewidth]{./img/bifpn.jpg}
 \end{figure}
 \end{itemize}
 \end{description}

View File

@@ -27,7 +27,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/sgd_sphere.png}
+\includegraphics[width=0.35\linewidth]{./img/sgd_sphere.jpg}
 \end{figure}
 \end{remark}
@@ -38,7 +38,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/sgd_canyon.png}
+\includegraphics[width=0.8\linewidth]{./img/sgd_canyon.jpg}
 \end{figure}
 \end{remark}
@@ -47,7 +47,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/sgd_local_minima.png}
+\includegraphics[width=0.35\linewidth]{./img/sgd_local_minima.jpg}
 \end{figure}
 \end{remark}
@@ -67,7 +67,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_2order_optimizer.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_2order_optimizer.jpg}
 \end{figure}
 \end{description}
@@ -75,7 +75,7 @@ Methods that also consider the second-order derivatives when determining the ste
 For quadratic functions, second-order methods converge in one step.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/2order_1step.png}
+\includegraphics[width=0.35\linewidth]{./img/2order_1step.jpg}
 \end{figure}
 \end{remark}
@@ -116,7 +116,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/momentum.png}
+\includegraphics[width=0.8\linewidth]{./img/momentum.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 Plain SGD vs momentum SGD in a sphere and a canyon. In both cases, momentum converges before SGD.
@@ -143,13 +143,13 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/nesterov_momentum.png}
+\includegraphics[width=0.35\linewidth]{./img/nesterov_momentum.jpg}
 \caption{Visualization of the step in Nesterov momentum}
 \end{figure}
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/nesterov_comparison.png}
+\includegraphics[width=0.75\linewidth]{./img/nesterov_comparison.jpg}
 \caption{Plain SGD vs standard momentum vs Nesterov momentum}
 \end{figure}
 \end{description}
@@ -170,7 +170,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/adaptive_lr.png}
+\includegraphics[width=0.35\linewidth]{./img/adaptive_lr.jpg}
 \caption{
 \parbox[t]{0.5\linewidth}{Loss where the $w_1$ parameter has a larger gradient, while $w_2$ has a smaller gradient}
 }
@@ -206,7 +206,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/adagrad.png}
+\includegraphics[width=0.4\linewidth]{./img/adagrad.jpg}
 \caption{
 \parbox[t]{0.45\linewidth}{SGD vs AdaGrad. AdaGrad stops before getting close to the minimum.}
 }
@@ -233,7 +233,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/rmsprop.png}
+\includegraphics[width=0.35\linewidth]{./img/rmsprop.jpg}
 \caption{SGD vs AdaGrad vs RMSProp}
 \end{figure}
 \end{description}
@@ -271,13 +271,13 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/adam.png}
+\includegraphics[width=0.75\linewidth]{./img/adam.jpg}
 \caption{SGD vs AdaGrad vs RMSProp vs Adam}
 \end{figure}
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/adam_noisy.png}
+\includegraphics[width=0.75\linewidth]{./img/adam_noisy.jpg}
 \caption{SGD vs AdaGrad vs RMSProp vs Adam with a smaller batch size}
 \end{figure}
@@ -286,7 +286,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/optimizers_no_align.png}
+\includegraphics[width=0.8\linewidth]{./img/optimizers_no_align.jpg}
 \end{figure}
 \end{remark}
 \end{description}
@@ -300,7 +300,7 @@ Methods that also consider the second-order derivatives when determining the ste
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.35\linewidth]{./img/momentum_local_global.png}
+\includegraphics[width=0.35\linewidth]{./img/momentum_local_global.jpg}
 \end{figure}
 \end{remark}

View File

@@ -49,7 +49,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/motion_data.png}
+\includegraphics[width=0.75\linewidth]{./img/motion_data.jpg}
 \end{figure}
 \item[Depth comparison features] \marginnote{Depth comparison features}
@@ -63,7 +63,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_depth_comparison_features.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_depth_comparison_features.jpg}
 \caption{Examples of feature computation}
 \end{figure}
@@ -76,7 +76,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_depth_invariant_offset.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_depth_invariant_offset.jpg}
 \end{figure}
 \end{description}
@@ -101,7 +101,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.7\linewidth]{./img/_random_forest_bagging.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_random_forest_bagging.jpg}
 \end{figure}
 \item[Random splitting] \marginnote{Random splitting}
@@ -111,7 +111,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.7\linewidth]{./img/_random_forest_random_splitting.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_random_forest_random_splitting.jpg}
 \end{figure}
 \end{description}
@@ -138,7 +138,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_segmentation_rcnn.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_segmentation_rcnn.jpg}
 \caption{R-CNN for segmentation with $20$ ($+1$) classes}
 \end{figure}
 \end{description}
@@ -177,7 +177,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.85\linewidth]{./img/_fcn_32.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_fcn_32.jpg}
 \end{figure}
 \item[FCN-16S]
@@ -189,7 +189,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.85\linewidth]{./img/_fcn_16.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_fcn_16.jpg}
 \end{figure}
 \item[FCN-8S]
@@ -197,7 +197,7 @@
 \begin{figure}[H]
 \raggedleft
-\includegraphics[width=0.85\linewidth]{./img/_fcn_8.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_fcn_8.jpg}
 \end{figure}
 \end{descriptionlist}
@@ -228,7 +228,7 @@
 Consider images with $1$ channel. Given a $3 \times 3$ input image and a $3 \times 3$ transposed convolution kernel with stride $2$, the output activation has spatial dimension $5 \times 5$ and is obtained as follows:
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_transposed_convolution.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_transposed_convolution.jpg}
 \end{figure}
 \end{example}
@@ -267,7 +267,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.6\linewidth]{./img/_unet.pdf}
+\includegraphics[width=0.6\linewidth]{./img/_unet.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 U-Net structure. Note that in the original paper, convolutions have padding \texttt{valid} and the input is provided from a sliding window. Modern implementations have padding \texttt{same} and matching input and output spatial dimensions.
@@ -306,7 +306,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_dilated_convolution.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_dilated_convolution.jpg}
 \caption{Example of $3 \times 3$ dilated convolutions with increasing dilation rate}
 \end{figure}
@@ -320,7 +320,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_dilated_convolution_exponential.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_dilated_convolution_exponential.jpg}
 \end{figure}
 \end{remark}
@@ -342,12 +342,12 @@
 \centering
 \begin{subfigure}{0.45\linewidth}
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet_stage1.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet_stage1.jpg}
 \caption{ResNet with standard stages}
 \end{subfigure}
 \begin{subfigure}{0.45\linewidth}
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet_stage2.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet_stage2.jpg}
 \caption{ResNet with two dilated stages}
 \end{subfigure}
 \end{figure}
@@ -355,7 +355,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_dilated_resnet.jpg}
 \caption{Dilated ResNet with total stride $8$}
 \end{figure}
@@ -381,7 +381,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.95\linewidth]{./img/_deeplabv3.pdf}
+\includegraphics[width=0.95\linewidth]{./img/_deeplabv3.jpg}
 \end{figure}
 \item[DeepLab v3+] \marginnote{DeepLab v3+}
@@ -391,18 +391,18 @@
 \centering
 \begin{subfigure}{0.3\linewidth}
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_1.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_1.jpg}
 \caption{DeepLab v3}
 \end{subfigure}
 \begin{subfigure}{0.3\linewidth}
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_2.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_2.jpg}
 \caption{U-Net}
 \end{subfigure}
 \hfill
 \begin{subfigure}{0.3\linewidth}
 \centering
-\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_3.pdf}
+\includegraphics[width=0.9\linewidth]{./img/_deeplabv3plus_3.jpg}
 \caption{DeepLab v3+}
 \end{subfigure}
 \end{figure}
@@ -418,7 +418,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/obj_detection_and_segmentation.png}
+\includegraphics[width=0.7\linewidth]{./img/obj_detection_and_segmentation.jpg}
 \end{figure}
 \end{description}
@@ -436,17 +436,17 @@
 \item Divide the proposal into equal subregions without snapping to the grid.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_roi_align1.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_roi_align1.jpg}
 \end{figure}
 \item Sample some values following a regular grid within each subregion. Use bilinear interpolation to determine the values of the sampled points (as they are most likely not pixel-aligned).
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_roi_align2.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_roi_align2.jpg}
 \end{figure}
 \item Max or average pool the sampled values in each subregion.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_roi_align3.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_roi_align3.jpg}
 \end{figure}
 \end{enumerate}
@@ -461,13 +461,13 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_mask_rcnn_head.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_mask_rcnn_head.jpg}
 \end{figure}
 \end{description}
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_mask_rcnn.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_mask_rcnn.jpg}
 \caption{Overall architecture of mask R-CNN}
 \end{figure}
@@ -504,7 +504,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/segmentation_types.png}
+\includegraphics[width=0.5\linewidth]{./img/segmentation_types.jpg}
 \end{figure}
 \end{description}
@@ -523,7 +523,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.85\linewidth]{./img/_panoptic_fpn.pdf}
+\includegraphics[width=0.85\linewidth]{./img/_panoptic_fpn.jpg}
 \end{figure}
 \end{description}
@@ -550,7 +550,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_maskformer_naive.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_maskformer_naive.jpg}
 \end{figure}
 \item[Architecture (pixel decoder)]
@@ -563,7 +563,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_maskformer_decoder.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_maskformer_decoder.jpg}
 \end{figure}
 \item[Inference]
@@ -584,7 +584,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_maskformer_inference.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_maskformer_inference.jpg}
 \end{figure}
 \end{description}
@@ -607,7 +607,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_mask2former.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_mask2former.jpg}
 \end{figure}
 \end{description}
@@ -628,7 +628,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_spp.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_spp.jpg}
 \end{figure}
 \end{description}
@@ -639,7 +639,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/aspp_deeplabv2.png}
+\includegraphics[width=0.5\linewidth]{./img/aspp_deeplabv2.jpg}
 \end{figure}
 \begin{remark}
@@ -647,7 +647,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_dilated_conv_weights.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_dilated_conv_weights.jpg}
 \end{figure}
 \end{remark}
@@ -658,7 +658,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_deeplabv3_aspp.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_deeplabv3_aspp.jpg}
 \caption{
 ASPP with stride-16. With stride-8, rates are doubled.
 }

View File

@@ -9,7 +9,7 @@
 Neural architecture designed for NLP sequence-to-sequence tasks. It heavily relies on the attention mechanism.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/transformer.png}
+\includegraphics[width=0.4\linewidth]{./img/transformer.jpg}
 \end{figure}
 \item[Autoregressive generation] \marginnote{Autoregressive generation}
@@ -17,7 +17,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.3\linewidth]{./img/_transformer_autoregressive.pdf}
+\includegraphics[width=0.3\linewidth]{./img/_transformer_autoregressive.jpg}
 \caption{Example of autoregressive generation}
 \end{figure}
 \end{description}
@@ -32,7 +32,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.3\linewidth]{./img/traditional_attention.png}
+\includegraphics[width=0.3\linewidth]{./img/traditional_attention.jpg}
 \caption{Attention weights for machine translation}
 \end{figure}
@@ -59,7 +59,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_dot_product_attention.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_dot_product_attention.jpg}
 % \caption{Steps of dot-product attention}
 \end{figure}
@@ -84,7 +84,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_scaled_dot_attention.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_scaled_dot_attention.jpg}
 % \caption{Steps of scaled dot-product attention}
 \end{figure}
@@ -95,7 +95,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_scaled_dot_attention_multi_q.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_scaled_dot_attention_multi_q.jpg}
 % \caption{Steps of scaled dot-product attention with multidimensional queries}
 \end{figure}
@@ -106,7 +106,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_self_attention.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_self_attention.jpg}
 % \caption{Steps of self-attention}
 \end{figure}
 \end{description}
@@ -120,7 +120,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_transformer_embeddings.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_transformer_embeddings.jpg}
 \end{figure}
 \end{description}
@@ -139,7 +139,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.7\linewidth]{./img/_multi_head_attention.pdf}
+\includegraphics[width=0.7\linewidth]{./img/_multi_head_attention.jpg}
 \caption{\texttt{MHSA} with two heads}
 \end{figure}
@@ -171,7 +171,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/norm_methods.png}
+\includegraphics[width=0.8\linewidth]{./img/norm_methods.jpg}
 \caption{Affected axes of normalization methods}
 \end{figure}
@@ -225,12 +225,12 @@
 \centering
 \begin{subfigure}{0.40\linewidth}
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_post_norm_encoder.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_post_norm_encoder.jpg}
 \caption{Encoder in post-norm transformer}
 \end{subfigure}
 \begin{subfigure}{0.40\linewidth}
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_pre_norm_encoder.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_pre_norm_encoder.jpg}
 \caption{Encoder in pre-norm transformer}
 \end{subfigure}
 \end{figure}
@@ -268,7 +268,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.75\linewidth]{./img/_cross_attention.pdf}
+\includegraphics[width=0.75\linewidth]{./img/_cross_attention.jpg}
 \caption{Cross-attention data flow}
 \end{figure}
@@ -282,7 +282,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.5\linewidth]{./img/_transformer_decoder.pdf}
+\includegraphics[width=0.5\linewidth]{./img/_transformer_decoder.jpg}
 \caption{Decoder in post-norm transformer}
 \end{figure}
@@ -305,7 +305,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_masked_self_attention.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_masked_self_attention.jpg}
 \end{figure}
 \end{description}
 \end{description}
@@ -318,7 +318,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_self_attention_permutation.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_self_attention_permutation.jpg}
 \end{figure}
 \end{remark}
@@ -344,7 +344,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.65\linewidth]{./img/_transformer_position_encoding.pdf}
+\includegraphics[width=0.65\linewidth]{./img/_transformer_position_encoding.jpg}
 \end{figure}
 \end{description}
@@ -386,7 +386,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.8\linewidth]{./img/_vit_patch.pdf}
+\includegraphics[width=0.8\linewidth]{./img/_vit_patch.jpg}
 \end{figure}
 \item[Vision transformer (ViT)] \marginnote{Vision transformer (ViT)}
@@ -398,7 +398,7 @@
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.55\linewidth]{./img/_vision_transformer.pdf}
+\includegraphics[width=0.55\linewidth]{./img/_vision_transformer.jpg}
 \end{figure}
 \begin{remark}
@@ -431,7 +431,7 @@
 \item The first embedding projection $W_E$ for RGB images shows a behavior similar to convolutions, as its filters tend to recognize edges and color variations.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.45\linewidth]{./img/_vit_projection_rgb.pdf}
+\includegraphics[width=0.45\linewidth]{./img/_vit_projection_rgb.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 Visualization of the columns of the patches linear projection matrix $W_E$. Each column has shape $3P^2$ and can be reshaped into a $3 \times P \times P$ image.
@@ -442,7 +442,7 @@
 \item The learned positional embeddings are able to encode information about the row and column positioning of the patches.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.33\linewidth]{./img/_vit_embedding_similarity.pdf}
+\includegraphics[width=0.33\linewidth]{./img/_vit_embedding_similarity.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 Cosine similarity of the positional encoding of each patch compared to all the others
@@ -453,7 +453,7 @@
 \item Attention heads at the lower layers attend to positions both near the patch and far from it. Higher layers, as with convolutions, attend to distant patches.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.33\linewidth]{./img/_vit_head_distance.pdf}
+\includegraphics[width=0.33\linewidth]{./img/_vit_head_distance.jpg}
 \caption{
 Mean attention distance of the heads of ViT-large/16
 }
@@ -462,7 +462,7 @@
 \item On ImageNet top-1 accuracy, ViT outperforms a large ResNet only when pre-trained on a large dataset.
 \begin{figure}[H]
 \centering
-\includegraphics[width=0.4\linewidth]{./img/_vit_results.pdf}
+\includegraphics[width=0.4\linewidth]{./img/_vit_results.jpg}
 \caption{
 \parbox[t]{0.7\linewidth}{
 ImageNet top-1 accuracy with different pre-training datasets. BiT represents ResNet (two variants).