Add ML4CV ViT + object detection metrics
@ -11,5 +11,6 @@
\input{./sections/_optimizers.tex}
\input{./sections/_architectures.tex}
\input{./sections/_transformers.tex}
\input{./sections/_object_detection.tex}

\end{document}
@ -155,7 +155,7 @@ Network with bottleneck-block-inspired inception modules.
\end{figure}

\graphicspath{../}
\graphicspath{{.}}
@ -0,0 +1,90 @@

\chapter{Object detection}

\begin{description}
\item[Object detection] \marginnote{Object detection}
Given an RGB $W \times H$ image, determine a set of objects $\{ o_1, \dots, o_n \}$ contained in it. Each object $o_j$ is described by:
\begin{itemize}
\item A category $c_j \in \{ 1, \dots, C \}$ as in image classification.
\item A bounding box $BB_j = [ x_j, y_j, w_j, h_j ]$ where $x_j, w_j \in [0, W-1]$ and $y_j, h_j \in [0, H-1]$. $(x_j, y_j)$ is the center and $(w_j, h_j)$ is the size of the box.
\item A confidence score $\rho_j$.
\end{itemize}
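For instance (values invented purely for illustration), in a $640 \times 480$ image a detected person could be described as $c = \text{person}$, $BB = [320, 240, 100, 200]$ (a box of width $100$ and height $200$ centered in the middle of the image) and confidence $\rho = 0.9$.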

\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_object_detection_example.pdf}
\end{figure}

\begin{remark}
Unlike image classification, an object detection model has to:
\begin{itemize}
\item Be able to output a variable number of results.
\item Output both categorical and spatial information.
\item Work on high resolution input images.
\end{itemize}
\end{remark}
\end{description}

\section{Metrics}

\begin{description}
\item[Intersection over union (\texttt{IoU})]
\marginnote{Intersection over union (\texttt{IoU})}
Measures the amount of overlap between two boxes, computed as the ratio between the area of their intersection and the area of their union:
\[ \texttt{IoU}(BB_i, BB_j) = \frac{\vert BB_i \cap BB_j \vert}{\vert BB_i \cup BB_j \vert} = \frac{\vert BB_i \cap BB_j \vert}{\vert BB_i \vert + \vert BB_j \vert - \vert BB_i \cap BB_j \vert} \]

\begin{description}
\item[True/false positive criteria]
Given a threshold $\rho_\texttt{IoU}$, a detection $BB_i$ is a true positive (\texttt{TP}) w.r.t. a ground truth $\hat{BB_j}$ if it is assigned the same class and:
\[ \texttt{IoU}(BB_i, \hat{BB_j}) > \rho_\texttt{IoU} \]
Otherwise, it is a false positive (\texttt{FP}).

\begin{remark}
The confidence score can also be taken into account when determining a match, through a threshold $\rho_\text{min}$ below which detections are discarded.
\end{remark}
\end{description}
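
As a concrete numeric check (box areas chosen arbitrarily for illustration):
\begin{example}
Consider a ground truth box of area $100$ and a detection of the same class with area $80$, whose intersection with the ground truth has area $50$. Then:
\[ \texttt{IoU} = \frac{50}{100 + 80 - 50} = \frac{50}{130} \approx 0.38 \]
With the commonly used threshold $\rho_\texttt{IoU} = 0.5$ the detection is a false positive, while with $\rho_\texttt{IoU} = 0.3$ it would be a true positive.
\end{example}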

\item[Recall]
Measures the fraction of ground truth objects that have been found:
\[ \texttt{recall} = \frac{\vert \texttt{TP} \vert}{\vert \text{ground truth boxes} \vert} \]

\item[Precision]
Measures the fraction of correct detections among all the predictions:
\[ \texttt{precision} = \frac{\vert \texttt{TP} \vert}{\vert \text{model detections} \vert} \]

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/obj_det_recall_precision.png}
\caption{
Recall and precision in different scenarios
}
\end{figure}
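
As a small worked example (counts invented purely for illustration):
\begin{example}
Suppose an image contains $5$ ground truth objects and the detector outputs $4$ boxes, $3$ of which match a ground truth according to the true positive criteria. Then:
\[ \texttt{recall} = \frac{3}{5} = 0.6 \qquad \texttt{precision} = \frac{3}{4} = 0.75 \]
\end{example}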

\item[Precision-recall curve]
Plot that relates precision and recall as the confidence threshold $\rho_\text{min}$ varies.

\begin{example}
Consider the following image and the bounding boxes found by a detector:
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve1.pdf}
\caption{
\parbox[t]{0.6\linewidth}{
Ground truth (yellow boxes) and predictions (orange boxes) with their confidence score
}
}
\end{figure}

By sorting the detections by confidence score, it is possible to plot the precision-recall curve by varying the threshold $\rho_\text{min}$:
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve2.pdf}
\end{figure}

\indenttbox
\begin{remark}
As the threshold $\rho_\text{min}$ increases, recall is monotonically non-increasing, while precision can both decrease and increase.
\end{remark}
\end{example}
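
The construction of the curve can be made explicit with a small worked example (confidence scores and matches invented purely for illustration). Detections are sorted by decreasing confidence and, for each value of $\rho_\text{min}$, recall and precision are recomputed:
\begin{example}
With $3$ ground truth boxes and three detections with confidence $0.9$ (\texttt{TP}), $0.8$ (\texttt{FP}) and $0.6$ (\texttt{TP}):
\begin{center}
\small
\begin{tabular}{ccc}
\toprule
$\rho_\text{min}$ & \texttt{recall} & \texttt{precision} \\
\midrule
$0.85$ & $1/3$ & $1/1$ \\
$0.7$ & $1/3$ & $1/2$ \\
$0.5$ & $2/3$ & $2/3$ \\
\bottomrule
\end{tabular}
\end{center}
Each row is one point of the precision-recall curve.
\end{example}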
\end{description}
@ -400,4 +400,82 @@
\centering
\includegraphics[width=0.55\linewidth]{./img/_vision_transformer.pdf}
\end{figure}

\begin{remark}
Unlike convolutional neural networks, where convolutions are the major source of FLOPs, in ViT the number of FLOPs heavily depends on the length of the input sequence, due to the quadratic complexity of the attention mechanism.
\end{remark}
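
As a rough illustration of this dependence (assuming the standard $224 \times 224$ input resolution):
\begin{example}
With $16 \times 16$ patches, a $224 \times 224$ image yields $(224/16)^2 = 196$ patches, i.e. $197$ tokens including the class token, so each attention head computes a $197 \times 197$ attention matrix. Halving the patch size to $8 \times 8$ gives $784$ patches ($785$ tokens) and increases the cost of attention by a factor of about $(785/197)^2 \approx 16$.
\end{example}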

\item[ViT variants]
The main size-wise variants of ViT are the following:
\begin{center}
\small
\begin{tabular}{cccccc}
\toprule
\textbf{Model} & \textbf{Layers} & \textbf{Heads} & \textbf{Hidden size} & \textbf{MLP size} & \textbf{Parameters} \\
\midrule
ViT-base & 12 & 12 & 768 & 3072 & 86 M \\
ViT-large & 24 & 16 & 1024 & 4096 & 307 M \\
ViT-huge & 32 & 16 & 1280 & 5120 & 632 M \\
\bottomrule
\end{tabular}
\end{center}
Note that, by convention, the MLP size is four times the hidden size.
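
As a rough sanity check of the parameter counts (a back-of-the-envelope estimate that ignores embeddings, biases and normalization layers): with hidden size $d$, each encoder layer has about $4d^2$ parameters in the attention projections (queries, keys, values and output) and $8d^2$ in the MLP, i.e. $12d^2$ per layer. For ViT-base this gives $12 \cdot 12 \cdot 768^2 \approx 85$ M parameters, consistent with the table above.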

Moreover, ViT models can also vary depending on the size of the input patch.

The overall notation to denote model size and patch size is ViT-\texttt{<size>}/\texttt{<patch size>}.
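For instance, ViT-L/16 denotes ViT-large with $16 \times 16$ patches. Note that, for an RGB image, each patch is flattened to a vector with $3P^2$ components (e.g. $3 \cdot 16^2 = 768$ for $P = 16$) before being linearly projected to the hidden size.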

\item[Results]
The main experimental observations and results using vision transformers are the following:
\begin{itemize}
\item The first embedding projection $W_E$ for RGB images shows a behavior similar to convolutions, as its learned filters tend to recognize edges and color variations.
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_vit_projection_rgb.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
Visualization of the columns of the patches linear projection matrix $W_E$. Each column has $3P^2$ components and can be reshaped into a $3 \times P \times P$ image.
}
}
\end{figure}

\item The learned positional embeddings are able to encode information about the row and column positioning of the patches.
\begin{figure}[H]
\centering
\includegraphics[width=0.33\linewidth]{./img/_vit_embedding_similarity.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
Cosine similarity of the positional encoding of each patch compared to all the others
}
}
\end{figure}

\item Attention heads in the lower layers attend both to positions close to the patch and to distant ones. In the higher layers, as happens with the deeper layers of convolutional networks, heads mostly attend to distant patches.
\begin{figure}[H]
\centering
\includegraphics[width=0.33\linewidth]{./img/_vit_head_distance.pdf}
\caption{
Mean attention distance of the heads of ViT-large/16
}
\end{figure}

\item On ImageNet top-1 accuracy, ViT outperforms a large ResNet only when pre-trained on a large dataset.
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_vit_results.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
ImageNet top-1 accuracy with different pre-training datasets. BiT represents ResNet (two variants).
}
}
\end{figure}
\end{itemize}

\begin{remark}
A fair comparison between convolutional neural networks and vision transformers is not straightforward.
\end{remark}

\begin{remark}
From an execution-efficiency point of view, currently common inference hardware is better optimized for convolutions than for attention.
\end{remark}
\end{description}