Add ML4CV DETR + kinect

This commit is contained in:
2024-10-28 21:34:39 +01:00
parent 17ef6f3c8d
commit e9b9b4835c
15 changed files with 279 additions and 7 deletions

7 binary image files added (previews not shown).

View File

@ -1,4 +1,5 @@
\documentclass[11pt]{ainotes}
\usepackage{appendix}
\title{Machine Learning for Computer Vision}
\date{2024 -- 2025}
@ -12,5 +13,6 @@
\include{./sections/_architectures.tex}
\include{./sections/_transformers.tex}
\include{./sections/_object_detection.tex}
\include{./sections/_segmentation.tex}
\end{document}

View File

@ -644,7 +644,7 @@
\end{remark}
\subsection{Multi-scale detectors} \label{sec:multiscale_detector}
\begin{description}
\item[Image pyramid multi-scale detection] \marginnote{Image pyramid multi-scale detection}
@ -1012,11 +1012,9 @@
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_object_detection_map_speed_plot.pdf}
\caption{
mAP -- speed comparison of the various object detection approaches
}
\end{figure}
@ -1028,7 +1026,7 @@
Method based on two ideas:
\begin{itemize}
\item Use transformers to predict a set of objects in a single pass (i.e., solve a set prediction problem).
\item Use the Hungarian (i.e., bipartite matching) loss as set prediction loss that forces a unique matching between predictions and ground-truths.
\end{itemize}
\begin{description}
@ -1043,5 +1041,155 @@
In object detection, there are no other generated output tokens to feed into the decoder for parallel decoding. Therefore, only the learned positional encodings are used.
\end{remark}
\end{description}
\item[Architecture]
Given an input image $I$ of size $H \times W \times 3$, DETR does the following:
\begin{enumerate}
\item Pass the input $I$ through a CNN feature extractor to obtain an activation $A$ of shape $\frac{H}{32} \times \frac{W}{32} \times 2048$.
\item Use a $1 \times 1$ convolution to adjust the number of channels of $A$ from $2048$ to $d$.
\item Add positional encoding to $A$.
\item Pass the activation $A$ through the transformer encoder to obtain the keys and values for the decoder.
\item Pass the learned object queries through the decoder to obtain the outputs $O_i$.
\item Pass each output $O_i$ through an MLP to obtain class and box predictions.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_detr_architecture.pdf}
\end{figure}
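A minimal PyTorch-style sketch of this forward pass is shown below, in the spirit of the simplified implementation reported in the DETR paper. The specific hyperparameters ($d = 256$, $100$ object queries, $8$ heads, $6$ encoder and decoder layers) and the learned 2D positional encoding are illustrative assumptions.
\begin{verbatim}
import torch
from torch import nn
from torchvision.models import resnet50

class MinimalDETR(nn.Module):
    def __init__(self, num_classes, d=256, n_queries=100, n_heads=8, n_layers=6):
        super().__init__()
        # CNN feature extractor: ResNet-50 without average pooling and classifier
        self.backbone = nn.Sequential(*list(resnet50().children())[:-2])
        self.proj = nn.Conv2d(2048, d, kernel_size=1)          # 1x1 conv: 2048 -> d channels
        self.transformer = nn.Transformer(d, n_heads, n_layers, n_layers)
        self.class_head = nn.Linear(d, num_classes + 1)        # +1 for the background class
        self.box_head = nn.Linear(d, 4)                        # normalized box (cx, cy, w, h)
        self.queries = nn.Parameter(torch.rand(n_queries, d))  # learned object queries
        # learned 2D positional encoding (one embedding per row/column index)
        self.row_embed = nn.Parameter(torch.rand(50, d // 2))
        self.col_embed = nn.Parameter(torch.rand(50, d // 2))

    def forward(self, x):
        a = self.proj(self.backbone(x))                        # (B, d, H/32, W/32)
        B, d, h, w = a.shape
        pos = torch.cat([
            self.col_embed[:w].unsqueeze(0).repeat(h, 1, 1),   # (h, w, d/2)
            self.row_embed[:h].unsqueeze(1).repeat(1, w, 1),   # (h, w, d/2)
        ], dim=-1).flatten(0, 1).unsqueeze(1)                  # (h*w, 1, d)
        src = pos + a.flatten(2).permute(2, 0, 1)              # encoder tokens: (h*w, B, d)
        tgt = self.queries.unsqueeze(1).repeat(1, B, 1)        # decoder input: (n_queries, B, d)
        out = self.transformer(src, tgt)                       # decoder outputs O_i
        return self.class_head(out).softmax(-1), self.box_head(out).sigmoid()

# probs, boxes = MinimalDETR(num_classes=91)(torch.rand(1, 3, 800, 1216))
\end{verbatim}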
\item[Hungarian loss] \marginnote{Hungarian loss}
Consider for simplicity a problem with $2$ classes (plus background). Given:
\begin{itemize}
\item $N$ predictions $\{ \hat{y}_i = (\hat{p}_i, \hat{b}_i) \}_{i=1}^N$ where $\hat{p}_i$ is the class probability distribution and $\hat{b}_i$ describes the bounding box normalized w.r.t. the image size.
\item $O$ ground-truth boxes padded to $N$ with the background class $\{ y_i = (c_i, b_i) \}_{i=1}^O \cup \{ y_i = \varnothing \}_{i=O+1}^{N}$ where $c_i$ is the class, $b_i$ describes the bounding box normalized w.r.t. the image size, and $\varnothing$ represents the background class.
\end{itemize}
The Hungarian loss is defined in two steps:
\begin{enumerate}
\item Solve the bipartite matching problem of finding the optimal permutation $\sigma^*$ that associates each prediction to a unique ground-truth box while minimizing the matching loss $\mathcal{L}_\text{match}$ defined as follows:
\[
\mathcal{L}_\text{match}(\hat{y}_i, y_j) = \begin{cases}
-\hat{p}_i[c_j] + \mathcal{L}_\text{box}(\hat{b}_i, b_j) & \text{if $c_j \neq \varnothing$} \\
0 & \text{otherwise}
\end{cases}
\]
where $\mathcal{L}_\text{box}$ is a loss based on the linear combination of the Huber loss and IoU.
The overall problem is the following:
\[ \sigma^* = \arg\min_\sigma \sum_{i=1}^{N} \mathcal{L}_\text{match}(\hat{y}_{\sigma(i)}, y_i) \]
\item Given the optimal permutation $\sigma^*$, compute the loss as:
\[ \mathcal{L}_\text{hungarian}(\hat{y}, y) = \sum_{i=1}^{N} \left( - \ln\left( \hat{p}_{\sigma^*(i)}[c_i] \right) + \mathbbm{1}[c_i \neq \varnothing] \mathcal{L}_\text{box}(\hat{b}_{\sigma^*(i)}, b_i) \right) \]
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/hungarian_loss.png}
\caption{
Possible permutations and optimal permutation (in orange).
}
\end{figure}
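A NumPy/SciPy sketch of these two steps is shown below. \texttt{box\_loss} is a generic placeholder for $\mathcal{L}_\text{box}$, and representing the background class by the index \texttt{bg\_class} is an assumption.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def hungarian_loss(pred_probs, pred_boxes, gt_classes, gt_boxes, box_loss, bg_class):
    """pred_probs: (N, C+1), pred_boxes: (N, 4), gt_classes: (O,), gt_boxes: (O, 4)."""
    N, O = pred_probs.shape[0], gt_classes.shape[0]
    # Step 1: matching cost between every prediction i and real ground-truth j
    cost = -pred_probs[:, gt_classes]                       # (N, O): term -p_i[c_j]
    for i in range(N):
        for j in range(O):
            cost[i, j] += box_loss(pred_boxes[i], gt_boxes[j])
    pred_idx, gt_idx = linear_sum_assignment(cost)          # optimal matching sigma*
    # Step 2: loss under the optimal matching
    target = np.full(N, bg_class)                           # unmatched predictions -> background
    target[pred_idx] = gt_classes[gt_idx]
    loss = -np.log(pred_probs[np.arange(N), target]).sum()  # classification term for all N
    for p, g in zip(pred_idx, gt_idx):                      # box term only for matched pairs
        loss += box_loss(pred_boxes[p], gt_boxes[g])
    return loss
\end{verbatim}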
\end{description}
\end{description}
\begin{remark}
Results show that Faster R-CNN + FPN performs better on smaller objects, while DETR performs best on larger objects.
\end{remark}
\begin{remark}[Visualization]
By analyzing the main components of DETR, the following can be observed:
\begin{descriptionlist}
\item[Encoder] The encoder tends to solve a segmentation problem (i.e., determine what the object is).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_encoder.png}
\caption{
\parbox[t]{0.75\linewidth}{Self-attention maps of some pixels at the last encoder layer. Yellow tiles indicate the patches that the analyzed pixel attends to.}
}
\end{figure}
\item[Decoder] The decoder tends to attend to object boundaries (i.e., determine where the object is).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_decoder.png}
\caption{
\parbox[t]{0.75\linewidth}{Decoder attention. Highlighted areas have a higher attention weight.}
}
\end{figure}
\item[Object query] Each object query tends to specialize in recognizing objects in specific areas.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_object_query.png}
\caption{
\parbox[t]{0.75\linewidth}{Position of the predictions of each object query. Green dots represent small boxes, red large horizontal boxes, and blue large vertical boxes.}
}
\end{figure}
\end{descriptionlist}
\end{remark}
\end{description}
\begin{subappendices}
\section{EfficientDet}
\begin{remark}
When working with object detection, there are many options to scale a model:
\begin{descriptionlist}
\item[Backbone] Change the CNN to refine the input image.
\item[Image resolution] Change the input image resolution (and consequently the resolution of the feature maps produced by the CNN).
\item[Multi-scale feature representation] Change the architecture of the multi-scale detector.
\item[Detector head] Change the classification and regression heads.
\end{descriptionlist}
\end{remark}
\begin{description}
\item[EfficientDet] \marginnote{EfficientDet}
Similarly to MobileNet and EfficientNet, EfficientDet uses a compound scaling coefficient $\phi$ to scale up the model (using heuristic rules).
\begin{remark}
Before EfficientDet, other multi-scale feature representations had been introduced:
\begin{descriptionlist}
\item[FPN]
As described in \Cref{sec:multiscale_detector}.
\item[PANet] \marginnote{PANet}
Based on FPN with an additional bottom-up path to merge higher resolution features with coarser ones.
\item[NAS-FPN] \marginnote{NAS-FPN}
Base block found using neural architecture search. Multiple blocks are repeated to obtain the multi-scale features.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/multiscale_comparison.png}
\end{figure}
\end{remark}
\begin{description}
\item[Weighted bi-directional feature pyramid network (BiFPN)] \marginnote{Weighted bi-directional feature pyramid network (BiFPN)}
Architecture to represent multi-scale features with the following characteristics:
\begin{itemize}
\item Applied as repeated blocks (i.e., as in NAS-FPN).
\item Based on PANet with the following changes:
\begin{itemize}
\item Activations generated from a single parent activation are removed and the connections are adjusted accordingly.
\item Add a connection between the input and output nodes of a block.
\item Use depth-wise separable convolutions.
\item Add learnable weights to weigh features at different resolutions.
\end{itemize}
The output $P_i^\text{out}$ at the $i$-th scale is therefore computed as follows (a code sketch of this weighted fusion is given after this list):
\[
\begin{split}
P_i^\text{out} &= \texttt{conv}\left( \frac{w_{i, 1} P_i^\text{in} + w_{i, 2} P_i^\text{top-down} + w_{i, 3} \texttt{downsample}(P_{i-1}^\text{out})}{w_{i, 1} + w_{i, 2} + w_{i, 3} + \varepsilon} \right) \\
P_i^\text{top-down} &= \texttt{conv}\left( w_{i, 1}' P_i^\text{in} + w_{i+1, \text{td}} \texttt{upsample}(P_{i+1}^\text{top-down}) \right)
\end{split}
\]
\begin{figure}[H]
\centering
\includegraphics[width=0.15\linewidth]{./img/bifpn.png}
\end{figure}
\end{itemize}
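The sketch below illustrates the weighted fusion at one scale as a PyTorch module. The ReLU used to keep the weights non-negative, the value of $\varepsilon$, and the exact layout of the depth-wise separable convolution are assumptions for illustration.
\begin{verbatim}
import torch
import torch.nn.functional as F
from torch import nn

class FastNormalizedFusion(nn.Module):
    """Weighted fusion of feature maps already resized to a common resolution."""
    def __init__(self, n_inputs, channels, eps=1e-4):
        super().__init__()
        self.w = nn.Parameter(torch.ones(n_inputs))   # one learnable weight per input
        self.eps = eps
        # depth-wise separable convolution: depth-wise 3x3 followed by point-wise 1x1
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, groups=channels),
            nn.Conv2d(channels, channels, 1),
        )

    def forward(self, inputs):
        w = F.relu(self.w)                            # keep the weights non-negative
        fused = sum(w[i] * x for i, x in enumerate(inputs)) / (w.sum() + self.eps)
        return self.conv(fused)

# Example: P_i_out = FastNormalizedFusion(3, 256)([p_in, p_td, p_prev_downsampled])
\end{verbatim}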
\end{description}
\end{description}
\end{subappendices}

View File

@ -0,0 +1,122 @@
\chapter{Segmentation}
\section{Semantic segmentation}
\begin{description}
\item[Semantic segmentation] \marginnote{Semantic segmentation}
Given an input image, output a category for each pixel.
\item[Pixel-wise IoU] \marginnote{Pixel-wise IoU}
IoU generalized to pixel-wise segmentation masks. Given a class $c$, the ground-truths $y^{(i)}$, and predictions $\hat{y}^{(i)}$, the IoU w.r.t. $c$ is computed as follows:
\[
\begin{gathered}
TP_c = \sum_{i} \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \land \hat{y}_{(u, v)}^{(i)} = c \right\} \right| \\
\texttt{IoU}_c = \frac{TP_c}{\sum_{i} \left( \left| \left\{ (u, v) : \hat{y}_{(u, v)}^{(i)} = c \right\} \right| + \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \right\} \right| \right) - TP_c}
\end{gathered}
\]
The mean IoU is computed as:
\[ \texttt{mIoU} = \frac{1}{C} \sum_{c=1}^{C} \texttt{IoU}_c \]
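A minimal NumPy sketch of these definitions is shown below; skipping classes that never appear (via NaN) when averaging is an assumption.
\begin{verbatim}
import numpy as np

def mean_iou(preds, gts, num_classes):
    """preds, gts: lists of (H, W) integer label maps with classes 0 .. num_classes-1."""
    ious = []
    for c in range(num_classes):
        tp = pred_c = gt_c = 0
        for y_hat, y in zip(preds, gts):           # accumulate over the dataset images i
            tp += np.sum((y_hat == c) & (y == c))  # TP_c
            pred_c += np.sum(y_hat == c)
            gt_c += np.sum(y == c)
        union = pred_c + gt_c - tp
        ious.append(tp / union if union > 0 else np.nan)
    return np.nanmean(ious)                        # mIoU: average of IoU_c over the classes
\end{verbatim}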
\end{description}
\subsection{Kinect human pose estimation}
\begin{description}
\item[Human pose detection] \marginnote{Human pose detection}
Task of detecting the position and orientation of a person.
\item[Pipeline]
Kinect pose detection is done in three phases:
\begin{enumerate}
\item Capture a depth image and remove the background to obtain the depth map of a person.
\item Classify each pixel into a body part.
\item Determine the position of the joints (i.e., skeleton) by finding local modes (i.e., center of mass).
\end{enumerate}
\item[Synthetic annotated data] \marginnote{Synthetic annotated data}
The data used to train the model is artificially generated. $100$k poses were captured using motion capture devices. Then, different mock-up body models (for which the ground-truth is known) were simulated and recorded through a virtual camera with the same intrinsic parameters as the Kinect camera. This provides robustness to different body and clothing shapes.
\begin{remark}
The workflow of the original paper consists of iteratively training the model and creating new training data for poses that the current model struggles with.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/motion_data.png}
\end{figure}
\item[Depth comparison features] \marginnote{Depth comparison features}
Given a depth image $D$ and the offsets $\theta = (\Delta p, \Delta n)$, each pixel $x$ of $D$ produces a feature as follows:
\[ f(x; D, (\Delta p, \Delta n)) = D\left[ x + \frac{\Delta p}{D[x]} \right] - D\left[ x + \frac{\Delta n}{D[x]} \right] \]
In other words, each $x$ is described by the difference in depth between two points offset from $x$. The depth at background pixels is set to a large positive constant.
\begin{remark}
As real-time processing is required, this approach makes it possible to quickly compute features that discriminate between body parts. However, it does not always produce a correct response.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_depth_comparison_features.pdf}
\caption{Examples of feature computation}
\end{figure}
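A minimal NumPy sketch of the feature computation is shown below; the depth value used for background and out-of-image probes and the rounding of pixel coordinates are assumptions.
\begin{verbatim}
import numpy as np

LARGE_DEPTH = 1e6   # assumed depth for background / out-of-image probes

def depth_feature(D, x, delta_p, delta_n):
    """Feature at pixel x = (u, v) of depth image D, with 2D offsets delta_p and delta_n."""
    def probe(offset):
        # depth-invariant offset: divide the offset by the depth at x
        u, v = np.rint(np.asarray(x) + np.asarray(offset) / D[x]).astype(int)
        if 0 <= u < D.shape[0] and 0 <= v < D.shape[1]:
            return D[u, v]
        return LARGE_DEPTH
    return probe(delta_p) - probe(delta_n)
\end{verbatim}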
\begin{description}
\item[Depth-invariant offsets]
Dividing the offsets by the depth $D[x]$ makes them depth-invariant.
Consider two objects at different depths. Let $f$ be the focal length of the camera and $o_w$ the world offset we want to apply. As the depth $d$ changes, the corresponding offset in the image plane $o_{di}$ changes according to the rules of perspective projection:
\[ o_{di} : f = o_w : d \,\Rightarrow\, o_{di} = \frac{o_w f}{d} = \frac{\Delta p}{d} \]
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_depth_invariant_offset.pdf}
\end{figure}
\end{description}
\item[Decision tree] \marginnote{Decision tree}
Depth comparison features are used to train decision trees.
\begin{remark}
Decision trees are unstable classifiers: they are able to achieve good performance, but their structure changes significantly if the training data is slightly perturbed (i.e., they have high variance).
\end{remark}
\begin{description}
\item[Random forest] \marginnote{Random forest}
Ensemble of $N$ decision trees that aims to reduce variance by averaging their predictions.
\begin{remark}
For a random forest to be effective, its decision trees should be uncorrelated so that, when averaging, the average of their errors tends to $0$.
\end{remark}
\begin{description}
\item[Bootstrap aggregating (bagging)] \marginnote{Bootstrap aggregating (bagging)}
Train each decision tree using a replica of the training set obtained by sampling with replacement.
\begin{figure}[H]
\raggedleft
\includegraphics[width=0.7\linewidth]{./img/_random_forest_bagging.pdf}
\end{figure}
\item[Random splitting] \marginnote{Random splitting}
Even though bagging reduces variance, if there is a subset of particularly predictive features, the resulting trees will be correlated.
To avoid this, in a random forest, tree splitting is done on a different random subset of features each time.
\begin{figure}[H]
\raggedleft
\includegraphics[width=0.7\linewidth]{./img/_random_forest_random_splitting.pdf}
\end{figure}
\end{description}
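As an illustration of bagging and random splitting in practice, a minimal scikit-learn sketch is shown below. The toy data (random features standing in for depth comparison features and $31$ body-part labels) and the specific hyperparameters are assumptions.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Toy stand-ins for per-pixel depth comparison features and body-part labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 50))       # (n_pixels, n_features)
y = rng.integers(0, 31, size=1000)    # body-part label of each pixel

forest = RandomForestClassifier(
    n_estimators=100,       # ensemble of 100 decision trees
    bootstrap=True,         # bagging: each tree sees a bootstrap replica of the training set
    max_features="sqrt",    # random splitting: each split considers a random subset of features
    n_jobs=-1,              # trees are trained (and evaluated) in parallel
)
forest.fit(X, y)
probs = forest.predict_proba(X)       # per-pixel class probabilities (averaged over the trees)
\end{verbatim}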
\begin{remark}
Random forests are:
\begin{itemize}
\item Fast and parallelizable in both training and inference.
\item Robust to hyperparameter changes.
\item Interpretable.
\end{itemize}
\end{remark}
\end{description}
\end{description}