Add ML4CV DETR + kinect

This commit is contained in:
2024-10-28 21:34:39 +01:00
parent 17ef6f3c8d
commit e9b9b4835c
15 changed files with 279 additions and 7 deletions

7 binary image files added (previews not shown).

View File

@ -1,4 +1,5 @@
\documentclass[11pt]{ainotes}
\usepackage{appendix}
\title{Machine Learning for Computer Vision}
\date{2024 -- 2025}
@ -12,5 +13,6 @@
\include{./sections/_architectures.tex}
\include{./sections/_transformers.tex}
\include{./sections/_object_detection.tex}
\include{./sections/_segmentation.tex}
\end{document}

View File

@ -644,7 +644,7 @@
\end{remark}
\subsection{Multi-scale detectors} \label{sec:multiscale_detector}
\begin{description}
\item[Image pyramid multi-scale detection] \marginnote{Image pyramid multi-scale detection}
@ -1012,11 +1012,9 @@
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_object_detection_map_speed_plot.pdf}
\caption{
mAP -- speed comparison of the various object detection approaches
}
\end{figure}
@ -1028,7 +1026,7 @@
Method based on two ideas:
\begin{itemize}
\item Use transformers to predict a set of objects in a single pass (i.e., solve a set prediction problem).
\item Use the Hungarian (i.e., bipartite matching) loss as set prediction loss that forces a unique matching between predictions and ground-truths.
\end{itemize}
\begin{description}
@ -1043,5 +1041,155 @@
In object detection, there are no other generated output tokens to feed into the decoder for parallel decoding. Therefore, only the learned positional encodings are used.
\end{remark}
\end{description}
\item[Architecture]
Given an input image $I$ of size $H \times W \times 3$, DETR does the following:
\begin{enumerate}
\item Pass the input $I$ through a CNN feature extractor to obtain an activation $A$ of shape $\frac{H}{32} \times \frac{W}{32} \times 2048$.
\item Use a $1 \times 1$ convolution to adjust the number of channels of $A$ from $2048$ to $d$.
\item Add positional encoding to $A$.
\item Pass the activation $A$ through the transformer encoder to obtain the keys and values for the decoder.
\item Pass the learned object queries through the decoder to obtain the outputs $O_i$.
\item Pass each output $O_i$ through an MLP to obtain class and box predictions.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_detr_architecture.pdf}
\end{figure}
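A minimal PyTorch-style sketch of this forward pass is shown below, in the spirit of the simplified implementation reported in the DETR paper. The specific hyperparameters ($d = 256$, $100$ object queries, $8$ heads, $6$ encoder and decoder layers) and the learned 2D positional encoding are illustrative assumptions.
\begin{verbatim}
import torch
from torch import nn
from torchvision.models import resnet50

class MinimalDETR(nn.Module):
    def __init__(self, num_classes, d=256, n_queries=100, n_heads=8, n_layers=6):
        super().__init__()
        # CNN feature extractor: ResNet-50 without average pooling and classifier
        self.backbone = nn.Sequential(*list(resnet50().children())[:-2])
        self.proj = nn.Conv2d(2048, d, kernel_size=1)          # 1x1 conv: 2048 -> d channels
        self.transformer = nn.Transformer(d, n_heads, n_layers, n_layers)
        self.class_head = nn.Linear(d, num_classes + 1)        # +1 for the background class
        self.box_head = nn.Linear(d, 4)                        # normalized box (cx, cy, w, h)
        self.queries = nn.Parameter(torch.rand(n_queries, d))  # learned object queries
        # learned 2D positional encoding (one embedding per row/column index)
        self.row_embed = nn.Parameter(torch.rand(50, d // 2))
        self.col_embed = nn.Parameter(torch.rand(50, d // 2))

    def forward(self, x):
        a = self.proj(self.backbone(x))                        # (B, d, H/32, W/32)
        B, d, h, w = a.shape
        pos = torch.cat([
            self.col_embed[:w].unsqueeze(0).repeat(h, 1, 1),   # (h, w, d/2)
            self.row_embed[:h].unsqueeze(1).repeat(1, w, 1),   # (h, w, d/2)
        ], dim=-1).flatten(0, 1).unsqueeze(1)                  # (h*w, 1, d)
        src = pos + a.flatten(2).permute(2, 0, 1)              # encoder tokens: (h*w, B, d)
        tgt = self.queries.unsqueeze(1).repeat(1, B, 1)        # decoder input: (n_queries, B, d)
        out = self.transformer(src, tgt)                       # decoder outputs O_i
        return self.class_head(out).softmax(-1), self.box_head(out).sigmoid()

# probs, boxes = MinimalDETR(num_classes=91)(torch.rand(1, 3, 800, 1216))
\end{verbatim}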
\item[Hungarian loss] \marginnote{Hungarian loss}
Consider for simplicity a problem with $2$ classes (plus background). Given:
\begin{itemize}
\item $N$ predictions $\{ \hat{y}_i = (\hat{p}_i, \hat{b}_i) \}_{i=1}^N$ where $\hat{p}_i$ is the class probability distribution and $\hat{b}_i$ describes the bounding box normalized w.r.t. the image size.
\item $O$ ground-truth boxes padded to $N$ with the background class $\{ y_i = (c_i, b_i) \}_{i=1}^O \cup \{ y_i = \varnothing \}_{i=O+1}^{N}$ where $c_i$ is the class, $b_i$ describes the bounding box normalized w.r.t. the image size, and $\varnothing$ represents the background class.
\end{itemize}
The Hungarian loss is defined in two steps:
\begin{enumerate}
\item Solve the bipartite matching problem of finding the optimal permutation $\sigma^*$ that associates each prediction to a unique ground-truth box while minimizing the matching loss $\mathcal{L}_\text{match}$ defined as follows:
\[
\mathcal{L}_\text{match}(\hat{y}_i, y_j) = \begin{cases}
-\hat{p}_i[c_j] + \mathcal{L}_\text{box}(\hat{b}_i, b_j) & \text{if $c_j \neq \varnothing$} \\
0 & \text{otherwise}
\end{cases}
\]
where $\mathcal{L}_\text{box}$ is a loss based on the linear combination of the Huber loss and IoU.
The overall problem is the following:
\[ \sigma^* = \arg\min_\sigma \sum_{i=1}^{N} \mathcal{L}_\text{match}(\hat{y}_{\sigma(i)}, y_i) \]
\item Given the optimal permutation $\sigma^*$, compute the loss as:
\[ \mathcal{L}_\text{hungarian}(\hat{y}, y) = \sum_{i=1}^{N} \left( - \ln\left( \hat{p}_{\sigma^*(i)}[c_i] \right) + \mathbbm{1}[c_i \neq \varnothing] \mathcal{L}_\text{box}(\hat{b}_{\sigma^*(i)}, b_i) \right) \]
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/hungarian_loss.png}
\caption{
Possible permutations and optimal permutation (in orange).
}
\end{figure}
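A NumPy/SciPy sketch of these two steps is shown below. \texttt{box\_loss} is a generic placeholder for $\mathcal{L}_\text{box}$, and representing the background class by the index \texttt{bg\_class} is an assumption.
\begin{verbatim}
import numpy as np
from scipy.optimize import linear_sum_assignment

def hungarian_loss(pred_probs, pred_boxes, gt_classes, gt_boxes, box_loss, bg_class):
    """pred_probs: (N, C+1), pred_boxes: (N, 4), gt_classes: (O,), gt_boxes: (O, 4)."""
    N, O = pred_probs.shape[0], gt_classes.shape[0]
    # Step 1: matching cost between every prediction i and real ground-truth j
    cost = -pred_probs[:, gt_classes]                       # (N, O): term -p_i[c_j]
    for i in range(N):
        for j in range(O):
            cost[i, j] += box_loss(pred_boxes[i], gt_boxes[j])
    pred_idx, gt_idx = linear_sum_assignment(cost)          # optimal matching sigma*
    # Step 2: loss under the optimal matching
    target = np.full(N, bg_class)                           # unmatched predictions -> background
    target[pred_idx] = gt_classes[gt_idx]
    loss = -np.log(pred_probs[np.arange(N), target]).sum()  # classification term for all N
    for p, g in zip(pred_idx, gt_idx):                      # box term only for matched pairs
        loss += box_loss(pred_boxes[p], gt_boxes[g])
    return loss
\end{verbatim}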
\end{description}
\end{description}
\begin{remark}
Results show that Faster R-CNN + FPN performs better on smaller objects, while DETR performs best on larger objects.
\end{remark}
\begin{remark}[Visualization]
By analyzing the main components of DETR, the following can be observed:
\begin{descriptionlist}
\item[Encoder] The encoder tends to solve a segmentation problem (i.e., determine what the object is).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_encoder.png}
\caption{
\parbox[t]{0.75\linewidth}{Self-attention maps of some pixels at the last encoder layer. Yellow tiles indicate the patches that the analyzed pixel attends to.}
}
\end{figure}
\item[Decoder] The decoder tends to attend to object boundaries (i.e., determine where the object is).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_decoder.png}
\caption{
\parbox[t]{0.75\linewidth}{Decoder attention. Highlighted areas have a higher attention weight.}
}
\end{figure}
\item[Object query] Each object query tends to specialize in recognizing objects in specific areas.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/detr_object_query.png}
\caption{
\parbox[t]{0.75\linewidth}{Position of the predictions of each object query. Green dots represent small boxes, red large horizontal boxes, and blue large vertical boxes.}
}
\end{figure}
\end{descriptionlist}
\end{remark}
\end{description}
\begin{subappendices}
\section{EfficientDet}
\begin{remark}
When working with object detection, there are many options to scale a model:
\begin{descriptionlist}
\item[Backbone] Change the CNN to refine the input image.
\item[Image resolution] Change the input image resolution (and consequently the resolution of the feature maps produced by the CNN).
\item[Multi-scale feature representation] Change the architecture of the multi-scale detector.
\item[Detector head] Change the classification and regression heads.
\end{descriptionlist}
\end{remark}
\begin{description}
\item[EfficientDet] \marginnote{EfficientDet}
Similarly to MobileNet and EfficientNet, EfficientDet uses a compound scaling coefficient $\phi$ to scale up the model (using heuristic rules).
\begin{remark}
Before EfficientDet, other multi-scale feature representations had been introduced:
\begin{descriptionlist}
\item[FPN]
As described in \Cref{sec:multiscale_detector}.
\item[PANet] \marginnote{PANet}
Based on FPN with an additional bottom-up path to merge higher resolution features with coarser ones.
\item[NAS-FPN] \marginnote{NAS-FPN}
Base block found using neural architecture search. Multiple blocks are repeated to obtain the multi-scale features.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/multiscale_comparison.png}
\end{figure}
\end{remark}
\begin{description}
\item[Weighted bi-directional feature pyramid network (BiFPN)] \marginnote{Weighted bi-directional feature pyramid network (BiFPN)}
Architecture to represent multi-scale features with the following characteristics:
\begin{itemize}
\item Applied as repeated blocks (i.e., as in NAS-FPN).
\item Based on PANet with the following changes:
\begin{itemize}
\item Activations generated from a single parent activation are removed and the connections are adjusted accordingly.
\item Add a connection between the input and output nodes of a block.
\item Use depth-wise separable convolutions.
\item Add learnable weights to weigh features at different resolutions.
\end{itemize}
The output $P_i^\text{out}$ at the $i$-th scale is therefore computed as follows (a code sketch of this weighted fusion is given after this list):
\[
\begin{split}
P_i^\text{out} &= \texttt{conv}\left( \frac{w_{i, 1} P_i^\text{in} + w_{i, 2} P_i^\text{top-down} + w_{i, 3} \texttt{downsample}(P_{i-1}^\text{out})}{w_{i, 1} + w_{i, 2} + w_{i, 3} + \varepsilon} \right) \\
P_i^\text{top-down} &= \texttt{conv}\left( w_{i, 1}' P_i^\text{in} + w_{i+1, \text{td}} \texttt{upsample}(P_{i+1}^\text{top-down}) \right)
\end{split}
\]
\begin{figure}[H]
\centering
\includegraphics[width=0.15\linewidth]{./img/bifpn.png}
\end{figure}
\end{itemize}
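The sketch below illustrates the weighted fusion at one scale as a PyTorch module. The ReLU used to keep the weights non-negative, the value of $\varepsilon$, and the exact layout of the depth-wise separable convolution are assumptions for illustration.
\begin{verbatim}
import torch
import torch.nn.functional as F
from torch import nn

class FastNormalizedFusion(nn.Module):
    """Weighted fusion of feature maps already resized to a common resolution."""
    def __init__(self, n_inputs, channels, eps=1e-4):
        super().__init__()
        self.w = nn.Parameter(torch.ones(n_inputs))   # one learnable weight per input
        self.eps = eps
        # depth-wise separable convolution: depth-wise 3x3 followed by point-wise 1x1
        self.conv = nn.Sequential(
            nn.Conv2d(channels, channels, 3, padding=1, groups=channels),
            nn.Conv2d(channels, channels, 1),
        )

    def forward(self, inputs):
        w = F.relu(self.w)                            # keep the weights non-negative
        fused = sum(w[i] * x for i, x in enumerate(inputs)) / (w.sum() + self.eps)
        return self.conv(fused)

# Example: P_i_out = FastNormalizedFusion(3, 256)([p_in, p_td, p_prev_downsampled])
\end{verbatim}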
\end{description}
\end{description}
\end{subappendices}

View File

@ -0,0 +1,122 @@
\chapter{Segmentation}
\section{Semantic segmentation}
\begin{description}
\item[Semantic segmentation] \marginnote{Semantic segmentation}
Given an input image, output a category for each pixel.
\item[Pixel-wise IoU] \marginnote{Pixel-wise IoU}
IoU generalized to pixel-wise segmentation masks. Given a class $c$, the ground-truths $y^{(i)}$, and predictions $\hat{y}^{(i)}$, the IoU w.r.t. $c$ is computed as follows:
\[
\begin{gathered}
TP_c = \sum_{i} \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \land \hat{y}_{(u, v)}^{(i)} = c \right\} \right| \\
\texttt{IoU}_c = \frac{TP_c}{\sum_{i} \left( \left| \left\{ (u, v) : \hat{y}_{(u, v)}^{(i)} = c \right\} \right| + \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \right\} \right| \right) - TP_c}
\end{gathered}
\]
The mean IoU is computed as:
\[ \texttt{mIoU} = \frac{1}{C} \sum_{c=1}^{C} \texttt{IoU}_c \]
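A minimal NumPy sketch of these definitions is shown below; skipping classes that never appear (via NaN) when averaging is an assumption.
\begin{verbatim}
import numpy as np

def mean_iou(preds, gts, num_classes):
    """preds, gts: lists of (H, W) integer label maps with classes 0 .. num_classes-1."""
    ious = []
    for c in range(num_classes):
        tp = pred_c = gt_c = 0
        for y_hat, y in zip(preds, gts):           # accumulate over the dataset images i
            tp += np.sum((y_hat == c) & (y == c))  # TP_c
            pred_c += np.sum(y_hat == c)
            gt_c += np.sum(y == c)
        union = pred_c + gt_c - tp
        ious.append(tp / union if union > 0 else np.nan)
    return np.nanmean(ious)                        # mIoU: average of IoU_c over the classes
\end{verbatim}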
\end{description}
\subsection{Kinect human pose estimation}
\begin{description}
\item[Human pose detection] \marginnote{Human pose detection}
Task of detecting the position and orientation of a person.
\item[Pipeline]
Kinect pose detection is done in three phases:
\begin{enumerate}
\item Capture a depth image and remove the background to obtain the depth map of a person.
\item Classify each pixel into a body part.
\item Determine the position of the joints (i.e., skeleton) by finding local modes (i.e., center of mass).
\end{enumerate}
\item[Synthetic annotated data] \marginnote{Synthetic annotated data}
The data used to train the model is artificially generated. $100$k poses were captured using motion capture devices. Then, different mock-up body models (for which the ground-truth is known) were simulated and recorded through a virtual camera with the same intrinsic parameters as the Kinect camera. This provides robustness to different body and clothing shapes.
\begin{remark}
The workflow of the original paper consists of iteratively training the model and creating new training data for poses that the current model struggles with.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/motion_data.png}
\end{figure}
\item[Depth comparison features] \marginnote{Depth comparison features}
Given a depth image $D$ and the offsets $\theta = (\Delta p, \Delta n)$, each pixel $x$ of $D$ produces a feature as follows:
\[ f(x; D, (\Delta p, \Delta n)) = D\left[ x + \frac{\Delta p}{D[x]} \right] - D\left[ x + \frac{\Delta n}{D[x]} \right] \]
In other words, each $x$ is described by the difference in depth between two points offset from $x$. The depth at background pixels is set to a large positive constant.
\begin{remark}
As real-time processing is required, this approach makes it possible to quickly compute features that discriminate between body parts. However, it does not always produce a correct response.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_depth_comparison_features.pdf}
\caption{Examples of feature computation}
\end{figure}
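A minimal NumPy sketch of the feature computation is shown below; the depth value used for background and out-of-image probes and the rounding of pixel coordinates are assumptions.
\begin{verbatim}
import numpy as np

LARGE_DEPTH = 1e6   # assumed depth for background / out-of-image probes

def depth_feature(D, x, delta_p, delta_n):
    """Feature at pixel x = (u, v) of depth image D, with 2D offsets delta_p and delta_n."""
    def probe(offset):
        # depth-invariant offset: divide the offset by the depth at x
        u, v = np.rint(np.asarray(x) + np.asarray(offset) / D[x]).astype(int)
        if 0 <= u < D.shape[0] and 0 <= v < D.shape[1]:
            return D[u, v]
        return LARGE_DEPTH
    return probe(delta_p) - probe(delta_n)
\end{verbatim}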
\begin{description}
\item[Depth-invariant offsets]
Dividing the offsets by the depth $D[x]$ makes them depth-invariant.
Consider two objects at different depths. Let $f$ be the focal length of the camera and $o_w$ the world offset we want to apply. As the depth $d$ changes, the corresponding offset in the image plane $o_{di}$ changes according to the rules of perspective projection:
\[ o_{di} : f = o_w : d \,\Rightarrow\, o_{di} = \frac{o_w f}{d} = \frac{\Delta p}{d} \]
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_depth_invariant_offset.pdf}
\end{figure}
\end{description}
\item[Decision tree] \marginnote{Decision tree}
Depth comparison features are used to train decision trees.
\begin{remark}
Decision trees are unstable classifiers: they are able to achieve good performance, but their structure changes significantly if the training data is slightly perturbed (i.e., they have high variance).
\end{remark}
\begin{description}
\item[Random forest] \marginnote{Random forest}
Ensemble of $N$ decision trees that aims to reduce variance by averaging their predictions.
\begin{remark}
For a random forest to be effective, its decision trees should be uncorrelated so that, when averaging, the average of their errors tends to $0$.
\end{remark}
\begin{description}
\item[Bootstrap aggregating (bagging)] \marginnote{Bootstrap aggregating (bagging)}
Train each decision tree using a replica of the training set obtained by sampling with replacement.
\begin{figure}[H]
\raggedleft
\includegraphics[width=0.7\linewidth]{./img/_random_forest_bagging.pdf}
\end{figure}
\item[Random splitting] \marginnote{Random splitting}
Even though bagging reduces variance, if there is a subset of particularly predictive features, the resulting trees will be correlated.
To avoid this, in a random forest, tree splitting is done on a different random subset of features each time.
\begin{figure}[H]
\raggedleft
\includegraphics[width=0.7\linewidth]{./img/_random_forest_random_splitting.pdf}
\end{figure}
\end{description}
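As an illustration of bagging and random splitting in practice, a minimal scikit-learn sketch is shown below. The toy data (random features standing in for depth comparison features and $31$ body-part labels) and the specific hyperparameters are assumptions.
\begin{verbatim}
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Toy stand-ins for per-pixel depth comparison features and body-part labels.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 50))       # (n_pixels, n_features)
y = rng.integers(0, 31, size=1000)    # body-part label of each pixel

forest = RandomForestClassifier(
    n_estimators=100,       # ensemble of 100 decision trees
    bootstrap=True,         # bagging: each tree sees a bootstrap replica of the training set
    max_features="sqrt",    # random splitting: each split considers a random subset of features
    n_jobs=-1,              # trees are trained (and evaluated) in parallel
)
forest.fit(X, y)
probs = forest.predict_proba(X)       # per-pixel class probabilities (averaged over the trees)
\end{verbatim}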
\begin{remark}
Random forests are:
\begin{itemize}
\item Fast and parallelizable in both training and inference.
\item Robust to hyperparameter changes.
\item Interpretable.
\end{itemize}
\end{remark}
\end{description}
\end{description}