diff --git a/src/year2/machine-learning-for-computer-vision/img/_depth_comparison_features.pdf b/src/year2/machine-learning-for-computer-vision/img/_depth_comparison_features.pdf new file mode 100644 index 0000000..9c0f384 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_depth_comparison_features.pdf differ diff --git a/src/year2/machine-learning-for-computer-vision/img/_depth_invariant_offset.pdf b/src/year2/machine-learning-for-computer-vision/img/_depth_invariant_offset.pdf new file mode 100644 index 0000000..4e70ae8 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_depth_invariant_offset.pdf differ diff --git a/src/year2/machine-learning-for-computer-vision/img/_detr_architecture.pdf b/src/year2/machine-learning-for-computer-vision/img/_detr_architecture.pdf new file mode 100644 index 0000000..56ed860 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_detr_architecture.pdf differ diff --git a/src/year2/machine-learning-for-computer-vision/img/_random_forest_bagging.pdf b/src/year2/machine-learning-for-computer-vision/img/_random_forest_bagging.pdf new file mode 100644 index 0000000..9f1b38e Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_random_forest_bagging.pdf differ diff --git a/src/year2/machine-learning-for-computer-vision/img/_random_forest_random_splitting.pdf b/src/year2/machine-learning-for-computer-vision/img/_random_forest_random_splitting.pdf new file mode 100644 index 0000000..f811ab5 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_random_forest_random_splitting.pdf differ diff --git a/src/year2/machine-learning-for-computer-vision/img/bifpn.png b/src/year2/machine-learning-for-computer-vision/img/bifpn.png new file mode 100644 index 0000000..ba6c3cb Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/bifpn.png differ diff --git a/src/year2/machine-learning-for-computer-vision/img/detr_decoder.png b/src/year2/machine-learning-for-computer-vision/img/detr_decoder.png new file mode 100644 index 0000000..a8f4ac5 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/detr_decoder.png differ diff --git a/src/year2/machine-learning-for-computer-vision/img/detr_encoder.png b/src/year2/machine-learning-for-computer-vision/img/detr_encoder.png new file mode 100644 index 0000000..d6cd43a Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/detr_encoder.png differ diff --git a/src/year2/machine-learning-for-computer-vision/img/detr_object_query.png b/src/year2/machine-learning-for-computer-vision/img/detr_object_query.png new file mode 100644 index 0000000..e86176c Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/detr_object_query.png differ diff --git a/src/year2/machine-learning-for-computer-vision/img/hungarian_loss.png b/src/year2/machine-learning-for-computer-vision/img/hungarian_loss.png new file mode 100644 index 0000000..c163af2 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/hungarian_loss.png differ diff --git a/src/year2/machine-learning-for-computer-vision/img/motion_data.png b/src/year2/machine-learning-for-computer-vision/img/motion_data.png new file mode 100644 index 0000000..7c04758 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/motion_data.png differ diff --git 
a/src/year2/machine-learning-for-computer-vision/img/multiscale_comparison.png b/src/year2/machine-learning-for-computer-vision/img/multiscale_comparison.png new file mode 100644 index 0000000..21a3539 Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/multiscale_comparison.png differ diff --git a/src/year2/machine-learning-for-computer-vision/ml4cv.tex b/src/year2/machine-learning-for-computer-vision/ml4cv.tex index 45e4cc0..635279d 100644 --- a/src/year2/machine-learning-for-computer-vision/ml4cv.tex +++ b/src/year2/machine-learning-for-computer-vision/ml4cv.tex @@ -1,4 +1,5 @@ \documentclass[11pt]{ainotes} +\usepackage{appendix} \title{Machine Learning for Computer Vision} \date{2024 -- 2025} @@ -12,5 +13,6 @@ \include{./sections/_architectures.tex} \include{./sections/_transformers.tex} \include{./sections/_object_detection.tex} + \include{./sections/_segmentation.tex} \end{document} \ No newline at end of file diff --git a/src/year2/machine-learning-for-computer-vision/sections/_object_detection.tex b/src/year2/machine-learning-for-computer-vision/sections/_object_detection.tex index 9dad916..f970a5d 100644 --- a/src/year2/machine-learning-for-computer-vision/sections/_object_detection.tex +++ b/src/year2/machine-learning-for-computer-vision/sections/_object_detection.tex @@ -644,7 +644,7 @@ \end{remark} -\subsection{Multi-scale detectors} +\subsection{Multi-scale detectors} \label{sec:multiscale_detector} \begin{description} \item[Image pyramid multi-scale detection] \marginnote{Image pyramid multi-scale detection} @@ -1012,11 +1012,9 @@ \begin{figure}[H] \centering - \includegraphics[width=0.8\linewidth]{./img/_object_detection_map_speed_plot.pdf} + \includegraphics[width=0.75\linewidth]{./img/_object_detection_map_speed_plot.pdf} \caption{ - \parbox[t]{0.5\linewidth}{ - mAP -- speed comparison of the various object detection approaches - } + mAP -- speed comparison of the various object detection approaches } \end{figure} @@ -1028,7 +1026,7 @@ Method based on two ideas: \begin{itemize} \item Use transformers to predict a set of objects in a single pass (i.e., solve a set prediction problem). - \item Use the bipartite matching (i.e., Hungarian) loss as set prediction loss that forces a unique matching between predictions and ground-truths. + \item Use the Hungarian (i.e., bipartite matching) loss as the set prediction loss, which forces a unique matching between predictions and ground-truths. \end{itemize} \begin{description} @@ -1043,5 +1041,155 @@ In object detection, there are no other generated output tokens to feed into the decoder for parallel decoding. Therefore, only the learned positional encodings are used. \end{remark} \end{description} + + \item[Architecture] + Given an input image $I$ of size $H \times W \times 3$, DETR does the following: + \begin{enumerate} + \item Pass the input $I$ through a CNN feature extractor to obtain an activation $A$ of shape $\frac{H}{32} \times \frac{W}{32} \times 2048$. + \item Use a $1 \times 1$ convolution to adjust the number of channels of $A$ from $2048$ to $d$. + \item Add positional encodings to $A$. + \item Pass the activation $A$ through the transformer encoder to obtain the keys and values for the decoder. + \item Pass the learned object queries through the decoder to obtain the outputs $O_i$. + \item Pass each output $O_i$ through an MLP to obtain class and box predictions. + \end{enumerate} + + \begin{figure}[H] + \centering + \includegraphics[width=0.75\linewidth]{./img/_detr_architecture.pdf} + \end{figure}
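+
+        The following is a minimal PyTorch sketch of this forward pass, meant only to make the tensor shapes explicit (it is not the official implementation): the values $d = 256$ and $100$ object queries are assumptions taken from common DETR configurations, the positional encoding is a learned parameter instead of the fixed sine encoding, and the object queries are fed directly as the decoder input rather than as decoder positional embeddings.
+\begin{verbatim}
+import torch
+from torch import nn
+from torchvision.models import resnet50
+
+class DETRSketch(nn.Module):
+    def __init__(self, num_classes, d=256, num_queries=100):
+        super().__init__()
+        cnn = resnet50()
+        self.backbone = nn.Sequential(*list(cnn.children())[:-2])  # B x 2048 x H/32 x W/32
+        self.proj = nn.Conv2d(2048, d, kernel_size=1)              # 1x1 conv: 2048 -> d channels
+        self.transformer = nn.Transformer(d_model=d, nhead=8,
+                                          num_encoder_layers=6, num_decoder_layers=6)
+        self.queries = nn.Embedding(num_queries, d)                # learned object queries
+        self.pos = nn.Parameter(torch.rand(d, 50, 50))             # simplified positional encoding
+        self.class_head = nn.Linear(d, num_classes + 1)            # +1 for the background class
+        self.box_head = nn.Linear(d, 4)                            # (cx, cy, w, h), normalized
+
+    def forward(self, x):
+        a = self.proj(self.backbone(x))                  # B x d x H/32 x W/32
+        b, d, h, w = a.shape
+        a = a + self.pos[:, :h, :w]                      # add positional encoding
+        src = a.flatten(2).permute(2, 0, 1)              # (h*w) x B x d encoder tokens
+        tgt = self.queries.weight.unsqueeze(1).repeat(1, b, 1)  # num_queries x B x d
+        out = self.transformer(src, tgt)                 # num_queries x B x d decoder outputs O_i
+        return self.class_head(out), self.box_head(out).sigmoid()
+
+logits, boxes = DETRSketch(num_classes=91)(torch.rand(1, 3, 256, 320))
+\end{verbatim}
+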
+ + \item[Hungarian loss] \marginnote{Hungarian loss} + Consider for simplicity a problem with $2$ classes (plus background). Given: + \begin{itemize} + \item $N$ predictions $\{ \hat{y}_i = (\hat{p}_i, \hat{b}_i) \}_{i=1}^N$ where $\hat{p}_i$ is the class probability distribution and $\hat{b}_i$ describes the bounding box normalized w.r.t. the image size. + \item $O$ ground-truth boxes padded to $N$ with background classes $\{ y_i = (c_i, b_i) \}_{i=1}^O \cup \{ y_i = \varnothing \}_{i=O+1}^{N}$ where $c_i$ is the class, $b_i$ describes the bounding box normalized w.r.t. the image size, and $\varnothing$ represents the background class. + \end{itemize} + + The Hungarian loss is defined in two steps: + \begin{enumerate} + \item Solve the bipartite matching problem of finding the optimal permutation $\sigma^*$ that associates each prediction to a unique ground-truth box while minimizing the matching loss $\mathcal{L}_\text{match}$ defined as follows: + \[ + \mathcal{L}_\text{match}(\hat{y}_i, y_j) = \begin{cases} + -\hat{p}_i[c_j] + \mathcal{L}_\text{box}(\hat{b}_i, b_j) & \text{if $c_j \neq \varnothing$} \\ + 0 & \text{otherwise} + \end{cases} + \] + where $\mathcal{L}_\text{box}$ is a loss based on the linear combination of the Huber loss and IoU. + + The overall problem is the following (a small numerical sketch of this matching step is given at the end of this section): + \[ \sigma^* = \arg\min_\sigma \sum_{i=1}^{N} \mathcal{L}_\text{match}(\hat{y}_{\sigma(i)}, y_i) \] + + \item Given the optimal permutation $\sigma^*$, compute the loss as: + \[ \mathcal{L}_\text{hungarian}(\hat{y}, y) = \sum_{i=1}^{N} \left( - \ln\left( \hat{p}_{\sigma^*(i)}[c_i] \right) + \mathbbm{1}[c_i \neq \varnothing] \mathcal{L}_\text{box}(\hat{b}_{\sigma^*(i)}, b_i) \right) \] + \end{enumerate} + + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/hungarian_loss.png} + \caption{ + Possible permutations and optimal permutation (in orange). + } + \end{figure} \end{description} -\end{description} \ No newline at end of file + + \begin{remark} + Results show that Faster R-CNN + FPN performs better on smaller objects, while DETR performs best on larger objects. + \end{remark} + + \begin{remark}[Visualization] + By analyzing the main components of DETR, the following can be observed: + \begin{descriptionlist} + \item[Encoder] The encoder tends to solve a segmentation problem (i.e., determine what the object is). + \begin{figure}[H] + \centering + \includegraphics[width=0.8\linewidth]{./img/detr_encoder.png} + \caption{ + \parbox[t]{0.75\linewidth}{Self-attention map of some pixels at the last encoder layer. Yellow tiles indicate that the analyzed pixel attends to that patch.} + } + \end{figure} + + \item[Decoder] The decoder tends to attend to object boundaries (i.e., determine where the object is). + \begin{figure}[H] + \centering + \includegraphics[width=0.8\linewidth]{./img/detr_decoder.png} + \caption{ + \parbox[t]{0.75\linewidth}{Decoder attention. Highlighted areas have a higher attention weight.} + } + \end{figure} + + \item[Object query] Each object query tends to specialize in recognizing objects in specific areas. + \begin{figure}[H] + \centering + \includegraphics[width=0.8\linewidth]{./img/detr_object_query.png} + \caption{ + \parbox[t]{0.75\linewidth}{Position of the predictions of each object query. Green dots represent small boxes, red large horizontal boxes, and blue large vertical boxes.} + } + \end{figure} + \end{descriptionlist} + \end{remark} +\end{description}
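+
+The following is a small sketch of the bipartite matching step described above, using SciPy's \texttt{linear\_sum\_assignment} (the Hungarian algorithm). For brevity, the $\mathcal{L}_\text{box}$ term is replaced by a plain $L_1$ cost and the numbers are made up; DETR's actual implementation builds the cost matrix in the same spirit but with its full box loss.
+\begin{verbatim}
+import numpy as np
+from scipy.optimize import linear_sum_assignment
+
+def hungarian_match(pred_probs, pred_boxes, gt_classes, gt_boxes, box_cost):
+    # Cost of assigning prediction i to ground truth j: -p_i[c_j] + L_box(b_i, b_j).
+    # Padded background targets have zero cost, so they can be left out entirely.
+    cost = -pred_probs[:, gt_classes] + box_cost(pred_boxes, gt_boxes)
+    pred_idx, gt_idx = linear_sum_assignment(cost)  # optimal one-to-one matching
+    return pred_idx, gt_idx
+
+# Toy example: 3 predictions, 2 ground truths, L1 box cost as a stand-in for L_box.
+l1 = lambda p, g: np.abs(p[:, None, :] - g[None, :, :]).sum(-1)
+probs = np.array([[0.7, 0.2, 0.1],      # class probabilities (last column = background)
+                  [0.1, 0.8, 0.1],
+                  [0.3, 0.3, 0.4]])
+boxes = np.array([[0.2, 0.2, 0.1, 0.1],
+                  [0.6, 0.6, 0.2, 0.2],
+                  [0.5, 0.5, 0.5, 0.5]])
+gt_classes = np.array([1, 0])
+gt_boxes = np.array([[0.62, 0.58, 0.2, 0.2],
+                     [0.20, 0.20, 0.1, 0.1]])
+print(hungarian_match(probs, boxes, gt_classes, gt_boxes, l1))  # pred 1 <-> gt 0, pred 0 <-> gt 1
+\end{verbatim}
+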
+ + +\begin{subappendices} + \section{EfficientDet} + \begin{remark} + When working with object detection, there are many options to scale a model: + \begin{descriptionlist} + \item[Backbone] Change the CNN used to extract features from the input image. + \item[Image resolution] Change the resolution of the input image (and, consequently, of the feature maps produced by the CNN). + \item[Multi-scale feature representation] Change the architecture of the multi-scale detector. + \item[Detector head] Change the classification and regression heads. + \end{descriptionlist} + \end{remark} + + \begin{description} + \item[EfficientDet] \marginnote{EfficientDet} + Similarly to MobileNet and EfficientNet, EfficientDet uses a compound scaling coefficient $\phi$ to scale up the model (using heuristic rules). + + \begin{remark} + Before EfficientDet, other multi-scale feature representations had been introduced: + \begin{descriptionlist} + \item[FPN] + As described in \Cref{sec:multiscale_detector}. + + \item[PANet] \marginnote{PANet} + Based on FPN with an additional bottom-up path to merge higher resolution features with coarser ones. + + \item[NAS-FPN] \marginnote{NAS-FPN} + The base block is found using neural architecture search. Multiple blocks are repeated to obtain the multi-scale features. + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/multiscale_comparison.png} + \end{figure} + \end{remark} + + \begin{description} + \item[Weighted bi-directional feature pyramid network (BiFPN)] \marginnote{Weighted bi-directional feature pyramid network (BiFPN)} + Architecture to represent multi-scale features with the following characteristics: + \begin{itemize} + \item Applied as repeated blocks (as in NAS-FPN). + \item Based on PANet with the following changes: + \begin{itemize} + \item Activations generated by only one parent activation are removed and the connections are adjusted accordingly. + \item Add a connection between the input and output nodes of a block. + \item Use depth-wise separable convolutions. + \item Add learnable weights to weigh features at different resolutions. + \end{itemize} + The output $P_i^\text{out}$ at the $i$-th scale is therefore computed as (a small numerical sketch of this weighted fusion is given after this list): + \[ + \begin{split} + P_i^\text{out} &= \texttt{conv}\left( \frac{w_{i, 1} P_i^\text{in} + w_{i, 2} P_i^\text{top-down} + w_{i, 3} \texttt{downsample}(P_{i-1}^\text{out})}{w_{i, 1} + w_{i, 2} + w_{i, 3} + \varepsilon} \right) \\ + P_i^\text{top-down} &= \texttt{conv}\left( \frac{w_{i, 1}' P_i^\text{in} + w_{i, 2}' \texttt{upsample}(P_{i+1}^\text{top-down})}{w_{i, 1}' + w_{i, 2}' + \varepsilon} \right) + \end{split} + \] + + \begin{figure}[H] + \centering + \includegraphics[width=0.15\linewidth]{./img/bifpn.png} + \end{figure} + \end{itemize}
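+
+            The following is a small NumPy sketch of the weighted fusion performed at a single BiFPN node (following the fast normalized fusion described in the EfficientDet paper). The convolution, upsampling, and downsampling steps are omitted and the weight values are made up; in the real network the weights are learned and a depth-wise separable convolution follows the fusion.
+\begin{verbatim}
+import numpy as np
+
+def fuse(features, weights, eps=1e-4):
+    """Weighted average of same-shape feature maps (fast normalized fusion)."""
+    w = np.maximum(np.asarray(weights, dtype=float), 0.0)  # keep weights non-negative
+    w = w / (w.sum() + eps)                                # normalize as in the formula above
+    return sum(wi * f for wi, f in zip(w, features))
+
+# Toy example at one scale: input feature, top-down feature, and the resized
+# output of the neighbouring scale, all with the same (C x H x W) shape.
+p_in, p_td, p_prev = (np.random.rand(64, 32, 32) for _ in range(3))
+learned_w = [0.5, 1.2, 0.3]                    # would be trained in practice
+p_out = fuse([p_in, p_td, p_prev], learned_w)  # then followed by conv(.)
+print(p_out.shape)                             # (64, 32, 32)
+\end{verbatim}
+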
+ \end{description} + \end{description} +\end{subappendices} \ No newline at end of file diff --git a/src/year2/machine-learning-for-computer-vision/sections/_segmentation.tex b/src/year2/machine-learning-for-computer-vision/sections/_segmentation.tex new file mode 100644 index 0000000..5c2202f --- /dev/null +++ b/src/year2/machine-learning-for-computer-vision/sections/_segmentation.tex @@ -0,0 +1,122 @@ +\chapter{Segmentation} + + +\section{Semantic segmentation} + +\begin{description} + \item[Semantic segmentation] \marginnote{Semantic segmentation} + Given an input image, output a category for each pixel. + + \item[Pixel-wise IoU] \marginnote{Pixel-wise IoU} + IoU generalized to pixel-wise segmentation masks. Given a class $c$, the ground-truths $y^{(i)}$, and the predictions $\hat{y}^{(i)}$, the IoU w.r.t. $c$ is computed as follows (the sums run over the images $i$ of the dataset): + \[ + \begin{gathered} + TP_c = \sum_{i} \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \land \hat{y}_{(u, v)}^{(i)} = c \right\} \right| \\ + \texttt{IoU}_c = \frac{TP_c}{\sum_{i} \left( \left| \left\{ (u, v) : \hat{y}_{(u, v)}^{(i)} = c \right\} \right| + \left| \left\{ (u, v) : y_{(u, v)}^{(i)} = c \right\} \right| \right) - TP_c} + \end{gathered} + \] + + The mean IoU is computed as: + \[ \texttt{mIoU} = \frac{1}{C} \sum_{c=1}^{C} \texttt{IoU}_c \] +\end{description} + + +\subsection{Kinect human pose estimation} + +\begin{description} + \item[Human pose detection] \marginnote{Human pose detection} + Task of detecting the position and orientation of a person. + + \item[Pipeline] + Kinect pose detection is done in three phases: + \begin{enumerate} + \item Capture a depth image and remove the background to obtain the depth map of a person. + \item Classify each pixel into a body part. + \item Determine the position of the joints (i.e., the skeleton) by finding local modes (i.e., centers of mass). + \end{enumerate} + + \item[Synthetic annotated data] \marginnote{Synthetic annotated data} + The data used to create the model is artificially generated. $100$k poses were captured using motion capture devices. Then, different mock-up body models (for which the ground-truth is known) were simulated and recorded through a virtual camera with the same intrinsic parameters as the Kinect camera. This makes the model robust to different body shapes and clothing. + + \begin{remark} + The workflow of the original paper consists of iteratively training the model and creating new training data for poses that the current model struggles with. + \end{remark} + + \begin{figure}[H] + \centering + \includegraphics[width=0.75\linewidth]{./img/motion_data.png} + \end{figure} + + \item[Depth comparison features] \marginnote{Depth comparison features} + Given a depth image $D$ and the offsets $\theta = (\Delta p, \Delta n)$, each pixel $x$ of $D$ produces a feature as follows: + \[ f(x; D, (\Delta p, \Delta n)) = D\left[ x + \frac{\Delta p}{D[x]} \right] - D\left[ x + \frac{\Delta n}{D[x]} \right] \] + In other words, each $x$ is described by the difference in depth between two points offset from $x$. The depth at background pixels is set to a large positive number (a small sketch of this computation is given at the end of this item). + + \begin{remark} + As real-time processing is required, this approach allows features that discriminate body parts to be computed quickly. However, it does not always produce a correct response. + \end{remark} + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/_depth_comparison_features.pdf} + \caption{Examples of feature computation} + \end{figure} + + \begin{description} + \item[Depth-invariant offsets] + The normalization by $D[x]$ applied to the offsets makes them depth-invariant. + + Consider two objects at different depths. The focal length of the camera is $f$ and the world offset we want to apply is $o_w$. As the depth $d$ changes, the offset $o_{di}$ in the image plane changes due to the rules of perspective projection. Therefore, to obtain the offset in the image plane, we have: + \[ o_{di} : f = o_w : d \,\Rightarrow\, o_{di} = \frac{o_w f}{d} = \frac{\Delta p}{d} \] + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/_depth_invariant_offset.pdf} + \end{figure} + \end{description}
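+
+        A minimal NumPy sketch of this feature computation is given below. The depth map indexing (\texttt{D[row, col]}), the value used for background pixels, and the example offsets are all assumptions made for illustration; the actual Kinect implementation differs in its details.
+\begin{verbatim}
+import numpy as np
+
+BACKGROUND = 1e6   # large positive depth assigned to background pixels
+
+def depth_feature(D, x, delta_p, delta_n):
+    """f(x; D, (dp, dn)): difference in depth between two offset probes.
+
+    Offsets are divided by the depth at x, so the same world offset is
+    probed regardless of how far the person is from the camera.
+    """
+    def probe(offset):
+        u = np.round(np.asarray(x) + np.asarray(offset) / D[x]).astype(int)
+        if not (0 <= u[0] < D.shape[0] and 0 <= u[1] < D.shape[1]):
+            return BACKGROUND          # probes outside the image count as background
+        return D[u[0], u[1]]
+    return probe(delta_p) - probe(delta_n)
+
+# Toy depth map (metres): a "person" at depth 2 in front of the background.
+D = np.full((100, 100), BACKGROUND)
+D[30:80, 40:60] = 2.0
+# Pixel near the right edge of the body: one probe falls on the background,
+# the other on the body, so the feature value is large.
+print(depth_feature(D, (50, 50), delta_p=(0.0, 30.0), delta_n=(0.0, -10.0)))
+\end{verbatim}
+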
+ + \item[Decision tree] \marginnote{Decision tree} + Depth comparison features are used to train decision trees. + + \begin{remark} + Decision trees are unstable classifiers: they are able to achieve good performance, but their structure changes significantly if the training data is slightly perturbed (i.e., they have high variance). + \end{remark} + + \begin{description} + \item[Random forest] \marginnote{Random forest} + Ensemble of $N$ decision trees that aims to reduce variance by averaging their predictions. + + \begin{remark} + For a random forest to be effective, its decision trees should be uncorrelated so that, when averaging, the average of their errors tends to $0$. + \end{remark} + + \begin{description} + \item[Bootstrap aggregating (bagging)] \marginnote{Bootstrap aggregating (bagging)} + Train each decision tree using a replica of the training set obtained by sampling with replacement. + + \begin{figure}[H] + \raggedleft + \includegraphics[width=0.7\linewidth]{./img/_random_forest_bagging.pdf} + \end{figure} + + \item[Random splitting] \marginnote{Random splitting} + Even though bagging reduces variance, if there is a subset of particularly predictive features, the resulting trees will be correlated. + + To avoid this, in a random forest, tree splitting is done on a different random subset of features each time. + + \begin{figure}[H] + \raggedleft + \includegraphics[width=0.7\linewidth]{./img/_random_forest_random_splitting.pdf} + \end{figure} + \end{description} + + \begin{remark} + Random forests are: + \begin{itemize} + \item Fast and parallelizable in both training and inference. + \item Robust to hyperparameter changes. + \item Interpretable. + \end{itemize} + \end{remark} + \end{description} +\end{description} \ No newline at end of file