\chapter{Object detection}
\begin{description}
\item[Object detection] \marginnote{Object detection}
Given an RGB $W \times H$ image, determine a set of objects $\{ o_1, \dots, o_n \}$ contained in it. Each object $o_j$ is described by:
\begin{itemize}
\item A category $c_j \in \{ 1, \dots, C \}$, as in image classification.
\item A bounding box $BB_j = [ x_j, y_j, w_j, h_j ]$, where $x_j, w_j \in [0, W-1]$ and $y_j, h_j \in [0, H-1]$: $(x_j, y_j)$ is the center and $(w_j, h_j)$ is the size of the box.
\item A confidence score $\rho_j$.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_object_detection_example.pdf}
\end{figure}
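For illustration, a detection as defined above can be sketched as a small Python record holding the category, the $[x, y, w, h]$ box and the confidence score (the class and field names below are purely illustrative):

\begin{verbatim}
from dataclasses import dataclass

@dataclass
class Detection:
    category: int            # c_j in {1, ..., C}
    box: tuple               # (x, y, w, h): center coordinates and size
    score: float             # confidence rho_j

# Hypothetical output of a detector on one image:
detections = [
    Detection(category=1, box=(120.0, 80.0, 60.0, 40.0), score=0.92),
    Detection(category=3, box=(300.0, 150.0, 90.0, 120.0), score=0.55),
]
\end{verbatim}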
\begin{remark}
Unlike image classification, a detection model has to:
\begin{itemize}
\item Output a variable number of results.
\item Output both categorical and spatial information.
\item Work on high resolution input images.
\end{itemize}
\end{remark}
\end{description}


\section{Metrics}
\begin{description}
\item[Intersection over union (\texttt{IoU})]
\marginnote{Intersection over union (\texttt{IoU})}
Measures the amount of overlap between two boxes, computed as the ratio between the area of their intersection and the area of their union:
\[ \texttt{IoU}(BB_i, BB_j) = \frac{\vert BB_i \cap BB_j \vert}{\vert BB_i \vert + \vert BB_j \vert - \vert BB_i \cap BB_j \vert} \]
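As a minimal sketch of this formula, \texttt{IoU} between two axis-aligned boxes in $[x, y, w, h]$ (center, size) format can be computed as follows (the function name is illustrative):

\begin{verbatim}
def iou(box_a, box_b):
    """IoU of two axis-aligned boxes given as (x_center, y_center, w, h)."""
    # Convert from center format to corner format.
    ax1, ay1 = box_a[0] - box_a[2] / 2, box_a[1] - box_a[3] / 2
    ax2, ay2 = box_a[0] + box_a[2] / 2, box_a[1] + box_a[3] / 2
    bx1, by1 = box_b[0] - box_b[2] / 2, box_b[1] - box_b[3] / 2
    bx2, by2 = box_b[0] + box_b[2] / 2, box_b[1] + box_b[3] / 2

    # Intersection rectangle (empty if the boxes do not overlap).
    inter_w = max(0.0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0.0, min(ay2, by2) - max(ay1, by1))
    intersection = inter_w * inter_h

    # Union area by inclusion-exclusion.
    union = box_a[2] * box_a[3] + box_b[2] * box_b[3] - intersection
    return intersection / union if union > 0 else 0.0
\end{verbatim}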
\begin{description}
\item[True/false positive criteria]
Given a threshold $\rho_\texttt{IoU}$, a detection $BB_i$ is a true positive (\texttt{TP}) w.r.t. a ground truth $\hat{BB_j}$ if it is assigned the same class and:
\[ \texttt{IoU}(BB_i, \hat{BB_j}) > \rho_\texttt{IoU} \]

\begin{remark}
Confidence can also be taken into account when determining a match, through a threshold $\rho_\text{min}$ on the confidence score.
\end{remark}
\end{description}
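One possible way to apply this criterion, assuming the illustrative \texttt{Detection} record and \texttt{iou} function sketched earlier, is a greedy matching: detections are visited in decreasing confidence order and each ground truth box can be matched at most once.

\begin{verbatim}
def match_detections(detections, ground_truths, iou_thr=0.5):
    """Label each detection as TP (True) or FP (False)."""
    matched_gt = set()
    labels = []  # one boolean per detection, in confidence order
    for det in sorted(detections, key=lambda d: d.score, reverse=True):
        best_iou, best_gt = 0.0, None
        for k, gt in enumerate(ground_truths):
            if k in matched_gt or gt.category != det.category:
                continue
            overlap = iou(det.box, gt.box)
            if overlap > best_iou:
                best_iou, best_gt = overlap, k
        if best_gt is not None and best_iou > iou_thr:
            matched_gt.add(best_gt)   # true positive: the GT box is consumed
            labels.append(True)
        else:
            labels.append(False)      # false positive
    return labels
\end{verbatim}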
\item[Recall]
Measures the fraction of ground truth objects that have been found:
\[ \texttt{recall} = \frac{\vert \texttt{TP} \vert}{\vert \text{ground truth boxes} \vert} \]

\item[Precision]
Measures the fraction of correct detections among all the predictions:
\[ \texttt{precision} = \frac{\vert \texttt{TP} \vert}{\vert \text{model detections} \vert} \]
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/obj_det_recall_precision.png}
\caption{Recall and precision in different scenarios}
\end{figure}
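Continuing the sketch above, precision and recall follow directly from the TP/FP labels of the detections and the number of ground truth boxes:

\begin{verbatim}
def precision_recall(labels, num_ground_truths):
    """Precision and recall from the TP/FP labels of all detections."""
    tp = sum(labels)  # True counts as 1
    precision = tp / len(labels) if labels else 0.0
    recall = tp / num_ground_truths if num_ground_truths > 0 else 0.0
    return precision, recall
\end{verbatim}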
\item[Precision-recall curve]
Plot that relates precision and recall as the confidence threshold varies.

\begin{example}
Consider the following image and the bounding boxes found by a detector:
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve1.pdf}
\caption{
\parbox[t]{0.6\linewidth}{
Ground truth (yellow boxes) and predictions (orange boxes) with their confidence scores
}
}
\end{figure}

By sorting the detections by decreasing confidence score, the precision-recall curve can be plotted by varying the threshold $\rho_\text{min}$:
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve2.pdf}
\end{figure}

\indenttbox
\begin{remark}
As the threshold $\rho_\text{min}$ increases, recall is monotonically non-increasing, while precision can both decrease and increase.
\end{remark}
\end{example}
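Under the same illustrative assumptions as the previous sketches, the curve can be traced by sorting the detections by decreasing confidence and recomputing precision and recall on each prefix of the ranking (i.e.\ for each value of $\rho_\text{min}$):

\begin{verbatim}
def precision_recall_curve(detections, ground_truths, iou_thr=0.5):
    """List of (recall, precision) points obtained by sweeping rho_min."""
    ranked = sorted(detections, key=lambda d: d.score, reverse=True)
    labels = match_detections(ranked, ground_truths, iou_thr)
    points = []
    for i in range(1, len(ranked) + 1):
        # Keep only the i most confident detections.
        p, r = precision_recall(labels[:i], len(ground_truths))
        points.append((r, p))
    return points
\end{verbatim}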
\end{description}