\chapter{Local features}

\begin{description}
	\item[Correspondence points] \marginnote{Correspondence points}
	Image points that are projections of the same 3D point in different views of the scene.

	\begin{example}[Homography]
		Align two images of the same scene to create a larger image.
		A homography requires at least 4 correspondences.
		To find them, the following pipeline is applied (a code sketch is given after the example):
		\begin{itemize}
			\item Independently find salient points in the two images.
			\item Compute a local description of the salient points.
			\item Compare descriptions to find matching points.
		\end{itemize}
	\end{example}
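	As a concrete illustration, the following is a minimal sketch of the pipeline using OpenCV (it assumes the \texttt{opencv-python} package and uses the SIFT detector/descriptor introduced later in this chapter; file names and the ratio threshold are placeholders):
\begin{verbatim}
import cv2
import numpy as np

# Load the two views as grayscale images (placeholder paths).
img1 = cv2.imread("view1.png", cv2.IMREAD_GRAYSCALE)
img2 = cv2.imread("view2.png", cv2.IMREAD_GRAYSCALE)

# 1) Detect salient points and 2) compute their local descriptors.
sift = cv2.SIFT_create()
kp1, des1 = sift.detectAndCompute(img1, None)
kp2, des2 = sift.detectAndCompute(img2, None)

# 3) Match descriptors (ratio-of-distances test, see the Matching section).
matcher = cv2.BFMatcher()
matches = [m for m, n in matcher.knnMatch(des1, des2, k=2)
           if m.distance <= 0.8 * n.distance]

# Estimate the homography from the (at least 4) correspondences.
src = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
dst = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
H, mask = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)
\end{verbatim}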
	\item[Local invariant features] \marginnote{Local invariant features}
	Find correspondences in three steps:
	\begin{descriptionlist}
		\item[Detection] \marginnote{Detection}
		Find salient points (keypoints).

		The detector should have the following properties:
		\begin{descriptionlist}
			\item[Repeatability] Find the same keypoints across different images.
			\item[Saliency] Find keypoints surrounded by informative patterns.
			\item[Fast] The detector has to scan the entire image, so it should be computationally efficient.
		\end{descriptionlist}

		\item[Description] \marginnote{Description}
		Compute a descriptor for each salient point based on its neighborhood.

		A descriptor should have the following properties:
		\begin{descriptionlist}
			\item[Invariance] Robust to as many transformations as possible (e.g. illumination, weather, scaling, viewpoint, \dots).
			\item[Distinctiveness/robustness trade-off] The description should capture only the important information around a keypoint and ignore irrelevant details or noise.
			\item[Compactness] The description should be concise.
		\end{descriptionlist}

		\item[Matching] \marginnote{Matching}
		Identify the same descriptor across different images.
	\end{descriptionlist}
\end{description}
\begin{remark}
	Edges are not good interest points as they are locally ambiguous (i.e. the local appearance does not change when sliding along the edge, orthogonally to the gradient).

	Corners, on the other hand, are better suited as they exhibit a large intensity variation along all directions.
\end{remark}
\section{Moravec's corner detector}
\marginnote{Moravec's corner detector}

Given a window $W$ of size $n \times n$,
the cornerness of a pixel $p$ is given by the minimum sum of squared differences between
the intensities of $W$ centered on $p$ and the intensities of $W$ centered on each of its neighbors:
\[ C(p) = \min_{q \in \mathcal{N}(p)} \Vert W(p) - W(q) \Vert^2 \]

After computing the cornerness of each pixel, one can apply thresholding and then NMS to obtain a matrix where $1$ indicates a corner.
\begin{figure}[H]
	\centering
	\begin{subfigure}{0.3\linewidth}
		\includegraphics[width=0.9\linewidth]{./img/_corner_detector_example_flat.pdf}
		\caption{Flat region: $C(p)$ is low.}
	\end{subfigure}
	\begin{subfigure}{0.3\linewidth}
		\includegraphics[width=0.8\linewidth]{./img/_corner_detector_example_edge.pdf}
		\caption{Edge: $C(p)$ is low.}
	\end{subfigure}
	\begin{subfigure}{0.3\linewidth}
		\includegraphics[width=0.8\linewidth]{./img/_corner_detector_example_corner.pdf}
		\caption{Corner: $C(p)$ is high.}
	\end{subfigure}
\end{figure}
\begin{remark}
	Moravec's corner detector is anisotropic (i.e. its response depends on the direction), as only a discrete set of shifts towards the neighboring pixels is considered.
\end{remark}
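A minimal NumPy sketch of Moravec's cornerness (function and parameter names are illustrative; borders are handled by wrap-around for simplicity):
\begin{verbatim}
import numpy as np

def moravec_cornerness(img, win=3):
    """C(p) = min over the 8 neighbors q of || W(p) - W(q) ||^2."""
    img = img.astype(np.float64)
    half = win // 2
    C = np.full(img.shape, np.inf)
    shifts = [(-1, -1), (-1, 0), (-1, 1), (0, -1),
              (0, 1), (1, -1), (1, 0), (1, 1)]
    for dy, dx in shifts:
        # Per-pixel squared difference with the shifted image.
        diff = (img - np.roll(img, (dy, dx), axis=(0, 1))) ** 2
        # Sum the squared differences over the window (box filter).
        ssd = np.zeros_like(diff)
        for wy in range(-half, half + 1):
            for wx in range(-half, half + 1):
                ssd += np.roll(diff, (wy, wx), axis=(0, 1))
        C = np.minimum(C, ssd)
    return C

# Thresholding and NMS would follow to keep only the corner pixels.
\end{verbatim}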
\section{Harris' corner detector}

\subsection{Structure matrix}

Harris' corner detector uses an error function formulated as the continuous version of Moravec's detector and
assumes an infinitesimal shift $(\Delta x, \Delta y)$ of the image:
\[ E(\Delta x, \Delta y) = \sum_{x, y} w(x, y) \big( I(x+\Delta x, y+\Delta y) - I(x, y) \big)^2 \]
where $w(x, y)$ represents a window and can be seen as a mask with $1$ when the pixel belongs to the window and $0$ otherwise.

By employing a first-order Taylor expansion $f(x + \Delta x) \approx f(x) + f'(x) \Delta x$,
we can approximate the intensity difference as:
\[
	\begin{split}
		I(x+\Delta x, y+\Delta y) - I(x, y) &\approx \big( I(x, y) + \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y \big) - I(x, y) \\
		&= \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y
	\end{split}
\]
By developing the error function into matrix form, we obtain the following:
\[
	\begin{split}
		E(\Delta x, \Delta y) &= \sum_{x, y} w(x, y) \big( I(x+\Delta x, y+\Delta y) - I(x, y) \big)^2 \\
		&= \sum_{x, y} w(x, y) \big( \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y \big)^2 \\
		&= \sum_{x, y} w(x, y) \big( (\partial_x I(x, y))^2 \Delta x^2 + 2 \partial_x I(x, y) \partial_y I(x, y) \Delta x \Delta y + (\partial_y I(x, y))^2 \Delta y^2 \big) \\
		&= \sum_{x, y} w(x, y) \left(
			\begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
			\begin{bmatrix}
				(\partial_x I(x, y))^2 & \partial_x I(x, y) \partial_y I(x, y) \\
				\partial_x I(x, y) \partial_y I(x, y) & (\partial_y I(x, y))^2
			\end{bmatrix}
			\begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix}
		\right) \\
		&= \begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
			\begin{bmatrix}
				\sum_{x, y} w(x, y) (\partial_x I(x, y))^2 & \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) \\
				\sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) & \sum_{x, y} w(x, y) (\partial_y I(x, y))^2
			\end{bmatrix}
			\begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix} \\
		&= \begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
			\matr{M}
			\begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix} \\
	\end{split}
\]
\begin{description}
	\item[Structure matrix] \marginnote{Structure matrix}
	Matrix $\matr{M}_w$ that encodes the local structure of the image at the pixels within a window $w$.
	\[ \matr{M}_w = \begin{pmatrix}
		\sum_{x, y} w(x, y) (\partial_x I(x, y))^2 & \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) \\
		\sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) & \sum_{x, y} w(x, y) (\partial_y I(x, y))^2
	\end{pmatrix} \]

	$\matr{M}_w$ is real and symmetric, thus it is diagonalizable through an orthogonal matrix $\matr{R}$:
	\[ \matr{M}_w = \matr{R} \begin{pmatrix} \lambda_1^{(w)} & 0 \\ 0 & \lambda_2^{(w)} \end{pmatrix} \matr{R}^T \]
	$\matr{R}^T$ is the rotation that aligns the coordinate axes to the eigenvectors of $\matr{M}_w$,
	while the eigenvalues remain the same for any rotation of the same patch.

	Therefore, the eigenvalues $\lambda_1^{(w)}, \lambda_2^{(w)}$ of $\matr{M}_w$ characterize the intensity change along the shift directions:
	\[
		\begin{split}
			E(\Delta x, \Delta y) &= \begin{pmatrix} \Delta x' & \Delta y' \end{pmatrix}
				\begin{pmatrix} \lambda_1^{(w)} & 0 \\ 0 & \lambda_2^{(w)} \end{pmatrix}
				\begin{pmatrix} \Delta x' \\ \Delta y' \end{pmatrix} \\
			&= \lambda_1^{(w)} \Delta x'^2 + \lambda_2^{(w)} \Delta y'^2
		\end{split}
	\]
	where $(\Delta x', \Delta y')$ is the shift expressed in the basis of the eigenvectors.

	\begin{figure}[H]
		\centering
		\includegraphics[width=0.6\linewidth]{./img/_harris_rotation.pdf}
		\caption{Eigenvalues relationship at different regions of an image.}
	\end{figure}
\end{description}
\subsection{Algorithm}
\marginnote{Harris' corner detector}

As computing the eigenvalues of $\matr{M}_w$ at each pixel is expensive, a more efficient cornerness function is the following:
\[
	\begin{split}
		C(x, y) &= \lambda_1^{(w)}\lambda_2^{(w)} - k(\lambda_1^{(w)} + \lambda_2^{(w)})^2 \\
		&= \det(\matr{M}_{w(x,y)}) - k \cdot \text{trace}(\matr{M}_{w(x,y)})^2
	\end{split}
\]
where $k$ is a hyperparameter (empirically in $[0.04, 0.06]$).

\begin{figure}[H]
	\centering
	\includegraphics[width=0.5\linewidth]{./img/_harris_efficient.pdf}
	\caption{Cornerness at different regions.}
\end{figure}

After computing the cornerness of each pixel, one can apply thresholding and then NMS.

\begin{remark}
	The window function $w(x, y)$ in the original work follows a Gaussian distribution.
\end{remark}
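A minimal sketch of the Harris cornerness computation with NumPy and SciPy (names, the window $\sigma$ and $k$ are illustrative; Sobel filters approximate the partial derivatives):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter, sobel

def harris_cornerness(img, sigma_w=1.0, k=0.05):
    """C = det(M) - k * trace(M)^2 with a Gaussian window w."""
    img = img.astype(np.float64)
    Ix = sobel(img, axis=1)            # approximate partial derivatives
    Iy = sobel(img, axis=0)
    # Entries of the structure matrix, smoothed by the window w(x, y).
    Ixx = gaussian_filter(Ix * Ix, sigma_w)
    Iyy = gaussian_filter(Iy * Iy, sigma_w)
    Ixy = gaussian_filter(Ix * Iy, sigma_w)
    det_M = Ixx * Iyy - Ixy ** 2
    trace_M = Ixx + Iyy
    return det_M - k * trace_M ** 2

# Corners are then obtained by thresholding the response and applying NMS.
\end{verbatim}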
\subsection{Properties}

Harris' corner detector enjoys the following properties:
\begin{descriptionlist}
	\item[Rotation invariance]
	The eigenvalues are invariant to a rotation of the image.

	\item[No affine intensity change invariance]
	An affine intensity change of a signal consists of a gain factor and the addition of a bias (i.e. $I' = \alpha I + \beta$).
	\begin{description}
		\item[Invariance to bias]
		Harris' detector is invariant to an additive bias ($I' = I + \beta$) as a consequence of the approximate derivative computation:
		\[
			\partial_x I'(i, j) = I'(i, j+1) - I'(i, j) = (I(i, j+1) + \cancel{\beta}) - (I(i, j) + \cancel{\beta}) = \partial_x I(i, j)
		\]
		\item[No invariance to gain factor]
		Harris' detector is not invariant to a gain factor ($I' = \alpha I$) as the multiplicative factor is carried into the derivatives
		(see the sketch after this list).
	\end{description}
	\begin{remark}
		In other words, Harris' detector is not illumination invariant.
	\end{remark}

	\item[No scale invariance]
	Harris' detector is not scale invariant, as the use of a fixed window size makes it impossible to recognize the same features when the image is scaled.
\end{descriptionlist}
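A small NumPy check of the behavior under an affine intensity change (the values of $\alpha$ and $\beta$ are arbitrary; Sobel filters stand in for the image derivatives):
\begin{verbatim}
import numpy as np
from scipy.ndimage import sobel

rng = np.random.default_rng(0)
I = rng.random((64, 64))
alpha, beta = 2.0, 50.0

# Additive bias: the derivatives (and hence M and the cornerness) are unchanged.
print(np.allclose(sobel(I + beta, axis=1), sobel(I, axis=1)))           # True

# Gain factor: the derivatives are multiplied by alpha, so the entries of M
# scale by alpha^2 and the cornerness changes (not invariant with a fixed
# threshold).
print(np.allclose(sobel(alpha * I, axis=1), alpha * sobel(I, axis=1)))  # True
\end{verbatim}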
\section{Multi-scale feature detector}

% \subsection{Scale invariance}

Depending on the scale, an image may exhibit more or fewer details.
A naive approach consists of using a smaller window size for images at a smaller scale,
but this is not always able to capture the same features due to the difference in detail.

\begin{description}
	\item[Scale-space] \marginnote{Scale-space}
	One-parameter family of images obtained by increasingly smoothing the input image.

	\begin{remark}
		When smoothing, small details should disappear and no new structures should be introduced.
	\end{remark}

	\begin{remark}
		It is possible to use the same window size when working with scale-space images.
	\end{remark}

	\begin{description}
		\item[Gaussian scale-space] \marginnote{Gaussian scale-space}
		Scale-space obtained using Gaussian smoothing:
		\[ L(x, y, \sigma) = I(x, y) * G(x, y, \sigma) \]
		where $\sigma$ is the standard deviation of the Gaussian and also represents the level of scaling
		(a code sketch is given below).

		\begin{figure}[H]
			\centering
			\includegraphics[width=0.8\linewidth]{./img/_scale_space_example.pdf}
			\caption{Gaussian scale-space example}
		\end{figure}
	\end{description}
\end{description}
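A minimal sketch of a Gaussian scale-space with SciPy (the values of $\sigma$ are arbitrary examples):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter

def gaussian_scale_space(img, sigmas=(1.0, 2.0, 4.0, 8.0)):
    """Family L(x, y, sigma) = I * G(sigma) for increasing sigma."""
    img = img.astype(np.float64)
    return {s: gaussian_filter(img, s) for s in sigmas}
\end{verbatim}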
\subsection{Scale-normalized LoG blob detection}

\begin{description}
	\item[Scale-normalized Laplacian of Gaussian] \marginnote{Scale-normalized Laplacian of Gaussian}
	LoG scaled by a factor of $\sigma^2$:
	\[ F(x, y, \sigma) = \sigma^2 \nabla^{(2)} L(x, y, \sigma) = \sigma^2 (I(x, y) * \nabla^{(2)} G(x, y, \sigma)) \]
	The $\sigma^2$ factor compensates for the decay of the derivatives as the scaling ($\sigma$) grows, making the responses comparable across scales.

	\item[Characteristic scale] Scale $\sigma$ that produces a peak in the Laplacian response at a given pixel \cite{slides:scale_normalized_log}.

	\item[Algorithm] \marginnote{Scale-normalized LoG blob detection}
	Blob (circle) detection using the scale-normalized LoG works as follows \cite{slides:scale_normalized_log} (a code sketch is given below):
	\begin{enumerate}
		\item Create a Gaussian scale-space by applying the scale-normalized Laplacian of Gaussian with different values of $\sigma$.
		\item For each pixel, find the characteristic scale and its corresponding Laplacian response across the scale-space (automatic scale selection).
		\item Filter out the pixels whose response is lower than a threshold and find the peaks.
		\item The found pixels are the centers of the blobs.
		It can be shown that the radius is given by $r = \sigma\sqrt{2}$.
	\end{enumerate}
\end{description}

When detecting a peak, there are two cases:
\begin{descriptionlist}
	\item[Maximum] Dark blobs on a light background.
	\item[Minimum] Light blobs on a dark background.
\end{descriptionlist}

\begin{figure}[H]
	\centering
	\includegraphics[width=0.6\linewidth]{./img/LOG_blob_detection_example.png}
	\caption{Example of application of the algorithm}
\end{figure}

\begin{remark}
	The intuitive idea of the detection algorithm is the following:
	\begin{itemize}
		\item $F(x, y, \sigma)$ keeps growing in magnitude for values of $\sigma$ that capture areas within the blob (i.e. with similar intensity).
		\item The Laplacian reaches its peak when its weights capture the entire blob (virtually, it detects an edge).
		\item After the peak, the LoG filter also captures intensities outside the blob and the response therefore decreases.
	\end{itemize}
\end{remark}

\begin{remark}
	Using different scales creates the effect of searching in a 3D space.
\end{remark}

\begin{remark}
	Scale-normalized LoG blob detection is scale and rotation invariant.
\end{remark}
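A minimal sketch of the detection algorithm with SciPy, in the common formulation that searches for local extrema jointly over space and scale (the values of $\sigma$ and the threshold are illustrative; both maxima and minima are handled by taking the absolute response):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_laplace, maximum_filter

def log_blob_detection(img, sigmas=(1.6, 2.4, 3.6, 5.4, 8.1), thr=0.1):
    """Return a list of blobs as (y, x, radius) tuples."""
    img = img.astype(np.float64)
    # Scale-normalized responses F(x, y, sigma) = sigma^2 * LoG(I, sigma).
    F = np.stack([s**2 * gaussian_laplace(img, s) for s in sigmas])
    A = np.abs(F)   # peaks of |F|: maxima (dark blobs) and minima (light blobs)
    # Keep pixels that are 3x3x3 local maxima of |F| in the (sigma, y, x)
    # space and whose response exceeds the threshold.
    peaks = (A == maximum_filter(A, size=3)) & (A > thr)
    return [(y, x, sigmas[i] * np.sqrt(2))
            for i, y, x in zip(*np.nonzero(peaks))]
\end{verbatim}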
\begin{remark}
	It empirically holds that, given two points representing the centers of two blobs,
	the ratio between the two characteristic scales is approximately the ratio between the diameters of the two blobs.

	\begin{figure}[H]
		\centering
		\includegraphics[width=0.4\linewidth]{./img/_scaled_log_blob_diameter.pdf}
		\caption{
			\begin{varwidth}[t]{0.55\linewidth}
				Scale-normalized LoG computed on varying $\sigma$.\\
				Note that, in the second image, the characteristic scale is
				higher as the blob is larger.
			\end{varwidth}
		}
	\end{figure}
\end{remark}
\subsection{Difference of Gaussians blob detection}

\begin{description}
	\item[Difference of Gaussians (DoG)] \marginnote{Difference of Gaussians (DoG)}
	Approximation of the scale-normalized LoG computed as:
	\[
		\begin{split}
			\texttt{DoG}(x, y, \sigma) &= \big( G(x, y, k\sigma) - G(x, y, \sigma) \big) * I(x, y) \\
			&= L(x, y, k\sigma) - L(x, y, \sigma)
		\end{split}
	\]

	\begin{theorem}
		It can be proved that the DoG kernel is a scaled version of the LoG kernel:
		\[ G(x, y, k\sigma) - G(x, y, \sigma) \approx (k-1)\sigma^2 \nabla^{(2)}G(x, y, \sigma) \]

		\begin{remark}
			As we are interested in extrema, the scaling factor is irrelevant.
		\end{remark}
	\end{theorem}
	\item[Extrema detection] \marginnote{DoG extrema}
	Given three DoG images with scales $\sigma_{i-1}$, $\sigma_i$ and $\sigma_{i+1}$,
	a pixel $(x, y, \sigma_i)$ is an extremum (i.e. a keypoint) iff:
	\begin{itemize}
		\item It is an extremum in the $3 \times 3$ patch centered on it (8 pixels, as $(x, y, \sigma_i)$ itself is excluded).
		\item It is an extremum in the $3 \times 3$ patches centered on the pixels at $(x, y, \sigma_{i-1})$ and at $(x, y, \sigma_{i+1})$ ($9+9$ pixels).
	\end{itemize}

	\begin{center}
		\includegraphics[width=0.35\linewidth]{./img/_DoG_extrema.pdf}
	\end{center}

	\item[Algorithm] \marginnote{DoG blob detection}
	To detect blob centers (i.e. DoG extrema), an octave of $s$ DoG images is computed as follows (a code sketch is given at the end of this subsection):
	\begin{enumerate}
		\item Compute a scale-space $L$ of $s+1$ Gaussian smoothed images with $\sigma$ varying by a factor $k = 2^{1/s}$.
		As the extrema detection method requires checking the DoG images above and below,
		two additional Gaussians (with scales $k^{-1}\sigma$ and $k^{s+1}\sigma$) are computed.
		\item The DoG image $\texttt{DoG}(\cdot, \cdot, k^i\sigma)$ is obtained as the difference between the
		images $L(\cdot, \cdot, k^{i+1}\sigma)$ and $L(\cdot, \cdot, k^i\sigma)$ of the Gaussian scale-space.
		\begin{figure}[H]
			\small
			\centering
			\includegraphics[width=0.6\linewidth]{./img/_DoG_octave.pdf}
			\caption{Example of octave computation with $s=4$}
		\end{figure}
		\item Extrema detection is done as described above across the $s$ central DoG images.
		\item Points with a weak DoG response can be pruned through thresholding.
		Furthermore, it has been observed that strong DoG points along edges are unstable and can also be pruned.
	\end{enumerate}

	Octaves should be computed using different starting $\sigma$s.
	Instead of recomputing the Gaussian scale-space,
	it is possible to simply down-sample the already computed Gaussians and compute the DoG images starting from the shrunk smoothed images.

	\begin{remark}
		In the original work, the input image is first enlarged by a factor of 2.
		Then, four octaves are computed starting from the enlarged image
		(i.e. images of size factor $\times 2$, $\times 1$, $\times \frac{1}{2}$ and $\times \frac{1}{4}$ are considered).
	\end{remark}

	\begin{remark}
		The original work found that the best hyperparameters are $s=3$ and $\sigma=1.6$.
	\end{remark}
\end{description}

\begin{remark}
	DoG blob detection is scale and rotation invariant.
\end{remark}
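A minimal sketch of one octave of DoG blob detection with NumPy and SciPy (hyperparameters follow the values above; pixels tied with a neighbor are counted as extrema for simplicity, and the pruning of weak and edge responses is only indicated):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter, maximum_filter, minimum_filter

def dog_octave_extrema(img, sigma=1.6, s=3):
    """Return the keypoints of one octave as (y, x, scale) tuples."""
    img = img.astype(np.float64)
    k = 2 ** (1 / s)
    scales = [k**i * sigma for i in range(-1, s + 2)]    # s + 3 Gaussians
    L = np.stack([gaussian_filter(img, sc) for sc in scales])
    dog = L[1:] - L[:-1]                                 # s + 2 DoG images

    # 3x3x3 local maxima / minima across space and adjacent scales.
    is_max = dog == maximum_filter(dog, size=3)
    is_min = dog == minimum_filter(dog, size=3)
    extrema = is_max | is_min
    extrema[0], extrema[-1] = False, False               # only the s central DoGs
    # Thresholding on |dog| (and edge pruning) would be applied here.
    idx, ys, xs = np.nonzero(extrema)
    return [(y, x, scales[i]) for i, y, x in zip(idx, ys, xs)]
\end{verbatim}
The next octaves would be computed in the same way on down-sampled versions of the already smoothed images.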
\section{Descriptor}

After finding the keypoints, a descriptor of a keypoint is computed from the pixels within a patch centered on it.

\subsection{DoG descriptor}

\begin{description}
	\item[Canonical / Characteristic orientation] \marginnote{Canonical / Characteristic orientation}
	Dominant direction of the gradients in the neighborhood of a keypoint (i.e. the direction along which the gradient magnitudes accumulate the most).

	Given a pixel $(x, y)$, its gradient magnitude and direction are computed from the Gaussian smoothed image $L$:
	\[
		\begin{split}
			\vert \nabla L(x, y) \vert &= \sqrt{ \big( L(x+1, y) - L(x-1, y) \big)^2 + \big( L(x, y+1) - L(x, y-1) \big)^2 } \\
			\theta_L(x, y) &= \arctan\left( \frac{L(x, y+1) - L(x, y-1)}{L(x+1, y) - L(x-1, y)} \right)
		\end{split}
	\]

	\begin{description}
		\item[Orientation histogram] \marginnote{Orientation histogram}
		By dividing the directions into bins (e.g. bins of size $10^\circ$),
		it is possible to define for each keypoint a histogram by considering its neighboring pixels within a patch
		(a code sketch is given below).
		For each pixel $(x, y)$ neighboring a keypoint $(x_k, y_k)$, its contribution to the histogram along the direction $\theta_L(x, y)$ is given by:
		\[ G_{(x_k, y_k)}(x, y, \frac{3}{2} \sigma_s(x_k, y_k)) \cdot \vert \nabla L(x, y) \vert \]
		where $G_{(x_k, y_k)}$ is a Gaussian centered on the keypoint and $\sigma_s(x_k, y_k)$ is the scale of the keypoint.

		The characteristic orientation of a keypoint is given by the highest peak of the orientation histogram.
		Other peaks that reach at least $80\%$ of the main one are also considered characteristic orientations
		(i.e. a keypoint might have multiple canonical orientations and, therefore, multiple descriptors).

		For a more accurate estimate, a parabola is fitted to each peak and its two adjacent bins,
		and the orientation is taken at the maximum of the parabola.

		\begin{figure}[H]
			\centering
			\includegraphics[width=0.45\linewidth]{./img/_canonical_histogram.pdf}
			\caption{Orientation histogram and parabola interpolation}
		\end{figure}
	\end{description}
\end{description}
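A minimal sketch of the orientation histogram with NumPy (the patch radius is illustrative; for brevity, all bins above $80\%$ of the maximum are returned instead of refining local peaks with a parabola, and the keypoint is assumed to be far from the image border):
\begin{verbatim}
import numpy as np

def orientation_histogram(L, xk, yk, sigma_s, nbins=36, radius=8):
    """36-bin orientation histogram of the keypoint (xk, yk) with scale
    sigma_s, computed on the Gaussian-smoothed image L."""
    hist = np.zeros(nbins)
    for y in range(yk - radius, yk + radius + 1):
        for x in range(xk - radius, xk + radius + 1):
            # Gradient magnitude and direction by central differences.
            dx = L[y, x + 1] - L[y, x - 1]
            dy = L[y + 1, x] - L[y - 1, x]
            mag = np.hypot(dx, dy)
            theta = np.arctan2(dy, dx) % (2 * np.pi)
            # Gaussian weighting centered on the keypoint (sigma = 1.5 * sigma_s).
            w = np.exp(-((x - xk)**2 + (y - yk)**2) / (2 * (1.5 * sigma_s)**2))
            hist[int(theta / (2 * np.pi) * nbins) % nbins] += w * mag
    # Canonical orientations: the main peak and any bin above 80% of it.
    bins = np.nonzero(hist >= 0.8 * hist.max())[0]
    return hist, bins * (2 * np.pi / nbins)
\end{verbatim}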
\begin{description}
	\item[DoG descriptor] \marginnote{DoG descriptor}
	Keypoints are found using the DoG detector and the descriptors are computed on patches aligned to the canonical orientations.

	\begin{remark}
		The DoG descriptor is scale and rotation invariant.
	\end{remark}
\end{description}

\subsection{Scale invariant feature transform (SIFT) descriptor}
\marginnote{SIFT descriptor}

Given a keypoint, the SIFT descriptor is computed as follows (a simplified code sketch is given at the end of this subsection):
\begin{enumerate}
	\item Center on the keypoint a $16 \times 16$ grid divided into $4 \times 4$ regions.
	\item Compute for each region its orientation histogram with eight bins (i.e. bins of size $45^\circ$).
	The Gaussian weighting function is centered on the keypoint and has $\sigma$ equal to half the grid size.
	\item The descriptor is obtained by concatenating the histograms of each region.
	This results in a feature vector with $128$ elements ($(4 \cdot 4) \cdot 8$).
	\item Normalize the descriptor to unit length. Components larger than $0.2$ are saturated and the vector is normalized again (for illumination invariance).
\end{enumerate}

\begin{figure}[H]
	\centering
	\includegraphics[width=0.6\linewidth]{./img/sift.png}
	\caption{SIFT descriptor example}
\end{figure}

\begin{description}
	\item[Trilinear interpolation]
	Bins are assigned in a soft manner to avoid boundary effects.
	The contribution of a pixel is spread between its two adjacent bins, weighted by the distance to the bin centers:\\
	\begin{minipage}{0.55\linewidth}
		\centering
		\includegraphics[width=0.6\linewidth]{./img/_sift_interpolation.pdf}
	\end{minipage}
	\begin{minipage}{0.3\linewidth}
		\[
			\begin{cases}
				\text{weight}_k = 1 - d_k \\
				\text{weight}_{k+1} = 1 - d_{k+1} \\
			\end{cases}
		\]
	\end{minipage}

	This is done both across the orientation bins within a region and across the four neighboring regions.
\end{description}

\begin{remark}
	The SIFT descriptor is scale, rotation and affine intensity change invariant.
\end{remark}
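A simplified NumPy sketch of the descriptor computation (rotation of the patch to the canonical orientation and trilinear interpolation are omitted for brevity; the keypoint is assumed to be far from the image border):
\begin{verbatim}
import numpy as np

def sift_like_descriptor(L, xk, yk):
    """Simplified 128-dimensional descriptor of the keypoint (xk, yk)
    computed on the Gaussian-smoothed image L: 16x16 patch, 4x4 regions,
    8 orientation bins per region."""
    # Gradients of the 16x16 patch by central differences (18x18 with border).
    patch = L[yk - 9:yk + 9, xk - 9:xk + 9].astype(np.float64)
    dx = patch[1:-1, 2:] - patch[1:-1, :-2]
    dy = patch[2:, 1:-1] - patch[:-2, 1:-1]
    mag = np.hypot(dx, dy)
    ori = np.arctan2(dy, dx) % (2 * np.pi)

    # Gaussian weighting centered on the keypoint, sigma = half the grid size.
    ys, xs = np.mgrid[-8:8, -8:8] + 0.5
    mag = mag * np.exp(-(xs**2 + ys**2) / (2 * 8.0**2))

    desc = []
    for ry in range(4):                      # 4x4 regions of 4x4 pixels
        for rx in range(4):
            m = mag[4*ry:4*ry + 4, 4*rx:4*rx + 4].ravel()
            o = ori[4*ry:4*ry + 4, 4*rx:4*rx + 4].ravel()
            h, _ = np.histogram(o, bins=8, range=(0, 2 * np.pi), weights=m)
            desc.append(h)
    desc = np.concatenate(desc)              # (4 * 4) * 8 = 128 elements

    # Normalize, saturate components at 0.2, normalize again.
    desc /= np.linalg.norm(desc) + 1e-7
    desc = np.minimum(desc, 0.2)
    desc /= np.linalg.norm(desc) + 1e-7
    return desc
\end{verbatim}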
\section{Matching}

Matching the keypoints across different views is a nearest-neighbor search problem. \marginnote{Nearest-neighbor search problem}

Given a target image $T$ and a reference image $R$,
we want to match each keypoint in $T$ to the most similar one in $R$ (usually using the Euclidean distance between descriptors).

\begin{description}
	\item[Matching criteria] \marginnote{Matching criteria}
	Given the distance $d_\text{NN}$ to the nearest-neighbor of a keypoint of $T$ in $R$,
	the match is accepted according to one of the following criteria:
	\begin{descriptionlist}
		\item[Threshold]
		Given a threshold $\tau$, the match is accepted iff:
		\[ d_\text{NN} \leq \tau \]

		\item[Ratio of distances]
		Given a threshold $\tau$ and the distance to the second nearest-neighbor $d_\text{NN2}$, the match is accepted iff:
		\[ \frac{d_\text{NN}}{d_\text{NN2}} \leq \tau \]
		This criterion avoids ambiguous matches where the two nearest neighbors are almost equally close.
		\begin{figure}[H]
			\centering
			\begin{subfigure}{0.4\linewidth}
				\centering
				\includegraphics[width=0.2\linewidth]{./img/_nn_matching_example1.pdf}
				\caption{Non ambiguous match}
			\end{subfigure}
			\begin{subfigure}{0.4\linewidth}
				\centering
				\includegraphics[width=0.2\linewidth]{./img/_nn_matching_example2.pdf}
				\caption{Ambiguous match}
			\end{subfigure}
		\end{figure}
	\end{descriptionlist}

	\begin{remark}
		It has been empirically shown that a ratio threshold $\tau = 0.8$ rejects $90\%$ of the wrong matches while missing only $5\%$ of the correct ones.
	\end{remark}

	\item[Efficient NN search] \marginnote{Efficient NN search}
	As an exhaustive search is inefficient, indexing techniques may be employed
	(a sketch of ratio-test matching with a k-d tree is given at the end of this section).

	The main indexing technique applied for feature matching is the k-d tree in the best bin first (BBF) variant.

	\begin{remark}
		Best bin first is an approximate search: it is not guaranteed to return the exact nearest neighbor.
	\end{remark}
\end{description}
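A minimal sketch of descriptor matching with the ratio-of-distances criterion, using an exact k-d tree from SciPy (rather than the approximate best bin first variant):
\begin{verbatim}
import numpy as np
from scipy.spatial import cKDTree

def match_descriptors(des_T, des_R, ratio=0.8):
    """Match each descriptor of the target image T against the reference
    image R using the ratio-of-distances criterion (Euclidean distance).
    Returns a list of (index in T, index in R) pairs."""
    tree = cKDTree(des_R)
    # For every target descriptor, retrieve the two nearest reference ones.
    dists, idxs = tree.query(des_T, k=2)
    matches = []
    for i, ((d_nn, d_nn2), (j, _)) in enumerate(zip(dists, idxs)):
        if d_nn <= ratio * d_nn2:        # accept only unambiguous matches
            matches.append((i, j))
    return matches
\end{verbatim}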