\chapter{Local features}

\begin{description}
    \item[Correspondence points] \marginnote{Correspondence points}
    Image points that are projections of the same 3D point in different views of the scene.

    \begin{example}[Homography]
        Align two images of the same scene to create a larger image.
        Estimating a homography requires at least 4 correspondences.
        To find them, one proceeds as follows (a sketch follows this example):
        \begin{itemize}
            \item Independently find salient points in the two images.
            \item Compute a local description of each salient point.
            \item Compare descriptions to find matching points.
        \end{itemize}
    \end{example}
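    The following is a minimal OpenCV sketch of this pipeline, under the assumption that two overlapping grayscale photos are available; the file names, the brute-force matcher and the use of RANSAC are illustrative choices, not part of the original example.
\begin{verbatim}
import cv2
import numpy as np

# Hypothetical overlapping views of the same scene.
img1 = cv2.imread("view1.jpg", cv2.IMREAD_GRAYSCALE)
img2 = cv2.imread("view2.jpg", cv2.IMREAD_GRAYSCALE)

# 1) Detect salient points and 2) describe them locally.
sift = cv2.SIFT_create()
kp1, des1 = sift.detectAndCompute(img1, None)
kp2, des2 = sift.detectAndCompute(img2, None)

# 3) Match the descriptions (brute force, L2 distance).
matches = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True).match(des1, des2)

# Estimate the homography from the correspondence points
# (at least 4 are needed; RANSAC discards wrong matches).
src = np.float32([kp1[m.queryIdx].pt for m in matches]).reshape(-1, 1, 2)
dst = np.float32([kp2[m.trainIdx].pt for m in matches]).reshape(-1, 1, 2)
H, mask = cv2.findHomography(src, dst, cv2.RANSAC, 5.0)

# Warp img1 into img2's frame to compose the larger image.
warped = cv2.warpPerspective(img1, H, (img2.shape[1] * 2, img2.shape[0]))
\end{verbatim}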
    \item[Local invariant features] \marginnote{Local invariant features}
    Find correspondences in three steps:
    \begin{descriptionlist}
        \item[Detection] \marginnote{Detection}
        Find salient points (keypoints).

        The detector should have the following properties:
        \begin{descriptionlist}
            \item[Repeatability] Find the same keypoints across different images.
            \item[Saliency] Find keypoints surrounded by informative patterns.
            \item[Fast] As it must scan the entire image.
        \end{descriptionlist}

        \item[Description] \marginnote{Description}
        Compute a descriptor for each salient point based on its neighborhood.

        A descriptor should have the following properties:
        \begin{descriptionlist}
            \item[Invariant] Robust to as many transformations as possible (e.g. illumination, weather, scaling, viewpoint, \dots).
            \item[Distinctiveness/robustness trade-off] The description should capture only the important information around a keypoint and ignore irrelevant features or noise.
            \item[Compactness] The description should be concise.
        \end{descriptionlist}

        \item[Matching] \marginnote{Matching}
        Identify the same descriptor across different images.
    \end{descriptionlist}
\end{description}


\begin{remark}
    Edges are not good interest points as they are locally ambiguous: pixels are very similar along the edge direction (orthogonal to the gradient), so a point cannot be precisely localized along the edge.

    Corners, on the other hand, are better suited as they exhibit a large intensity variation along all directions.
\end{remark}

\section{Moravec's corner detector}
\marginnote{Moravec's corner detector}

Given a window $W$ of size $n \times n$,
the cornerness of a pixel $p$ is given by the minimum squared difference between
the intensity of $W$ centered on $p$ and the intensity of $W$ centered on each of its neighbors:
\[ C(p) = \min_{q \in \mathcal{N}(p)} \Vert W(p) - W(q) \Vert^2 \]

After computing the cornerness of each pixel, one can apply thresholding and then non-maxima suppression (NMS) to obtain a binary map where $1$ indicates a corner.

\begin{figure}[H]
    \centering
    \begin{subfigure}{0.3\linewidth}
        \includegraphics[width=0.9\linewidth]{./img/_corner_detector_example_flat.pdf}
        \caption{Flat region: $C(p)$ is low.}
    \end{subfigure}
    \begin{subfigure}{0.3\linewidth}
        \includegraphics[width=0.8\linewidth]{./img/_corner_detector_example_edge.pdf}
        \caption{Edge: $C(p)$ is low.}
    \end{subfigure}
    \begin{subfigure}{0.3\linewidth}
        \includegraphics[width=0.8\linewidth]{./img/_corner_detector_example_corner.pdf}
        \caption{Corner: $C(p)$ is high.}
    \end{subfigure}
\end{figure}
\begin{remark}
    Moravec's corner detector considers only a discrete set of shift directions (the neighbors of $p$), which makes its response anisotropic (i.e. dependent on the direction); Harris' corner detector addresses this issue.
\end{remark}
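A minimal NumPy sketch of this cornerness computation, assuming a grayscale image stored as a float array; the window size, the synthetic test image and the threshold are illustrative choices:
\begin{verbatim}
import numpy as np
from scipy.ndimage import maximum_filter

def moravec_cornerness(img, n=3):
    """Minimum SSD between the window centered on each pixel and the
    windows centered on its 8 neighbors (borders are left at 0)."""
    h = n // 2
    C = np.zeros_like(img, dtype=np.float64)
    shifts = [(-1, -1), (-1, 0), (-1, 1), (0, -1),
              (0, 1), (1, -1), (1, 0), (1, 1)]
    for y in range(h + 1, img.shape[0] - h - 1):
        for x in range(h + 1, img.shape[1] - h - 1):
            W_p = img[y - h:y + h + 1, x - h:x + h + 1]
            ssd = [np.sum((W_p - img[y + dy - h:y + dy + h + 1,
                                     x + dx - h:x + dx + h + 1]) ** 2)
                   for dy, dx in shifts]
            C[y, x] = min(ssd)
    return C

# Tiny synthetic example: a white square on a black background.
img = np.zeros((20, 20))
img[5:15, 5:15] = 1.0
C = moravec_cornerness(img)
# Thresholding followed by NMS (keep only local maxima).
corners = (C > 0.1 * C.max()) & (C == maximum_filter(C, size=3))
print(np.argwhere(corners))   # the four corners of the square
\end{verbatim}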
\section{Harris' corner detector}

\subsection{Structure matrix}

Harris' corner detector uses an error function formulated as the continuous version of Moravec's detector and
considers an infinitesimal shift $(\Delta x, \Delta y)$ of the image:
\[ E(\Delta x, \Delta y) = \sum_{x, y} w(x, y) \big( I(x+\Delta x, y+\Delta y) - I(x, y) \big)^2 \]
where $w(x, y)$ is a window function that can be seen as a mask with value $1$ when $(x, y)$ belongs to the window and $0$ otherwise.

By employing the first-order Taylor expansion $f(x + \Delta x) \approx f(x) + f'(x) \Delta x$,
we can approximate the intensity difference as:
\[
    \begin{split}
        I(x+\Delta x, y+\Delta y) - I(x, y) &\approx \big( I(x, y) + \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y \big) - I(x, y) \\
        &= \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y
    \end{split}
\]

By developing the error function into matrix form, we obtain the following:
\[
    \begin{split}
        E(\Delta x, \Delta y) &= \sum_{x, y} w(x, y) \big( I(x+\Delta x, y+\Delta y) - I(x, y) \big)^2 \\
        &= \sum_{x, y} w(x, y) \big( \partial_x I(x, y)\Delta x + \partial_y I(x, y)\Delta y \big)^2 \\
        &= \sum_{x, y} w(x, y) \big( (\partial_x I(x, y))^2\Delta x^2 + 2 \partial_x I(x, y) \partial_y I(x, y) \Delta x \Delta y + (\partial_y I(x, y))^2\Delta y^2 \big) \\
        &= \sum_{x, y} w(x, y) \left(
            \begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
            \begin{bmatrix}
                (\partial_x I(x, y))^2 & \partial_x I(x, y) \partial_y I(x, y) \\
                \partial_x I(x, y) \partial_y I(x, y) & (\partial_y I(x, y))^2
            \end{bmatrix}
            \begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix}
        \right) \\
        &= \begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
            \begin{bmatrix}
                \sum_{x, y} w(x, y) (\partial_x I(x, y))^2 & \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) \\
                \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) & \sum_{x, y} w(x, y) (\partial_y I(x, y))^2
            \end{bmatrix}
            \begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix} \\
        &= \begin{bmatrix} \Delta x & \Delta y \end{bmatrix}
            \matr{M}
            \begin{bmatrix} \Delta x \\ \Delta y \end{bmatrix}
    \end{split}
\]
\begin{description}
    \item[Structure matrix] \marginnote{Structure matrix}
    Matrix $\matr{M}_w$ that encodes the local structure of the image at the pixels within a window $w$:
    \[ \matr{M}_w = \begin{pmatrix}
        \sum_{x, y} w(x, y) (\partial_x I(x, y))^2 & \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) \\
        \sum_{x, y} w(x, y) (\partial_x I(x, y) \partial_y I(x, y)) & \sum_{x, y} w(x, y) (\partial_y I(x, y))^2
    \end{pmatrix} \]

    $\matr{M}_w$ is real and symmetric, thus it is diagonalizable through an orthogonal matrix $\matr{R}$:
    \[ \matr{M}_w = \matr{R} \begin{pmatrix} \lambda_1^{(w)} & 0 \\ 0 & \lambda_2^{(w)} \end{pmatrix} \matr{R}^T \]
    $\matr{R}^T$ is the rotation that aligns the shift coordinates with the eigenvectors of $\matr{M}_w$,
    while the eigenvalues remain the same for any rotation of the same patch.

    Therefore, expressing the shift in the rotated frame $(\Delta x', \Delta y')$ defined by the eigenvectors,
    the eigenvalues $\lambda_1^{(w)}, \lambda_2^{(w)}$ of $\matr{M}_w$ measure the intensity change along the two principal directions:
    \[
        \begin{split}
            E(\Delta x', \Delta y') &= \begin{pmatrix} \Delta x' & \Delta y' \end{pmatrix}
                \begin{pmatrix} \lambda_1^{(w)} & 0 \\ 0 & \lambda_2^{(w)} \end{pmatrix}
                \begin{pmatrix} \Delta x' \\ \Delta y' \end{pmatrix} \\
            &= \lambda_1^{(w)} (\Delta x')^2 + \lambda_2^{(w)} (\Delta y')^2
        \end{split}
    \]

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.6\linewidth]{./img/_harris_rotation.pdf}
        \caption{Eigenvalues relationship at different regions of an image.}
    \end{figure}
\end{description}
\subsection{Algorithm}
\marginnote{Harris' corner detector}

As computing the eigenvalues of $\matr{M}_w$ at each pixel is expensive, a more efficient cornerness function is the following:
\[
    \begin{split}
        C(x, y) &= \lambda_1^{(w)}\lambda_2^{(w)} - k(\lambda_1^{(w)} + \lambda_2^{(w)})^2 \\
        &= \det(\matr{M}_{w(x,y)}) - k \cdot \text{trace}(\matr{M}_{w(x,y)})^2
    \end{split}
\]
where $k$ is a hyperparameter (empirically chosen in $[0.04, 0.06]$).

\begin{figure}[H]
    \centering
    \includegraphics[width=0.5\linewidth]{./img/_harris_efficient.pdf}
    \caption{Cornerness at different regions.}
\end{figure}

After computing the cornerness of each pixel, one can apply thresholding and then NMS.

\begin{remark}
    The window function $w(x, y)$ in the original work is a Gaussian weighting function.
\end{remark}
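A minimal NumPy/SciPy sketch of this cornerness computation, assuming a grayscale float image; the derivative operator, the $\sigma$ of the Gaussian window, $k$ and the threshold are illustrative choices:
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter, maximum_filter

def harris_cornerness(img, sigma=1.5, k=0.05):
    # Image derivatives (central differences; axis 0 is y, axis 1 is x).
    Iy, Ix = np.gradient(img.astype(np.float64))
    # Entries of the structure matrix, aggregated with a Gaussian window w.
    Sxx = gaussian_filter(Ix * Ix, sigma)
    Syy = gaussian_filter(Iy * Iy, sigma)
    Sxy = gaussian_filter(Ix * Iy, sigma)
    # C = det(M) - k * trace(M)^2, computed for every pixel at once.
    return (Sxx * Syy - Sxy ** 2) - k * (Sxx + Syy) ** 2

# Thresholding followed by NMS on a synthetic white square.
img = np.zeros((40, 40))
img[10:30, 10:30] = 1.0
C = harris_cornerness(img)
corners = (C > 0.01 * C.max()) & (C == maximum_filter(C, size=5))
print(np.argwhere(corners))   # points near the four corners of the square
\end{verbatim}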
\subsection{Properties}

Harris' corner detector enjoys the following properties:
\begin{descriptionlist}
    \item[Rotation invariance]
    The eigenvalues are invariant to a rotation of the image.

    \item[No affine intensity change invariance]
    An affine intensity change of a signal consists of a gain factor and the addition of a bias (i.e. $I' = \alpha I + \beta$).
    A small numeric check is sketched at the end of this subsection.
    \begin{description}
        \item[Invariance to bias]
        Harris' detector is invariant to an additive bias ($I' = I + \beta$) as a consequence of the approximate derivative computation:
        \[
            \partial_x I'(i, j) = I'(i, j+1) - I'(i, j) = (I(i, j+1) + \cancel{\beta}) - (I(i, j) + \cancel{\beta})
        \]
        \item[No invariance to gain factor]
        Harris' detector is not invariant to a gain factor ($I' = \alpha I$) as the multiplicative factor is carried into the derivatives.
    \end{description}
    \begin{remark}
        In other words, Harris' detector is not illumination invariant.
    \end{remark}

    \item[No scale invariance]
    Harris' detector is not scale invariant: the fixed window size makes it impossible to recognize the same features when the image is scaled.
\end{descriptionlist}
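As a quick numeric check of the two cases above, the following sketch (with arbitrary values for $\alpha$ and $\beta$) compares finite-difference derivatives before and after an affine intensity change:
\begin{verbatim}
import numpy as np

I = np.array([10.0, 12.0, 15.0, 15.0, 11.0])   # a 1D intensity profile
alpha, beta = 2.0, 30.0                         # illustrative gain and bias

d_original = np.diff(I)          # approximate derivative of I
d_bias = np.diff(I + beta)       # bias only: derivative is unchanged
d_gain = np.diff(alpha * I)      # gain only: derivative is scaled by alpha

print(np.allclose(d_original, d_bias))           # True  -> invariant to bias
print(np.allclose(d_original, d_gain))           # False -> not invariant to gain
print(np.allclose(alpha * d_original, d_gain))   # True  -> scaled by the gain
\end{verbatim}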
\section{Multi-scale feature detector}

% \subsection{Scale invariance}

Depending on the scale, an image may exhibit more or fewer details.
A naive approach consists of using a smaller window size for images at a smaller scale,
but this does not always capture the same features, as the amount of visible detail differs between scales.

\begin{description}
    \item[Scale-space] \marginnote{Scale-space}
    One-parameter family of images obtained by increasingly smoothing the input image.

    \begin{remark}
        When smoothing, small details should disappear and no new structures should be introduced.
    \end{remark}

    \begin{remark}
        It is possible to use the same window size when working with scale-space images.
    \end{remark}

    \begin{description}
        \item[Gaussian scale-space] \marginnote{Gaussian scale-space}
        Scale-space obtained using Gaussian smoothing:
        \[ L(x, y, \sigma) = I(x, y) * G(x, y, \sigma) \]
        where the standard deviation $\sigma$ also acts as the scale parameter.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.8\linewidth]{./img/_scale_space_example.pdf}
            \caption{Gaussian scale-space example}
        \end{figure}
    \end{description}
\end{description}
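A minimal SciPy sketch of how such a Gaussian scale-space can be built (the base $\sigma$, the scale factor and the number of levels are illustrative assumptions):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter

def gaussian_scale_space(img, sigma0=1.6, k=2 ** 0.5, levels=6):
    """List of images L(x, y, sigma0 * k^i) for i = 0, ..., levels-1."""
    img = img.astype(np.float64)
    return [gaussian_filter(img, sigma0 * k ** i) for i in range(levels)]
\end{verbatim}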
\subsection{Scale-normalized LoG blob detection}

\begin{description}
    \item[Scale-normalized Laplacian of Gaussian] \marginnote{Scale-normalized Laplacian of Gaussian}
    LoG scaled by a factor of $\sigma^2$:
    \[ F(x, y, \sigma) = \sigma^2 \nabla^{(2)} L(x, y, \sigma) = \sigma^2 \big( I(x, y) * \nabla^{(2)} G(x, y, \sigma) \big) \]
    The factor $\sigma^2$ compensates for the decay of the derivative response when the scale ($\sigma$) is large.

    \item[Characteristic scale] Scale $\sigma$ that produces a peak in the Laplacian response at a given pixel \cite{slides:scale_normalized_log}.

    \item[Algorithm] \marginnote{Scale-normalized LoG blob detection}
    Blob (circle) detection using the scale-normalized LoG works as follows \cite{slides:scale_normalized_log} (a sketch follows this description):
    \begin{enumerate}
        \item Create a Gaussian scale-space by applying the scale-normalized Laplacian of Gaussian with different values of $\sigma$.
        \item For each pixel, find the characteristic scale and its corresponding Laplacian response across the scale-space (automatic scale selection).
        \item Filter out the pixels whose response is lower than a threshold and apply NMS.
        \item The remaining pixels are the centers of the blobs.
        It can be shown that the radius is given by $r = \sigma\sqrt{2}$.
    \end{enumerate}
\end{description}
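A minimal SciPy sketch of the procedure above, assuming a grayscale float image; the set of scales and the threshold are illustrative, and for brevity only one blob polarity (maxima, after flipping the sign of the response) is handled:
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_laplace, maximum_filter

def log_blob_detection(img, sigmas=(1.6, 2.3, 3.2, 4.5, 6.4), threshold=0.2):
    img = img.astype(np.float64)
    # Scale-normalized LoG responses, one per scale (sign flipped so that
    # bright blobs on a dark background become maxima).
    F = np.stack([-(s ** 2) * gaussian_laplace(img, s) for s in sigmas])
    # Automatic scale selection: characteristic scale = argmax over sigma.
    best = F.argmax(axis=0)
    response = F.max(axis=0)
    # Thresholding + NMS on the response map.
    peaks = (response > threshold * response.max()) & (
        response == maximum_filter(response, size=3))
    ys, xs = np.nonzero(peaks)
    radii = np.sqrt(2) * np.array(sigmas)[best[ys, xs]]
    return list(zip(ys, xs, radii))   # blob centers and radii r = sigma * sqrt(2)
\end{verbatim}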
When detecting a peak, there are two cases:
\begin{descriptionlist}
    \item[Maximum] Dark blobs on a light background.
    \item[Minimum] Light blobs on a dark background.
\end{descriptionlist}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\linewidth]{./img/LOG_blob_detection_example.png}
    \caption{Example of application of the algorithm}
\end{figure}

\begin{remark}
    The intuitive idea behind the detection algorithm is the following:
    \begin{itemize}
        \item $F(x, y, \sigma)$ keeps growing for values of $\sigma$ whose support captures areas within the blob (i.e. with similar intensity).
        \item The Laplacian reaches its peak when its weights capture the entire blob (effectively, the filter matches the blob boundary).
        \item After the peak, the LoG filter also captures intensities outside the blob and the response therefore decreases.
    \end{itemize}
\end{remark}

\begin{remark}
    Using different scales creates the effect of searching in a 3D space (two spatial coordinates plus the scale).
\end{remark}

\begin{remark}
    Scale-normalized LoG blob detection is scale and rotation invariant.
\end{remark}

\begin{remark}
    It empirically holds that, given two points representing the centers of two blobs,
    the ratio between the two characteristic scales is approximately the ratio between the diameters of the two blobs.

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.4\linewidth]{./img/_scaled_log_blob_diameter.pdf}
        \caption{
            \begin{varwidth}[t]{0.55\linewidth}
                Scale-normalized LoG computed for varying $\sigma$.\\
                Note that, in the second image, the characteristic scale is
                higher as the blob is larger.
            \end{varwidth}
        }
    \end{figure}
\end{remark}
\subsection{Difference of Gaussians blob detection}

\begin{description}
    \item[Difference of Gaussians (DoG)] \marginnote{Difference of Gaussians (DoG)}
    Approximation of the scale-normalized LoG computed as:
    \[
        \begin{split}
            \texttt{DoG}(x, y, \sigma) &= \big( G(x, y, k\sigma) - G(x, y, \sigma) \big) * I(x, y) \\
            &= L(x, y, k\sigma) - L(x, y, \sigma)
        \end{split}
    \]

    \begin{theorem}
        It can be proven that the DoG kernel is a scaled version of the scale-normalized LoG kernel:
        \[ G(x, y, k\sigma) - G(x, y, \sigma) \approx (k-1)\sigma^2 \nabla^{(2)}G(x, y, \sigma) \]

        \begin{remark}
            As we are only interested in extrema, the constant scaling factor $(k-1)$ is irrelevant.
        \end{remark}
    \end{theorem}
    \item[Extrema detection] \marginnote{DoG extrema}
    Given three DoG images with scales $\sigma_i$, $\sigma_{i-1}$ and $\sigma_{i+1}$,
    a pixel $(x, y, \sigma_i)$ is an extremum (i.e. keypoint) iff:
    \begin{itemize}
        \item It is an extremum in the $3 \times 3$ patch centered on it ($8$ pixels, as $(x, y, \sigma_i)$ itself is excluded).
        \item It is an extremum in the $3 \times 3$ patches centered on the pixels at $(x, y, \sigma_{i-1})$ and at $(x, y, \sigma_{i+1})$ ($9+9$ pixels).
    \end{itemize}

    \begin{center}
        \includegraphics[width=0.35\linewidth]{./img/_DoG_extrema.pdf}
    \end{center}

    \item[Algorithm] \marginnote{DoG blob detection}
    To detect blob centers (i.e. DoG extrema), an octave of $s$ DoG images is computed as follows (a sketch is given at the end of this subsection):
    \begin{enumerate}
        \item Compute a scale-space $L$ of $s+1$ Gaussian smoothed images with $\sigma$ varying by a factor $k = 2^{1/s}$.
        As the extrema detection method requires checking the DoG images above and below,
        two additional Gaussians (with scales $k^{-1}\sigma$ and $k^{s+1}\sigma$) are computed.
        \item The DoG image $\texttt{DoG}(\cdot, \cdot, k^i\sigma)$ is obtained as the difference between the
        images $L(\cdot, \cdot, k^{i+1}\sigma)$ and $L(\cdot, \cdot, k^i\sigma)$ of the Gaussian scale-space.
        \begin{figure}[H]
            \small
            \centering
            \includegraphics[width=0.6\linewidth]{./img/_DoG_octave.pdf}
            \caption{Example of octave computation with $s=4$}
        \end{figure}
        \item Extrema detection is performed as described above across the $s$ DoG images.
        \item Points with a weak DoG response can be pruned through thresholding.
        Furthermore, it has been observed that strong DoG responses along edges are unstable, so they can also be pruned.
    \end{enumerate}
    Multiple octaves should be computed using different starting $\sigma$s.
    Instead of recomputing the Gaussian scale-space,
    it is possible to simply down-sample the already computed Gaussians and compute the DoG images starting from the shrunk smoothed images.

    \begin{remark}
        In the original work, the input image is first enlarged by a factor of 2.
        Then, four octaves are computed starting from the enlarged image
        (i.e. images of size factor $\times 2$, $\times 1$, $\times \frac{1}{2}$ and $\times \frac{1}{4}$ are considered).
    \end{remark}

    \begin{remark}
        The original work found that the best hyperparameters are $s=3$ and $\sigma=1.6$.
    \end{remark}
\end{description}

\begin{remark}
    DoG blob detection is scale and rotation invariant.
\end{remark}
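A minimal NumPy/SciPy sketch of one DoG octave and of the $3 \times 3 \times 3$ extrema test described above (the values of $s$, $\sigma$ and the threshold are illustrative):
\begin{verbatim}
import numpy as np
from scipy.ndimage import gaussian_filter, maximum_filter, minimum_filter

def dog_octave(img, s=3, sigma=1.6):
    """One octave: s+3 Gaussians -> s+2 DoG images
    (extrema are then searched in the middle s of them)."""
    k = 2 ** (1.0 / s)
    sigmas = [sigma * k ** i for i in range(-1, s + 2)]
    L = [gaussian_filter(img.astype(np.float64), si) for si in sigmas]
    return np.stack([L[i + 1] - L[i] for i in range(len(L) - 1)])

def dog_extrema(dog, threshold=0.01):
    """Keypoints: pixels that are extrema of their 3x3x3 neighborhood
    in the (scale, y, x) stack of DoG images."""
    is_max = dog == maximum_filter(dog, size=(3, 3, 3))
    is_min = dog == minimum_filter(dog, size=(3, 3, 3))
    keep = (is_max | is_min) & (np.abs(dog) > threshold)
    # The first and last DoG images have no scale below/above to compare with.
    keep[0], keep[-1] = False, False
    return np.argwhere(keep)   # rows of (scale index, y, x)
\end{verbatim}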
\section{Descriptor}

After finding the keypoints, a descriptor of each keypoint is computed from the pixels within a patch centered on it.


\subsection{DoG descriptor}

\begin{description}
    \item[Canonical / Characteristic orientation] \marginnote{Canonical / Characteristic orientation}
    Direction along which the gradient magnitudes of the pixels neighboring a keypoint are the highest.

    Given a pixel $(x, y)$, its gradient magnitude and direction are computed from the Gaussian smoothed image $L$:
    \[
        \begin{split}
            \vert \nabla L(x, y) \vert &= \sqrt{ \big( L(x+1, y) - L(x-1, y) \big)^2 + \big( L(x, y+1) - L(x, y-1) \big)^2 } \\
            \theta_L(x, y) &= \tan^{-1}\left( \frac{L(x, y+1) - L(x, y-1)}{L(x+1, y) - L(x-1, y)} \right)
        \end{split}
    \]
    \begin{description}
        \item[Orientation histogram] \marginnote{Orientation histogram}
        By dividing the directions into bins (e.g. bins of size $10^\circ$),
        it is possible to define for each keypoint a histogram by considering its neighboring pixels within a patch (a sketch follows this description).
        For each pixel $(x, y)$ neighboring a keypoint $(x_k, y_k)$, its contribution to the bin of direction $\theta_L(x, y)$ is given by:
        \[ G_{(x_k, y_k)}\left(x, y, \frac{3}{2} \sigma_s(x_k, y_k)\right) \cdot \vert \nabla L(x, y) \vert \]
        where $G_{(x_k, y_k)}$ is a Gaussian centered on the keypoint and $\sigma_s(x_k, y_k)$ is the scale of the keypoint.

        The characteristic orientation of a keypoint is given by the highest peak of the orientation histogram.
        Other peaks that reach at least $80\%$ of the main one are also considered characteristic orientations
        (i.e. a keypoint might have multiple canonical orientations and, therefore, multiple descriptors).

        For a more accurate estimation, a parabola is fit to each peak and its two adjacent bins,
        and the interpolated maximum of the parabola is used as the orientation.

        \begin{figure}[H]
            \centering
            \includegraphics[width=0.45\linewidth]{./img/_canonical_histogram.pdf}
            \caption{Orientation histogram and parabola interpolation}
        \end{figure}
    \end{description}
\end{description}
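A minimal NumPy sketch of such an orientation histogram for a single keypoint; the patch radius is an illustrative choice, the bin width ($10^\circ$) and the $\frac{3}{2}\sigma_s$ Gaussian weighting follow the description above, and the keypoint is assumed to be far enough from the image border:
\begin{verbatim}
import numpy as np

def orientation_histogram(L, xk, yk, sigma_s, n_bins=36, radius=8):
    """36 bins of 10 degrees, filled with Gaussian-weighted gradient magnitudes."""
    hist = np.zeros(n_bins)
    sigma_w = 1.5 * sigma_s
    for y in range(yk - radius, yk + radius + 1):
        for x in range(xk - radius, xk + radius + 1):
            dx = L[y, x + 1] - L[y, x - 1]    # horizontal derivative
            dy = L[y + 1, x] - L[y - 1, x]    # vertical derivative
            magnitude = np.hypot(dx, dy)
            theta = np.degrees(np.arctan2(dy, dx)) % 360.0
            weight = np.exp(-((x - xk) ** 2 + (y - yk) ** 2)
                            / (2 * sigma_w ** 2))
            hist[int(theta // (360.0 / n_bins))] += weight * magnitude
    # Canonical orientation(s): the main peak and any peak above 80% of it.
    peaks = np.where(hist >= 0.8 * hist.max())[0]
    return hist, peaks * (360.0 / n_bins)
\end{verbatim}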
\begin{description}
    \item[DoG descriptor] \marginnote{DoG descriptor}
    Keypoints are found using the DoG detector and the descriptors are computed from patches aligned with the canonical orientations.

    \begin{remark}
        The DoG descriptor is scale and rotation invariant.
    \end{remark}
\end{description}
\subsection{Scale invariant feature transform (SIFT) descriptor}
\marginnote{SIFT descriptor}

Given a keypoint, the SIFT descriptor is computed as follows (a usage sketch is given at the end of this subsection):
\begin{enumerate}
    \item Center on the keypoint a $16 \times 16$ pixel grid, divided into $4 \times 4$ regions.
    \item Compute for each region its orientation histogram with eight bins (i.e. bins of size $45^\circ$).
    The Gaussian weighting function is centered on the keypoint and has $\sigma$ equal to half the grid size.
    \item The descriptor is obtained by concatenating the histograms of the regions.
    This results in a feature vector with $128$ elements ($(4 \cdot 4) \cdot 8$).
    \item Normalize the descriptor to unit length. Components larger than $0.2$ are saturated to $0.2$ and the vector is normalized again (for illumination invariance).
\end{enumerate}

\begin{figure}[H]
    \centering
    \includegraphics[width=0.6\linewidth]{./img/sift.png}
    \caption{SIFT descriptor example}
\end{figure}
\begin{description}
    \item[Trilinear interpolation]
    Bins are assigned in a soft manner to avoid boundary effects.
    The contribution of a pixel is spread between its two adjacent bins, weighted by the distance to the bin centers:\\
    \begin{minipage}{0.55\linewidth}
        \centering
        \includegraphics[width=0.6\linewidth]{./img/_sift_interpolation.pdf}
    \end{minipage}
    \begin{minipage}{0.3\linewidth}
        \[
            \begin{cases}
                \text{weight}_k = 1 - d_k \\
                \text{weight}_{k+1} = 1 - d_{k+1}
            \end{cases}
        \]
    \end{minipage}

    This is done both on the histogram within a region and across the histograms of the four neighboring regions.
\end{description}

\begin{remark}
    The SIFT descriptor is scale, rotation and affine intensity change invariant.
\end{remark}
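OpenCV ships an implementation of the DoG + SIFT pipeline described in this chapter; a minimal usage sketch (the file name is a placeholder):
\begin{verbatim}
import cv2

img = cv2.imread("scene.jpg", cv2.IMREAD_GRAYSCALE)   # placeholder image

sift = cv2.SIFT_create()   # DoG keypoint detection + SIFT description
keypoints, descriptors = sift.detectAndCompute(img, None)

print(len(keypoints))       # number of detected keypoints
print(descriptors.shape)    # (number of keypoints, 128)
print(keypoints[0].pt,      # keypoint position,
      keypoints[0].size,    # scale,
      keypoints[0].angle)   # and canonical orientation (degrees)
\end{verbatim}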
\section{Matching}

Matching the keypoints across different views is a nearest-neighbor search problem. \marginnote{Nearest-neighbor search problem}

Given a target image $T$ and a reference image $R$,
we want to match each keypoint in $T$ to the most similar one in $R$ (usually using the Euclidean distance between descriptors).
\begin{description}
    \item[Matching criteria] \marginnote{Matching criteria}
    Given the distance $d_\text{NN}$ between a keypoint of $T$ and its nearest-neighbor in $R$,
    the match is accepted according to one of the following criteria:
    \begin{descriptionlist}
        \item[Threshold]
        Given a threshold $t$, the match is accepted iff:
        \[ d_\text{NN} \leq t \]

        \item[Ratio of distances]
        Given a threshold $t$ and the distance $d_\text{NN2}$ to the second nearest-neighbor, the match is accepted iff:
        \[ \frac{d_\text{NN}}{d_\text{NN2}} \leq t \]
        This criterion avoids ambiguous matches when the two nearest neighbors are almost equally close.
        \begin{figure}[H]
            \centering
            \begin{subfigure}{0.4\linewidth}
                \centering
                \includegraphics[width=0.2\linewidth]{./img/_nn_matching_example1.pdf}
                \caption{Non ambiguous match}
            \end{subfigure}
            \begin{subfigure}{0.4\linewidth}
                \centering
                \includegraphics[width=0.2\linewidth]{./img/_nn_matching_example2.pdf}
                \caption{Ambiguous match}
            \end{subfigure}
        \end{figure}
    \end{descriptionlist}

    \begin{remark}
        It has been empirically shown that a ratio threshold $t=0.8$ rejects $90\%$ of the wrong matches while missing only $5\%$ of the correct ones.
    \end{remark}
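    A minimal OpenCV sketch of matching with the ratio of distances criterion; the descriptors below are random placeholders standing in for real SIFT descriptors of $T$ and $R$, and the $0.8$ threshold follows the remark above:
\begin{verbatim}
import cv2
import numpy as np

# Placeholder descriptors (in practice, the 128-dimensional SIFT
# descriptors of the target and reference images).
rng = np.random.default_rng(0)
des_T = rng.random((50, 128), dtype=np.float32)
des_R = rng.random((80, 128), dtype=np.float32)

bf = cv2.BFMatcher(cv2.NORM_L2)
knn = bf.knnMatch(des_T, des_R, k=2)   # 2 nearest neighbors in R per keypoint of T

# Ratio of distances: accept iff d_NN / d_NN2 <= t (here t = 0.8).
good = [nn for nn, nn2 in knn if nn.distance <= 0.8 * nn2.distance]
print(len(good), "matches accepted")
\end{verbatim}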
    \item[Efficient NN search] \marginnote{Efficient NN search}
    As an exhaustive search is inefficient, indexing techniques may be employed.

    The main indexing technique applied for feature matching is the k-d tree in its best bin first (BBF) variant.

    \begin{remark}
        Best bin first is an approximate search: it is not guaranteed to return the exact nearest neighbor.
    \end{remark}
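    As a readily available (exact, not best bin first) alternative, SciPy's k-d tree can index the reference descriptors; a minimal sketch with the same placeholder descriptors as above:
\begin{verbatim}
import numpy as np
from scipy.spatial import cKDTree

rng = np.random.default_rng(0)
des_T = rng.random((50, 128))   # target descriptors (placeholders)
des_R = rng.random((80, 128))   # reference descriptors (placeholders)

tree = cKDTree(des_R)                  # index the reference descriptors
dists, idxs = tree.query(des_T, k=2)   # 2-NN search for every target descriptor

# Ratio of distances test on the two returned neighbors.
accepted = dists[:, 0] / dists[:, 1] <= 0.8
print(int(accepted.sum()), "matches accepted")
\end{verbatim}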
\end{description}