diff --git a/src/image-processing-and-computer-vision/module1/img/_DoG_extrema.pdf b/src/image-processing-and-computer-vision/module1/img/_DoG_extrema.pdf
new file mode 100644
index 0000000..2e4b8fb
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_DoG_extrema.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/_DoG_octave.pdf b/src/image-processing-and-computer-vision/module1/img/_DoG_octave.pdf
new file mode 100644
index 0000000..fd05511
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_DoG_octave.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/_canonical_histogram.pdf b/src/image-processing-and-computer-vision/module1/img/_canonical_histogram.pdf
new file mode 100644
index 0000000..28d7ba7
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_canonical_histogram.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/_nn_matching_example1.pdf b/src/image-processing-and-computer-vision/module1/img/_nn_matching_example1.pdf
new file mode 100644
index 0000000..cc5ec89
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_nn_matching_example1.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/_nn_matching_example2.pdf b/src/image-processing-and-computer-vision/module1/img/_nn_matching_example2.pdf
new file mode 100644
index 0000000..3ef3838
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_nn_matching_example2.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/_sift_interpolation.pdf b/src/image-processing-and-computer-vision/module1/img/_sift_interpolation.pdf
new file mode 100644
index 0000000..e893cda
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/_sift_interpolation.pdf differ
diff --git a/src/image-processing-and-computer-vision/module1/img/sift.png b/src/image-processing-and-computer-vision/module1/img/sift.png
new file mode 100644
index 0000000..b0b1ec8
Binary files /dev/null and b/src/image-processing-and-computer-vision/module1/img/sift.png differ
diff --git a/src/image-processing-and-computer-vision/module1/sections/_edge_detection.tex b/src/image-processing-and-computer-vision/module1/sections/_edge_detection.tex
index 0d0be37..d6908a9 100644
--- a/src/image-processing-and-computer-vision/module1/sections/_edge_detection.tex
+++ b/src/image-processing-and-computer-vision/module1/sections/_edge_detection.tex
@@ -354,9 +354,9 @@ the computation can be reduced to 1D convolutions:
 \end{description}
 
-\subsection{Laplacian of Gaussian (LOG)}
+\subsection{Laplacian of Gaussian (LoG)}
 
-Laplacian of Gaussian (LOG) does the following: \marginnote{Laplacian of Gaussian (LOG)}
+Laplacian of Gaussian (LoG) does the following: \marginnote{Laplacian of Gaussian (LoG)}
 \begin{enumerate}
     \item Gaussian smoothing.
     \item Second-order differentiation using the Laplacian filter.
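The LoG hunk above describes a two-step pipeline (Gaussian smoothing, then the Laplacian). A minimal sketch of that pipeline, assuming SciPy is available; the input image is a synthetic stand-in:

\begin{verbatim}
import numpy as np
from scipy import ndimage

# Stand-in grayscale image; any 2D float array works.
img = np.zeros((128, 128), dtype=np.float32)
img[40:90, 30:100] = 1.0  # a bright rectangle to produce edges

# LoG = Gaussian smoothing + Laplacian, fused into one filter
# by scipy.ndimage.gaussian_laplace.
response = ndimage.gaussian_laplace(img, sigma=2.0)

# Edges lie on the zero-crossings of the LoG response; one cheap
# approximation is the boundary of the positive-response region.
positive = response > 0
edges = positive ^ ndimage.binary_erosion(positive)
\end{verbatim}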
diff --git a/src/image-processing-and-computer-vision/module1/sections/_local_features.tex b/src/image-processing-and-computer-vision/module1/sections/_local_features.tex
index 3d47cff..492b6fc 100644
--- a/src/image-processing-and-computer-vision/module1/sections/_local_features.tex
+++ b/src/image-processing-and-computer-vision/module1/sections/_local_features.tex
@@ -246,24 +246,21 @@ but this is not always able to capture the same features due to the details diff
         \caption{Gaussian scale-space example}
     \end{figure}
     \end{description}
-
-    \item[Scale-normalized Laplacian of Gaussian] \marginnote{Scale-normalized Laplacian of Gaussian}
-    LOG scaled by a factor of $\sigma^2$:
-    \[ F(x, y, \sigma) = \sigma^2 \nabla^2 L(x, y, \sigma) = \sigma^2 (I(x, y) * \nabla^2 G(x, y, \sigma)) \]
-    $\sigma^2$ avoids small derivatives when the scaling ($\sigma$) is large.
 \end{description}
 
-\subsection{Blob detection}
-\marginnote{Scale-normalized LOG blob detection}
-
-Scale-normalized LOG allows the detection of blobs (circles) in an image.
+\subsection{Scale-normalized LoG blob detection}
 
 \begin{description}
+    \item[Scale-normalized Laplacian of Gaussian] \marginnote{Scale-normalized Laplacian of Gaussian}
+    LoG scaled by a factor of $\sigma^2$:
+    \[ F(x, y, \sigma) = \sigma^2 \nabla^{(2)} L(x, y, \sigma) = \sigma^2 (I(x, y) * \nabla^{(2)} G(x, y, \sigma)) \]
+    The $\sigma^2$ factor compensates for derivative responses shrinking as the scale ($\sigma$) grows, keeping responses comparable across scales.
+
     \item[Characteristic scale]
     Scale $\sigma$ that produces a peak in the Laplacian response at a given pixel \cite{slides:scale_normalized_log}.
 
-    \item[Algorithm]
-    Blob detection using scale-normalized LOG works as follows \cite{slides:scale_normalized_log}:
+    \item[Algorithm] \marginnote{Scale-normalized LoG blob detection}
+    Blob (circle) detection using scale-normalized LoG works as follows \cite{slides:scale_normalized_log}:
     \begin{enumerate}
         \item Create a Gaussian scale-space by applying the scale-normalized Laplacian of Gaussian with different values of $\sigma$.
         \item For each pixel, find the characteristic scale and its corresponding Laplacian response across the scale-space (automatic scale selection).
@@ -291,7 +288,7 @@ When detecting a peak, there are two cases:
     \begin{itemize}
         \item $F(x, y, \sigma)$ keeps growing for $\sigma$s that capture areas within the blob (i.e. with similar intensity).
         \item The Laplacian reaches its peak when its weights capture the entire blob (virtually it detects an edge).
-        \item After the peak, the LOG filter will also capture intensities outside the blob and therefore decrease.
+        \item After the peak, the LoG filter will also capture intensities outside the blob and therefore decrease.
     \end{itemize}
 \end{remark}
 
@@ -299,19 +296,244 @@ When detecting a peak, there are two cases:
     Using different scales creates the effect of searching in a 3D space.
 \end{remark}
 
+\begin{remark}
+    Scale-normalized LoG blob detection is scale and rotation invariant.
+\end{remark}
+
 \begin{remark}
     It empirically holds that, given two points representing the centers of two blobs,
     the ratio between the two characteristic scales is approximately the ratio between the diameters of the two blobs.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.4\linewidth]{./img/_scaled_log_blob_diameter.pdf}
+        \caption{
+            \begin{varwidth}[t]{0.55\linewidth}
+                Scale-normalized LoG computed on varying $\sigma$.\\
+                Note that, in the second image, the characteristic scale is
+                higher because the blob is larger.
+            \end{varwidth}
+        }
+    \end{figure}
 \end{remark}
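A minimal sketch of the blob-detection algorithm above, assuming SciPy; the image, the $\sigma$ grid and the response threshold are illustrative choices, and the $\sqrt{2}\,\sigma$ blob radius is the standard LoG radius relation rather than something stated in the notes:

\begin{verbatim}
import numpy as np
from scipy import ndimage

img = np.zeros((128, 128), dtype=np.float32)
img[50:70, 50:70] = 1.0                    # illustrative blob
sigmas = np.linspace(1.0, 10.0, 19)        # illustrative scale grid

# 1. Scale-space of scale-normalized LoG responses F(x, y, sigma);
#    sigma**2 makes responses comparable across scales.
F = np.stack([(s ** 2) * ndimage.gaussian_laplace(img, sigma=s)
              for s in sigmas])

# 2. Automatic scale selection: per-pixel characteristic scale.
abs_F = np.abs(F)
char_idx = abs_F.argmax(axis=0)
char_resp = abs_F.max(axis=0)

# 3. Blob centers: spatial maxima of the response, above a threshold.
is_max = char_resp == ndimage.maximum_filter(char_resp, size=5)
ys, xs = np.nonzero(is_max & (char_resp > 0.05))   # illustrative threshold
radii = np.sqrt(2.0) * sigmas[char_idx[ys, xs]]    # blob radius ~ sqrt(2)*sigma
\end{verbatim}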
+
+
+\subsection{Difference of Gaussians blob detection}
+
+\begin{description}
+    \item[Difference of Gaussians (DoG)] \marginnote{Difference of Gaussians (DoG)}
+    Approximation of the scale-normalized LoG computed as:
+    \[
+        \begin{split}
+            \texttt{DoG}(x, y, \sigma) &= \big( G(x, y, k\sigma) - G(x, y, \sigma) \big) * I(x, y) \\
+            &= L(x, y, k\sigma) - L(x, y, \sigma)
+        \end{split}
+    \]
+
+    \begin{theorem}
+        It can be proven that the DoG kernel is a scaled version of the LoG kernel:
+        \[ G(x, y, k\sigma) - G(x, y, \sigma) \approx (k-1)\sigma^2 \nabla^{(2)}G(x, y, \sigma) \]
+
+        \begin{remark}
+            As we are interested in extrema, the scaling factor is irrelevant.
+        \end{remark}
+    \end{theorem}
+
+    \item[Extrema detection] \marginnote{DoG extrema}
+    Given three DoG images with scales $\sigma_i$, $\sigma_{i-1}$ and $\sigma_{i+1}$,
+    a pixel $(x, y, \sigma_i)$ is an extremum (i.e. a keypoint) iff:
+    \begin{itemize}
+        \item It is an extremum in the $3 \times 3$ patch centered on it ($8$ pixels, as $(x, y, \sigma_i)$ itself is excluded).
+        \item It is an extremum in the $3 \times 3$ patches centered on the pixels at $(x, y, \sigma_{i-1})$ and at $(x, y, \sigma_{i+1})$ ($9+9$ pixels).
+    \end{itemize}
+
+    \begin{center}
+        \includegraphics[width=0.35\linewidth]{./img/_DoG_extrema.pdf}
+    \end{center}
+
+    \item[Algorithm] \marginnote{DoG blob detection}
+    To detect blob centers (i.e. DoG extrema), an octave of $s$ DoG images is computed as follows:
+    \begin{enumerate}
+        \item Compute a scale-space $L$ of $s+1$ Gaussian smoothed images with $\sigma$ varying by a factor $k = 2^{1/s}$.
+        As the extrema detection method requires checking the DoG images above and below,
+        two additional Gaussians (with scales $k^{-1}\sigma$ and $k^{s+1}\sigma$) are computed.
+        \item The DoG image $\texttt{DoG}(\cdot, \cdot, k^i\sigma)$ is obtained as the difference between the
+        images $L(\cdot, \cdot, k^{i+1}\sigma)$ and $L(\cdot, \cdot, k^i\sigma)$ of the Gaussian scale-space.
+        \begin{figure}[H]
+            \small
+            \centering
+            \includegraphics[width=0.6\linewidth]{./img/_DoG_octave.pdf}
+            \caption{Example of octave computation with $s=4$}
+        \end{figure}
+        \item Extrema detection is done as described above across the $s$ central DoG images.
+        \item Points with a weak DoG response can be pruned through thresholding.
+        Furthermore, it has been observed that strong DoG points along edges are unstable and can also be pruned.
+    \end{enumerate}
+
+    Octaves should be computed using different starting $\sigma$s.
+    Instead of recomputing the Gaussian scale-space,
+    it is possible to simply down-sample the already computed Gaussians and compute the DoG images starting from the shrunk smoothed images.
+
+    \begin{remark}
+        In the original work, the input image is first enlarged by a factor of 2.
+        Then, four octaves are computed starting from the enlarged image
+        (i.e. images of size factor $\times 2$, $\times 1$, $\times \frac{1}{2}$ and $\times \frac{1}{4}$ are considered).
+    \end{remark}
+
+    \begin{remark}
+        The original work found that the best hyperparameters are $s=3$ and $\sigma=1.6$.
+    \end{remark}
+\end{description}
+
+\begin{remark}
+    DoG blob detection is scale and rotation invariant.
+\end{remark}
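A minimal sketch of a single octave and the $3 \times 3 \times 3$ extremum test described above, assuming SciPy; it uses the $s = 3$, $\sigma = 1.6$ values from the remark, while the input image and the pruning threshold are illustrative:

\begin{verbatim}
import numpy as np
from scipy import ndimage

img = np.random.rand(256, 256).astype(np.float32)  # stand-in input
s, sigma = 3, 1.6
k = 2.0 ** (1.0 / s)

# s + 3 Gaussians (scales k^-1*sigma ... k^(s+1)*sigma) give s + 2 DoG
# images, so the central s levels have neighbors above and below.
L = np.stack([ndimage.gaussian_filter(img, sigma=sigma * k ** i)
              for i in range(-1, s + 2)])
dog = L[1:] - L[:-1]

# Extremum test: a voxel must be the max or the min of its
# 3 x 3 x 3 neighborhood across space and scale.
is_max = dog == ndimage.maximum_filter(dog, size=3)
is_min = dog == ndimage.minimum_filter(dog, size=3)
extrema = (is_max | is_min) & (np.abs(dog) > 0.03)  # prune weak responses

# Keep only the s central DoG levels (the ones with both neighbors).
lvl, ys, xs = np.nonzero(extrema[1:-1])
\end{verbatim}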
+
+
+
+\section{Descriptor}
+
+After finding the keypoints, a descriptor is computed for each keypoint from the pixels within a patch centered on it.
+
+
+\subsection{DoG descriptor}
+
+\begin{description}
+    \item[Canonical / Characteristic orientation] \marginnote{Canonical / Characteristic orientation}
+    Direction along which the gradient magnitudes of the pixels neighboring a keypoint are the highest.
+
+    Given a pixel $(x, y)$, its gradient magnitude and direction are computed from the Gaussian smoothed image $L$:
+    \[
+        \begin{split}
+            \vert \nabla L(x, y) \vert &= \sqrt{ \big( L(x+1, y) - L(x-1, y) \big)^2 + \big( L(x, y+1) - L(x, y-1) \big)^2 } \\
+            \theta_L(x, y) &= \tan^{-1}\left( \frac{L(x, y+1) - L(x, y-1)}{L(x+1, y) - L(x-1, y)} \right)
+        \end{split}
+    \]
+
+    \begin{description}
+        \item[Orientation histogram] \marginnote{Orientation histogram}
+        By dividing the directions into bins (e.g. bins of size $10^\circ$),
+        it is possible to define for each keypoint a histogram by considering its neighboring pixels within a patch.
+        For each pixel $(x, y)$ neighboring a keypoint $(x_k, y_k)$, its contribution to the histogram along the direction $\theta_L(x, y)$ is given by:
+        \[ G_{(x_k, y_k)}\left(x, y, \frac{3}{2} \sigma_s(x_k, y_k)\right) \cdot \vert \nabla L(x, y) \vert \]
+        where $G_{(x_k, y_k)}$ is a Gaussian centered on the keypoint and $\sigma_s(x_k, y_k)$ is the scale of the keypoint.
+
+        The characteristic orientation of a keypoint is given by the highest peak of the orientation histogram.
+        Other peaks whose height is at least $80\%$ of the main one's are also considered characteristic orientations
+        (i.e. a keypoint might have multiple canonical orientations and, therefore, multiple descriptors).
+
+        For a more accurate estimation, a parabola is fit to each peak and its two adjacent bins,
+        and the maximum of the parabola is taken as the orientation.
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.45\linewidth]{./img/_canonical_histogram.pdf}
+            \caption{Orientation histogram and parabola interpolation}
+        \end{figure}
+    \end{description}
+\end{description}
+
+\begin{description}
+    \item[DoG descriptor] \marginnote{DoG descriptor}
+    Keypoints are found using the DoG detector and the descriptors are computed from patches aligned with the canonical orientations.
+
+    \begin{remark}
+        The DoG descriptor is scale and rotation invariant.
+    \end{remark}
+\end{description}
+
+
+\subsection{Scale invariant feature transform (SIFT) descriptor}
+\marginnote{SIFT descriptor}
+
+Given a keypoint, the SIFT descriptor is computed as follows:
+\begin{enumerate}
+    \item Center on the keypoint a $16 \times 16$ grid divided into $4 \times 4$ regions.
+    \item Compute for each region its orientation histogram with eight bins (i.e. bins of size $45^\circ$).
+    The Gaussian weighting function is centered on the keypoint and has $\sigma$ equal to half the grid size.
+    \item The descriptor is obtained by concatenating the histograms of each region.
+    This results in a feature vector with $128$ elements ($(4 \cdot 4) \cdot 8$).
+    \item Normalize the descriptor to unit length. Components larger than $0.2$ are saturated and the vector is normalized again (for illumination invariance).
+\end{enumerate}
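A minimal sketch of these steps, assuming NumPy, with two simplifications called out explicitly: the patch is not rotated to the canonical orientation, and the trilinear interpolation discussed below is replaced by hard bin assignment. The function name and the border assumption (keypoint at least 8 pixels from the image edge) are this sketch's own:

\begin{verbatim}
import numpy as np

def sift_descriptor(L, x, y):
    """Simplified 128-D SIFT descriptor (no canonical-orientation
    rotation, no trilinear interpolation, for brevity)."""
    patch = L[y - 8:y + 8, x - 8:x + 8].astype(np.float64)  # 16 x 16 grid
    dy, dx = np.gradient(patch)
    mag = np.hypot(dx, dy)
    ang = np.arctan2(dy, dx) % (2 * np.pi)

    # Gaussian weighting centered on the keypoint, sigma = half grid size.
    i, j = np.mgrid[-8:8, -8:8] + 0.5
    mag = mag * np.exp(-(i ** 2 + j ** 2) / (2 * 8.0 ** 2))

    desc = np.zeros((4, 4, 8))  # 4 x 4 regions, 8 orientation bins each
    bins = (ang / (2 * np.pi) * 8).astype(int) % 8
    for r in range(4):
        for c in range(4):
            block = np.s_[4 * r:4 * r + 4, 4 * c:4 * c + 4]
            np.add.at(desc[r, c], bins[block].ravel(), mag[block].ravel())

    desc = desc.ravel()                           # (4*4)*8 = 128 elements
    desc /= np.linalg.norm(desc) + 1e-12          # unit length
    desc = np.minimum(desc, 0.2)                  # saturate large components
    return desc / (np.linalg.norm(desc) + 1e-12)  # renormalize

L = np.random.rand(64, 64)            # stand-in smoothed image
d = sift_descriptor(L, 32, 32)        # 128-dimensional descriptor
\end{verbatim}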
+
 \begin{figure}[H]
     \centering
-    \includegraphics[width=0.4\linewidth]{./img/_scaled_log_blob_diameter.pdf}
-    \caption{
-        \begin{varwidth}[t]{0.55\linewidth}
-            Scale-normalized LOG computed on varying $\sigma$.\\
-            Note that, in the second image, the characteristic scale is
-            higher as the scale is larger.
-        \end{varwidth}
-    }
-\end{figure}
\ No newline at end of file
+    \includegraphics[width=0.6\linewidth]{./img/sift.png}
+    \caption{SIFT descriptor example}
+\end{figure}
+
+\begin{description}
+    \item[Trilinear interpolation]
+    Bins are assigned in a soft manner to avoid boundary effects.
+    The contribution of a pixel is spread between its two adjacent bins, weighted by the distance to the bin centers:\\
+    \begin{minipage}{0.55\linewidth}
+        \centering
+        \includegraphics[width=0.6\linewidth]{./img/_sift_interpolation.pdf}
+    \end{minipage}
+    \begin{minipage}{0.3\linewidth}
+        \[
+            \begin{cases}
+                \text{weight}_k = 1 - d_k \\
+                \text{weight}_{k+1} = 1 - d_{k+1} \\
+            \end{cases}
+        \]
+    \end{minipage}
+
+    This is done both on the histogram within a region and on the histograms between the four neighboring regions.
+\end{description}
+
+\begin{remark}
+    The SIFT descriptor is invariant to scale, rotation and affine intensity changes.
+\end{remark}
+
+
+
+\section{Matching}
+
+Matching the keypoints across different views is a nearest-neighbor search problem. \marginnote{Nearest-neighbor search problem}
+
+Given a target image $T$ and a reference image $R$,
+we want to match each keypoint in $T$ to the most similar one in $R$ (usually using the Euclidean distance between descriptors).
+
+\begin{description}
+    \item[Matching criteria] \marginnote{Matching criteria}
+    Given the distance $d_\text{NN}$ from a keypoint of $T$ to its nearest-neighbor in $R$,
+    the match is accepted according to one of the following criteria:
+    \begin{descriptionlist}
+        \item[Threshold]
+        Given a threshold $t$, the match is accepted iff:
+        \[ d_\text{NN} \leq t \]
+
+        \item[Ratio of distances]
+        Given a threshold $t$ and the distance $d_\text{NN2}$ to the second nearest-neighbor, the match is accepted iff:
+        \[ \frac{d_\text{NN}}{d_\text{NN2}} \leq t \]
+        This criterion avoids ambiguous matches where two neighbors are almost equally close.
+        \begin{figure}[H]
+            \centering
+            \begin{subfigure}{0.4\linewidth}
+                \centering
+                \includegraphics[width=0.2\linewidth]{./img/_nn_matching_example1.pdf}
+                \caption{Non-ambiguous match}
+            \end{subfigure}
+            \begin{subfigure}{0.4\linewidth}
+                \centering
+                \includegraphics[width=0.2\linewidth]{./img/_nn_matching_example2.pdf}
+                \caption{Ambiguous match}
+            \end{subfigure}
+        \end{figure}
+    \end{descriptionlist}
+
+    \begin{remark}
+        It has been empirically shown that a ratio threshold $t=0.8$ rejects $90\%$ of the wrong matches while discarding only $5\%$ of the correct ones.
+    \end{remark}
+
+    \item[Efficient NN search] \marginnote{Efficient NN search}
+    As an exhaustive search is inefficient, indexing techniques may be employed.
+
+    The main indexing technique applied for feature matching is the k-d tree in the best bin first (BBF) variant.
+
+    \begin{remark}
+        Best bin first is an approximate search: it is not guaranteed to return the exact nearest neighbor.
+    \end{remark}
+\end{description}
\ No newline at end of file
diff --git a/utils/cropped_pdfs.sh b/utils/cropped_pdfs.sh
index 85a7cf9..d9de11c 100644
--- a/utils/cropped_pdfs.sh
+++ b/utils/cropped_pdfs.sh
@@ -3,5 +3,5 @@
 git ls-files --others --modified --exclude-standard `git rev-parse --show-toplevel`/*.pdf | while read -r pdf ; do
     echo "Rewriting $pdf"
     mv "$pdf" /tmp/ai_notes_cropped_pdf_processing.pdf
-    gs -dQUIET -sDEVICE=pdfwrite -o "$pdf" /tmp/ai_notes_cropped_pdf_processing.pdf
+    gs -dQUIET -sDEVICE=pdfwrite -dUseCropBox -o "$pdf" /tmp/ai_notes_cropped_pdf_processing.pdf
 done
\ No newline at end of file
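As a worked complement to the Matching section above, a minimal sketch of the ratio-of-distances criterion with a k-d tree, assuming SciPy; note that scipy.spatial.cKDTree performs exact queries, whereas the best bin first variant mentioned in the notes is approximate. The descriptor arrays are random stand-ins:

\begin{verbatim}
import numpy as np
from scipy.spatial import cKDTree

# Stand-in 128-D descriptors for the target and reference images.
desc_T = np.random.rand(500, 128)
desc_R = np.random.rand(600, 128)

# Query the two nearest neighbors of each target descriptor in R.
tree = cKDTree(desc_R)
dists, idxs = tree.query(desc_T, k=2)

# Ratio-of-distances criterion: accept iff d_NN / d_NN2 <= t.
t = 0.8
accepted = dists[:, 0] / dists[:, 1] <= t
matches = [(i, idxs[i, 0]) for i in np.nonzero(accepted)[0]]
\end{verbatim}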