diff --git a/src/year1/image-processing-and-computer-vision/module2/img/_warp_application1.pdf b/src/year1/image-processing-and-computer-vision/module2/img/_warp_application1.pdf new file mode 100644 index 0000000..d67a753 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/_warp_application1.pdf differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/_warp_application2.pdf b/src/year1/image-processing-and-computer-vision/module2/img/_warp_application2.pdf new file mode 100644 index 0000000..aeb70ab Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/_warp_application2.pdf differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear1.pdf b/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear1.pdf new file mode 100644 index 0000000..998f6da Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear1.pdf differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear2.pdf b/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear2.pdf new file mode 100644 index 0000000..436c24e Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/_warping_bilinear2.pdf differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/cifar10.png b/src/year1/image-processing-and-computer-vision/module2/img/cifar10.png new file mode 100644 index 0000000..dec3eee Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/cifar10.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/cifar100.png b/src/year1/image-processing-and-computer-vision/module2/img/cifar100.png new file mode 100644 index 0000000..8b03373 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/cifar100.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/imagenet1k.png b/src/year1/image-processing-and-computer-vision/module2/img/imagenet1k.png new file mode 100644 index 0000000..83fac90 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/imagenet1k.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/imagenet21k.png b/src/year1/image-processing-and-computer-vision/module2/img/imagenet21k.png new file mode 100644 index 0000000..9528e97 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/imagenet21k.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/inverse_perspective_mapping.png b/src/year1/image-processing-and-computer-vision/module2/img/inverse_perspective_mapping.png new file mode 100644 index 0000000..19e5bfd Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/inverse_perspective_mapping.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/mnist.png b/src/year1/image-processing-and-computer-vision/module2/img/mnist.png new file mode 100644 index 0000000..70e8ba0 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/mnist.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/pitch_yaw_compensation.png b/src/year1/image-processing-and-computer-vision/module2/img/pitch_yaw_compensation.png new file mode 100644 index 0000000..fbd6d7c Binary files /dev/null and 
b/src/year1/image-processing-and-computer-vision/module2/img/pitch_yaw_compensation.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/warp_zoom.png b/src/year1/image-processing-and-computer-vision/module2/img/warp_zoom.png new file mode 100644 index 0000000..7e99939 Binary files /dev/null and b/src/year1/image-processing-and-computer-vision/module2/img/warp_zoom.png differ diff --git a/src/year1/image-processing-and-computer-vision/module2/img/warping_bilinear.drawio b/src/year1/image-processing-and-computer-vision/module2/img/warping_bilinear.drawio new file mode 100644 index 0000000..119ec16 --- /dev/null +++ b/src/year1/image-processing-and-computer-vision/module2/img/warping_bilinear.drawio @@ -0,0 +1,280 @@
diff --git a/src/year1/image-processing-and-computer-vision/module2/ipcv2.tex b/src/year1/image-processing-and-computer-vision/module2/ipcv2.tex index cfb0213..58301d4 100644 --- a/src/year1/image-processing-and-computer-vision/module2/ipcv2.tex +++ b/src/year1/image-processing-and-computer-vision/module2/ipcv2.tex @@ -9,5 +9,6 @@ \makenotesfront \input{./sections/_image_formation.tex} + \input{./sections/_classification.tex} \end{document} \ No newline at end of file
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex new file mode 100644 index 0000000..3c00b35 --- /dev/null +++ b/src/year1/image-processing-and-computer-vision/module2/sections/_classification.tex @@ -0,0 +1,105 @@ +\chapter{Image classification} + + +\section{Supervised datasets} + +\begin{description} + \item[Dataset] \marginnote{Dataset} + Given a set of labeled data, it can be split into: + \begin{descriptionlist} + \item[Train set] $D^\text{train} = \{ (\text{x}_\text{train}^{(i)}, y_\text{train}^{(i)}) \mid i = 1, \dots, N \}$. + \item[Test set] $D^\text{test} = \{ (\text{x}_\text{test}^{(i)}, y_\text{test}^{(i)}) \mid i = 1, \dots, M \}$. + \end{descriptionlist} + + It is assumed that the two sets contain i.i.d. samples drawn from the same unknown distribution. +\end{description} + + +\subsection{Modified NIST (MNIST)} + +\begin{minipage}{0.45\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/mnist.png} +\end{minipage} +\begin{minipage}{0.5\linewidth} + \begin{descriptionlist} + \item[Content] Handwritten digits from 0 to 9. + \item[Number of classes] 10. + \item[Train set size] 50k. + \item[Test set size] 10k. + \item[Image format] $28 \times 28$ grayscale. + \end{descriptionlist} +\end{minipage} + + +\subsection{CIFAR10} + +\begin{minipage}{0.45\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/cifar10.png} +\end{minipage} +\begin{minipage}{0.5\linewidth} + \begin{descriptionlist} + \item[Content] Objects of various categories. + \item[Number of classes] 10. + \item[Train set size] 50k. + \item[Test set size] 10k. + \item[Image size] $32 \times 32$ RGB.
+ \end{descriptionlist} +\end{minipage} + + +\subsection{CIFAR100} + +\begin{minipage}{0.45\linewidth} + \centering + \includegraphics[width=0.7\linewidth]{./img/cifar100.png} +\end{minipage} +\begin{minipage}{0.5\linewidth} + \begin{descriptionlist} + \item[Content] Objects of various categories. + \item[Number of classes] 100 (20 super-classes, each with 5 sub-classes). + \item[Train set size] 50k. + \item[Test set size] 10k. + \item[Image size] $32 \times 32$ RGB. + \end{descriptionlist} +\end{minipage} + + +\subsection{ImageNet 21k} + +\begin{descriptionlist} + \item[Content] Objects of various categories. + \item[Number of classes] 21k synsets from WordNet organized hierarchically. + \item[Dataset size] 14 million images. + \item[Image size] Variable resolution RGB. Average size of $400 \times 350$. +\end{descriptionlist} + +\begin{figure}[H] + \centering + \includegraphics[width=0.85\linewidth]{./img/imagenet21k.png} +\end{figure} + + +\subsection{ImageNet 1k} + +\begin{minipage}{0.45\linewidth} + \centering + \includegraphics[width=\linewidth]{./img/imagenet1k.png} +\end{minipage} +\begin{minipage}{0.5\linewidth} + \begin{descriptionlist} + \item[Content] Objects of various categories. + \item[Number of classes] 1000. + \item[Train set size] $1.3$ million images. + \item[Validation set size] 50k. + \item[Test set size] 100k. + \item[Image size] Variable resolution RGB. Often resized to $256 \times 256$. + \end{descriptionlist} +\end{minipage} + +\begin{remark} + Performance is usually measured as top-5 accuracy, since a single prediction can be ambiguous when an image contains multiple objects. +\end{remark} + +
diff --git a/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex b/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex index 82b9099..a141cb6 100644 --- a/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex +++ b/src/year1/image-processing-and-computer-vision/module2/sections/_image_formation.tex @@ -477,7 +477,7 @@ Therefore, the complete workflow for image formation becomes the following: \end{description} \item[Initial homographies guess] - For each image $i$, compute an initial guess of its homography $H_i$. + For each image $i$, compute an initial guess of its homography $\matr{H}_i$. Due to the choice of the $z$-axis position, the perspective projection matrix and the WRF points can be simplified: \[ @@ -842,6 +842,185 @@ Starting from the output image coordinates, use the inverse of the warping funct The computed input coordinates might be continuous. Possible discretization strategies are: \begin{itemize} \item Truncation. - \item Nearest neighbor (i.e. rounding). - \item Interpolation between the 4 closest points (e.g. bilinear, bicubic, \dots). -\end{itemize} \ No newline at end of file + \item Nearest neighbor. + \item Interpolation of the 4 pixels closest to the continuous point (e.g. bilinear, bicubic, \dots). +\end{itemize} + +
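As a concrete illustration (not part of the original notes), the following minimal sketch implements backward warping with nearest-neighbor discretization. It assumes NumPy, an image indexed as \texttt{I[v, u]} and the inverse warping function passed as a Python callable; out-of-bounds coordinates are simply skipped.
\begin{verbatim}
import numpy as np

def backward_warp_nn(I, inv_warp, out_shape):
    """Backward warping with nearest-neighbor discretization.

    I         : input image, indexed as I[v, u]
    inv_warp  : callable (u', v') -> (u, v), from output to input coordinates
    out_shape : (height, width) of the output image
    """
    H_out, W_out = out_shape
    H_in, W_in = I.shape[:2]
    out = np.zeros((H_out, W_out) + I.shape[2:], dtype=I.dtype)
    for v_out in range(H_out):
        for u_out in range(W_out):
            u, v = inv_warp(u_out, v_out)              # continuous input coordinates
            u_nn, v_nn = int(round(u)), int(round(v))  # nearest-neighbor rounding
            if 0 <= u_nn < W_in and 0 <= v_nn < H_in:
                out[v_out, u_out] = I[v_nn, u_nn]
    return out
\end{verbatim}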
+\begin{description} + \item[Bilinear interpolation] \marginnote{Bilinear interpolation} + Given a continuous coordinate $(u, v)$ and its closest four pixels $(u_1, v_1), \dots, (u_4, v_4)$ with intensities denoted for simplicity as $I_i = I(u_i, v_i)$, + bilinear interpolation works as follows: + \begin{enumerate} + \item Compute the offset of $(u,v)$ w.r.t. the top-left pixel: + \[ \Delta u = u - u_1 \hspace{2em} \Delta v = v - v_1 \] + \begin{figure}[H] + \centering + \includegraphics[width=0.25\linewidth]{./img/_warping_bilinear1.pdf} + \end{figure} + + \item Interpolate a point $(u_a, v_a)$ between $(u_1, v_1)$ and $(u_2, v_2)$ so that it is vertically aligned with $(u, v)$ (i.e. $u_a = u$). + Do the same for a point $(u_b, v_b)$ between $(u_3, v_3)$ and $(u_4, v_4)$. + The intensities of the new points are computed by interpolating the intensities of their endpoints: + \[ I_a = I_1 + (I_2 - I_1) \Delta u \hspace{2em} I_b = I_3 + (I_4 - I_3) \Delta u \] + + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/_warping_bilinear2.pdf} + \caption{In the figure, it is assumed that $I_1 < I_2$ and $I_3 > I_4$} + \end{figure} + + \item The intensity at $(u, v)$, which is assigned to the pixel $(u', v')$ of the warped image, is obtained by interpolating $I_a$ and $I_b$ along $v$: + \[ + \begin{split} + I'(u', v') &= I_a + (I_b - I_a) \Delta v \\ + &= \Big( I_1 + (I_2 - I_1) \Delta u \Big) + \Big( \big( I_3 + (I_4 - I_3) \Delta u \big) - \big( I_1 + (I_2 - I_1) \Delta u \big) \Big) \Delta v \\ + &= (1-\Delta u)(1 - \Delta v) I_1 + \Delta u (1-\Delta v) I_2 + (1-\Delta u) \Delta v I_3 + \Delta u \Delta v I_4 + \end{split} + \] + \end{enumerate} + + \begin{remark}[Zoom] + Zooming using nearest-neighbor produces sharper edges, while bilinear interpolation results in smoother images. + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/warp_zoom.png} + \end{figure} + \end{remark} + + \begin{remark} + Nearest-neighbor is suited to preserving sharp transitions (e.g. zooming a binary mask while keeping its values either 0 or 1). + \end{remark} +\end{description}
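The closed form above translates directly into code. The following sketch (again, not part of the original notes) assumes a NumPy image indexed as \texttt{I[v, u]}, the four neighbors ordered as top-left, top-right, bottom-left, bottom-right, and omits border checks for brevity.
\begin{verbatim}
import numpy as np

def bilinear_sample(I, u, v):
    """Sample image I (indexed as I[v, u]) at the continuous coordinate (u, v)."""
    u1, v1 = int(np.floor(u)), int(np.floor(v))  # top-left pixel (u_1, v_1)
    du, dv = u - u1, v - v1                      # offsets w.r.t. the top-left pixel
    I1 = I[v1,     u1    ].astype(float)         # (u_1, v_1): top-left
    I2 = I[v1,     u1 + 1].astype(float)         # (u_2, v_2): top-right
    I3 = I[v1 + 1, u1    ].astype(float)         # (u_3, v_3): bottom-left
    I4 = I[v1 + 1, u1 + 1].astype(float)         # (u_4, v_4): bottom-right
    return ((1 - du) * (1 - dv) * I1 + du * (1 - dv) * I2
            + (1 - du) * dv * I3 + du * dv * I4)
\end{verbatim}
Replacing the rounding in the previous sketch with a call to \texttt{bilinear\_sample} turns it into backward warping with bilinear interpolation.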
+ + +\subsection{Undistort warping} + +Once a camera has been calibrated, the lens distortion parameters can be used to obtain the undistorted image through backward warping. +\[ + \begin{split} + w_u &= u_\text{undist} + (k_1 r^2 + k_2 r^4)(u_\text{undist} - u_0) \\ + w_v &= v_\text{undist} + (k_1 r^2 + k_2 r^4)(v_\text{undist} - v_0) \\ + \end{split} +\] +Note that, as defined above, $w_u$ and $w_v$ map output (undistorted) coordinates to input (distorted) coordinates, i.e. they already play the role of the inverse warping function. The undistorted image is therefore obtained as: +\[ + I'(u_\text{undist}, v_\text{undist}) = I\big( w_u(u_\text{undist}, v_\text{undist}), w_v(u_\text{undist}, v_\text{undist}) \big) +\]
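As an illustration (not from the original notes), the sketch below follows the formula above literally: it assumes that $k_1$, $k_2$, $u_0$, $v_0$ come from calibration, that $r$ is measured from the distortion center $(u_0, v_0)$ in the same units as the pixel coordinates (if the calibration expresses the model in normalized coordinates, the coordinates must be normalized first), and it reuses \texttt{bilinear\_sample} from the previous sketch.
\begin{verbatim}
import numpy as np

def undistort(I, k1, k2, u0, v0):
    """Undistort I by backward warping with bilinear interpolation."""
    H, W = I.shape[:2]
    out = np.zeros_like(I)
    for v_und in range(H):
        for u_und in range(W):
            r2 = (u_und - u0) ** 2 + (v_und - v0) ** 2  # squared radial distance
            gain = k1 * r2 + k2 * r2 ** 2               # k1*r^2 + k2*r^4
            w_u = u_und + gain * (u_und - u0)           # distorted (input) coordinates
            w_v = v_und + gain * (v_und - v0)
            if 0 <= w_u < W - 1 and 0 <= w_v < H - 1:
                out[v_und, u_und] = bilinear_sample(I, w_u, w_v)
    return out
\end{verbatim}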
+ +Undistorted images enjoy some properties: +\begin{descriptionlist} + \item[Planar warping] \marginnote{Planar warping} + Any two images without lens distortion of a planar world scene ($z_W=0$) are related by a homography. + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/_warp_application1.pdf} + \end{figure} + Given two images containing the same world point, their image points (in projective space) are respectively given by homographies $\matr{H}_1$ and $\matr{H}_2$ + (note that with $z_W=0$, the PPM reduces to a $3 \times 3$ matrix and is therefore a homography):\\[-0.5em] + \begin{minipage}{0.5\linewidth} + \[ + \begin{split} + \tilde{\vec{m}}_1 &= \matr{H}_1 \tilde{\vec{M}}_W \\ + \tilde{\vec{m}}_1 &= \matr{H}_1 \matr{H}_2^{-1} \tilde{\vec{m}}_2 \\ + \end{split} + \] + \end{minipage} + \begin{minipage}{0.5\linewidth} + \[ + \begin{split} + \tilde{\vec{m}}_2 &= \matr{H}_2 \tilde{\vec{M}}_W \\ + \tilde{\vec{m}}_2 &= \matr{H}_2 \matr{H}_1^{-1} \tilde{\vec{m}}_1 \\ + \end{split} + \] + \end{minipage}\\[0.5em] + Then, $\matr{H}_1 \matr{H}_2^{-1} = \matr{H}_{21} = \matr{H}_{12}^{-1}$ is the homography that maps $\tilde{\vec{m}}_2$ to $\tilde{\vec{m}}_1$ + and $\matr{H}_2 \matr{H}_1^{-1} = \matr{H}_{12} = \matr{H}_{21}^{-1}$ maps $\tilde{\vec{m}}_1$ to $\tilde{\vec{m}}_2$. + + \begin{remark} + Only the points that actually lie on the world plane (e.g. the ground) are warped correctly. + \end{remark} + + \begin{example}[Inverse Perspective Mapping] + In autonomous driving, it is often useful to have a bird's-eye view of the road. + + In a controlled environment, a calibrated camera can be mounted on a car to take a picture of the road in front of it. + Then, a (virtual) image of the road viewed from above is generated. + By finding the homography that relates the two images, it is possible to produce a bird's-eye view of the road from the camera mounted on the vehicle. + + Note that the homography needs to be computed only once. + + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/inverse_perspective_mapping.png} + \end{figure} + \end{example} + + \item[Rotation warping] \marginnote{Rotation warping} + Any two images without lens distortion taken by rotating the camera about its optical center are related by a homography. + \begin{figure}[H] + \centering + \includegraphics[width=0.35\linewidth]{./img/_warp_application2.pdf} + \end{figure} + It is assumed that the first image is taken in such a way that the WRF and CRF are the same (i.e. identity extrinsic parameters). + Then, a second image is taken by rotating the camera about its optical center. + It holds that:\\[-0.5em] + \begin{minipage}{0.5\linewidth} + \[ + \begin{split} + \tilde{\vec{m}}_1 &= \matr{A} [\matr{I} | \nullvec] \tilde{\vec{M}}_W = \matr{A}\tilde{\vec{M}}_W \\ + \tilde{\vec{m}}_1 &= \matr{A}\matr{R}^{-1}\matr{A}^{-1} \tilde{\vec{m}}_2 \\ + \end{split} + \] + \end{minipage} + \begin{minipage}{0.5\linewidth} + \[ + \begin{split} + \tilde{\vec{m}}_2 &= \matr{A} [\matr{R} | \nullvec] \tilde{\vec{M}}_W = \matr{A}\matr{R}\tilde{\vec{M}}_W \\ + \tilde{\vec{m}}_2 &= \matr{A}\matr{R}\matr{A}^{-1} \tilde{\vec{m}}_1 \\ + \end{split} + \] + \end{minipage}\\[0.5em] + Then, $\matr{A}\matr{R}^{-1}\matr{A}^{-1} = \matr{H}_{21} = \matr{H}_{12}^{-1}$ is the homography that maps $\tilde{\vec{m}}_2$ to $\tilde{\vec{m}}_1$ + and $\matr{A}\matr{R}\matr{A}^{-1} = \matr{H}_{12} = \matr{H}_{21}^{-1}$ maps $\tilde{\vec{m}}_1$ to $\tilde{\vec{m}}_2$. + + \begin{remark} + In this case, any point of the image is correctly warped, regardless of its depth. + \end{remark} + + \begin{example}[Compensate pitch or yaw] + In autonomous driving, cameras should ideally be mounted with the optical axis parallel to the road plane and aligned with the direction of motion. + Obtaining perfect physical alignment is usually very difficult, but a calibrated camera allows compensating for pitch (i.e. rotation around the $x$-axis) + and yaw (i.e. rotation around the $y$-axis) by estimating the vanishing point of the lane lines. + + \begin{figure}[H] + \centering + \includegraphics[width=0.85\linewidth]{./img/pitch_yaw_compensation.png} + \end{figure} + + It is assumed that the vehicle is driving straight w.r.t. the lines and + that the WRF is attached to the vehicle in such a way that the $z$-axis is pointing in front of the vehicle. + It holds that any line parallel to the $z$-axis has direction $\begin{bmatrix} 0 & 0 & 1 \end{bmatrix}^T$ + and its point at infinity in projective space is $\begin{bmatrix} 0 & 0 & 1 & 0 \end{bmatrix}^T$. + + The coordinates of the vanishing point are then obtained as: + \[ + \vec{m}_\infty \equiv \matr{A}[\matr{R} | \vec{t}] \begin{bmatrix} 0 \\ 0 \\ 1 \\ 0 \end{bmatrix} + \equiv \matr{A}\vec{r}_3 + \equiv \matr{A} \begin{bmatrix} 0 \\ \sin\beta \\ \cos\beta \end{bmatrix} + \] + where the translation $\vec{t}$ has no effect (the last homogeneous coordinate is 0) and $\vec{r}_3$ is the third column of the rotation matrix $\matr{R}_\text{pitch} = \begin{bmatrix} + 1 & 0 & 0 \\ 0 & \cos\beta & \sin\beta \\ 0 & -\sin\beta & \cos\beta + \end{bmatrix}$ that applies a rotation by an angle $\beta$ around the $x$-axis. + + By locating the vanishing point in the image, it is possible to estimate $\vec{r}_3 = \frac{\matr{A}^{-1} \vec{m}_\infty}{\Vert \matr{A}^{-1} \vec{m}_\infty \Vert_2}$ + (as $\vec{r}_3$ is a unit vector), and from it the entire rotation matrix $\matr{R}_\text{pitch}$ can be recovered. + + Finally, the homography $\matr{A}\matr{R}_\text{pitch}\matr{A}^{-1}$ maps points of the ideal image to points of the pitched image, + and can therefore be used to synthesize the ideal view from the acquired one through backward warping (a code sketch is given at the end of this section). + + \begin{remark} + The same procedure can be applied to compensate for the yaw. + \end{remark} + \end{example} +\end{descriptionlist} \ No newline at end of file
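To make the pitch-compensation warp concrete, here is a minimal sketch (not part of the original notes). It assumes NumPy and OpenCV, a known intrinsic matrix \texttt{A}, and a vanishing point \texttt{m\_inf} of the lane lines already estimated in homogeneous coordinates (scaled so that the recovered $\cos\beta$ is positive). Since $\matr{A}\matr{R}_\text{pitch}\matr{A}^{-1}$ maps ideal-image points to pitched-image points, it can be passed to \texttt{cv2.warpPerspective} directly as an output-to-input map.
\begin{verbatim}
import numpy as np
import cv2

def compensate_pitch(I, A, m_inf):
    """Warp the pitched image I to the ideal (zero-pitch) view."""
    r3 = np.linalg.inv(A) @ m_inf       # r_3 up to scale from the vanishing point
    r3 = r3 / np.linalg.norm(r3)        # r_3 is a unit vector
    sin_b, cos_b = r3[1], r3[2]
    R_pitch = np.array([[1.0,  0.0,    0.0  ],
                        [0.0,  cos_b,  sin_b],
                        [0.0, -sin_b,  cos_b]])
    H = A @ R_pitch @ np.linalg.inv(A)  # maps ideal coordinates to pitched coordinates
    h, w = I.shape[:2]
    # WARP_INVERSE_MAP: H is used directly as the output-to-input (backward) mapping
    return cv2.warpPerspective(I, H, (w, h),
                               flags=cv2.INTER_LINEAR | cv2.WARP_INVERSE_MAP)
\end{verbatim}
The same pattern (build a $3 \times 3$ homography, then backward warp) also applies to planar warping and to Inverse Perspective Mapping, with the homography estimated from point correspondences instead (e.g. with \texttt{cv2.findHomography}).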