diff --git a/src/year2/machine-learning-for-computer-vision/img/_clip_generation_conditioning.pdf b/src/year2/machine-learning-for-computer-vision/img/_clip_generation_conditioning.pdf
new file mode 100644
index 0000000..9f31730
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_clip_generation_conditioning.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/_clip_inference.pdf b/src/year2/machine-learning-for-computer-vision/img/_clip_inference.pdf
new file mode 100644
index 0000000..e5864af
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_clip_inference.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift.pdf b/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift.pdf
new file mode 100644
index 0000000..12c78dd
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift_datasets.pdf b/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift_datasets.pdf
new file mode 100644
index 0000000..f496723
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_clip_resnet_distributional_shift_datasets.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/_clip_training.pdf b/src/year2/machine-learning-for-computer-vision/img/_clip_training.pdf
new file mode 100644
index 0000000..9b7fa88
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/_clip_training.pdf differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/generative_task.png b/src/year2/machine-learning-for-computer-vision/img/generative_task.png
new file mode 100644
index 0000000..49150b9
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/generative_task.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/image_manifold.png b/src/year2/machine-learning-for-computer-vision/img/image_manifold.png
new file mode 100644
index 0000000..e28e906
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/image_manifold.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/latent_for_generation.png b/src/year2/machine-learning-for-computer-vision/img/latent_for_generation.png
new file mode 100644
index 0000000..a3a959f
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/latent_for_generation.png differ
diff --git a/src/year2/machine-learning-for-computer-vision/ml4cv.tex b/src/year2/machine-learning-for-computer-vision/ml4cv.tex
index 2dcb0c3..d0fb591 100644
--- a/src/year2/machine-learning-for-computer-vision/ml4cv.tex
+++ b/src/year2/machine-learning-for-computer-vision/ml4cv.tex
@@ -16,5 +16,6 @@
     \include{./sections/_segmentation.tex}
     \include{./sections/_depth_estimation.tex}
     \include{./sections/_metric_learning.tex}
+    \include{./sections/_generative_models.tex}
 \end{document}
\ No newline at end of file
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
new file mode 100644
index 0000000..1fba79c
--- /dev/null
+++ b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
@@ -0,0 +1,92 @@
+\chapter{Generative models}
+
+
+\begin{description}
+    \item[Generative task] \marginnote{Generative task}
+        Given the training data $\{ x^{(i)} \}$, learn the distribution of the data so that a model can sample new examples:
+        \[ \hat{x}^{(i)} \sim p_\text{gen}(x; \matr{\theta}) \]
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.4\linewidth]{./img/generative_task.png}
+        \end{figure}
+
+        \begin{remark}
+            Generative tasks are hard as natural images lie on a low-dimensional manifold (i.e., only a tiny subset of all the possible RGB images makes sense).
+
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.6\linewidth]{./img/image_manifold.png}
+            \end{figure}
+        \end{remark}
+
+    \item[Latent vector] \marginnote{Latent vector}
+        Low-dimensional representation that encodes an image.
+
+        \begin{example}
+            Facial expressions depend on 42 muscles, so the different poses of a face can be encoded with a 42-dimensional latent vector.
+        \end{example}
+
+        It is assumed that the factors of a latent vector are independent or mildly correlated, and can be sampled from a known distribution.
+
+    \item[Generative model] \marginnote{Generative model}
+        Model that takes as input a latent representation and maps it into an output image.
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.7\linewidth]{./img/latent_for_generation.png}
+        \end{figure}
+
+        \begin{remark}
+            An ideal generative model should have the following properties:
+            \begin{itemize}
+                \item Be computationally efficient when sampling.
+                \item Produce high-quality samples.
+                \item Represent the entire training distribution.
+                \item Produce a plausible output from any latent input. Smooth changes to the input should be reflected in the output.
+                \item Have a disentangled latent space (i.e., changing a dimension of the latent space corresponds to interpretable changes in the output image).
+                \item Allow computing the probability of the generated images (when the model is probabilistic).
+            \end{itemize}
+        \end{remark}
+\end{description}
+
+
+
+\section{Metrics}
+
+\begin{description}
+    \item[Expectation/Expected value] \marginnote{Expectation/Expected value}
+        Informally, it is the generalization of the weighted average (discrete and continuous case):
+        \[
+            \mathbb{E}_{x \sim p}[ f(x) ] = \sum_{x \in \mathbb{X}} \prob{x} f(x)
+            \qquad
+            \mathbb{E}_{x \sim p}[ f(x) ] = \int_{x \in \mathbb{X}} \prob{x} f(x) \,dx
+        \]
+
+        \begin{description}
+            \item[Monte Carlo approximation] \marginnote{Monte Carlo approximation}
+                Approximation of the expectation using $N$ i.i.d. samples drawn from $p(x)$:
+                \[ \mathbb{E}_{x \sim p}[f(x)] \approx \frac{1}{N} \sum_{x_i \sim p(x)} f(x_i) \]
+        \end{description}
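+
+        A minimal Python sketch of this estimator; the Gaussian target $\mathbb{E}_{x \sim \mathcal{N}(0,1)}[x^2] = 1$ and the helper names are chosen only for illustration:
+        \begin{verbatim}
+import random
+
+def monte_carlo_expectation(f, sample, n=100_000):
+    """Estimate E_{x ~ p}[f(x)] by averaging f over n i.i.d. samples from p."""
+    return sum(f(sample()) for _ in range(n)) / n
+
+# Example: x ~ N(0, 1) and f(x) = x^2, whose true expectation is 1.
+estimate = monte_carlo_expectation(f=lambda x: x ** 2,
+                                   sample=lambda: random.gauss(0.0, 1.0))
+print(estimate)  # ~1.0, up to Monte Carlo noise
+        \end{verbatim}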
+
+    \item[Self-information] \marginnote{Self-information}
+        Given a probability mass function, the self-information of an event $x$ is defined as:
+        \[ I(x) = -\log_b(\prob{x}) = \log_b\left( \frac{1}{\prob{x}} \right) \]
+        Intuitively, it can be seen as a measure of surprise.
+
+        \begin{example}
+            Consider the toss of a fair coin. The self-information of the two outcomes is:
+            \[ I(\texttt{heads}) = I(\texttt{tails}) = \log_2\left( \frac{1}{0.5} \right) = 1 \]
+            If the coin is loaded toward tails with probabilities:
+            \[
+                \prob{\texttt{heads}} = 0.05
+                \qquad
+                \prob{\texttt{tails}} = 0.95
+            \]
+            the self-information becomes:
+            \[
+                I(\texttt{heads}) = \log_2\left( \frac{1}{0.05} \right) = 4.32
+                \qquad
+                I(\texttt{tails}) = \log_2\left( \frac{1}{0.95} \right) = 0.07
+            \]
+        \end{example}
+\end{description}
\ No newline at end of file
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_metric_learning.tex b/src/year2/machine-learning-for-computer-vision/sections/_metric_learning.tex
index 19477b7..91937fa 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_metric_learning.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_metric_learning.tex
@@ -375,4 +375,107 @@ \begin{remark}
     Empirical studies found out that the different metric learning losses all perform similarly with fixed hyperparameters
     while avoiding test set feedback (i.e., avoid leaking test data into training).
+\end{remark}
+
+
+% \begin{description}
+%     \item[GrokNet]
+% \end{description}
+
+
+\section{Zero-shot classification}
+
+\begin{description}
+    \item[Zero-shot classification] \marginnote{Zero-shot classification}
+        Classify images from the test set of a dataset without training on its training set.
+
+        \begin{remark}
+            Natural language supervision works well for zero-shot classification as it connects the representations of images and texts. An easy way to obtain image-text pairs is to use the \texttt{alt} tag of HTML images (i.e., the description of the image used by screen readers).
+        \end{remark}
+\end{description}
+
+
+\subsection{Contrastive language-image pre-training (CLIP)}
+
+\begin{description}
+    \item[Contrastive language-image pre-training (CLIP)] \marginnote{Contrastive language-image pre-training (CLIP)}
+        Network composed of:
+        \begin{descriptionlist}
+            \item[Text encoder]
+                Transformer encoder where the \texttt{[EOS]} token is used as the representation of the sequence.
+
+            \item[Image encoder]
+                ResNet with global average pooling or ViT where the \texttt{[CLS]} token is used as the representation.
+        \end{descriptionlist}
+        A linear projection is used to match the output shapes of the two encoders.
+
+        \begin{description}
+            \item[Training]
+                Given a batch of text-image pairs $(t_1, i_1), \dots, (t_N, i_N)$, texts and images are processed by their respective encoders. The embeddings $T_j$ and $I_j$ are then compared pairwise: text and image embeddings corresponding to the same pair form the positive class, while all the other combinations act as negatives.
+                The NT-Xent loss is computed across the images by fixing a text, and vice versa (a code sketch of this objective is given below, after the inference procedure).
+
+                \begin{figure}[H]
+                    \centering
+                    \includegraphics[width=0.6\linewidth]{./img/_clip_training.pdf}
+                    \caption{
+                        \parbox[t]{0.6\linewidth}{CLIP training flow. The NT-Xent loss is applied column-wise or row-wise in the dot product matrix.}
+                    }
+                \end{figure}
+
+                \begin{remark}
+                    It has been observed that a bag-of-words approach for the text encoder works better than using transformers.
+                \end{remark}
+
+                \begin{remark}
+                    As the NT-Xent loss is used, a large batch size is needed.
+                \end{remark}
+
+                \begin{remark}
+                    CLIP with ViT-L/14 is the ``standard'' version. It has been pre-trained for an additional epoch on images with a higher resolution, similarly to FixRes (i.e., to deal with different train and test resolutions).
+                \end{remark}
+
+            \item[Inference]
+                Given an image to classify, it is embedded and compared with the embeddings of prompts referencing the classes (e.g., \texttt{a photo of a [object]}). The closest one is considered the predicted class.
+                \begin{figure}[H]
+                    \centering
+                    \includegraphics[width=0.85\linewidth]{./img/_clip_inference.pdf}
+                \end{figure}
+        \end{description}
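+
+        A minimal PyTorch-style sketch of the training objective and of the zero-shot inference described above; \texttt{text\_encoder} is a placeholder for a function mapping a list of prompts to embeddings (the actual CLIP tokenization and encoders are not shown):
+        \begin{verbatim}
+import torch
+import torch.nn.functional as F
+
+def clip_contrastive_loss(image_emb, text_emb, temperature=0.07):
+    """Symmetric NT-Xent loss on a batch of N matching (image, text) pairs.
+
+    image_emb, text_emb: (N, d) tensors after the linear projections.
+    """
+    image_emb = F.normalize(image_emb, dim=-1)
+    text_emb = F.normalize(text_emb, dim=-1)
+    logits = image_emb @ text_emb.t() / temperature                 # (N, N) similarities
+    targets = torch.arange(logits.shape[0], device=logits.device)   # diagonal = positives
+    loss_img = F.cross_entropy(logits, targets)      # fix an image, contrast the texts
+    loss_txt = F.cross_entropy(logits.t(), targets)  # fix a text, contrast the images
+    return (loss_img + loss_txt) / 2
+
+def zero_shot_classify(image_emb, class_names, text_encoder):
+    """Return the index of the class whose prompt is closest to the image embedding."""
+    prompts = [f"a photo of a {name}" for name in class_names]
+    text_emb = F.normalize(text_encoder(prompts), dim=-1)    # (C, d)
+    image_emb = F.normalize(image_emb, dim=-1)                # (1, d)
+    return (image_emb @ text_emb.t()).argmax(dim=-1)          # most similar prompt
+        \end{verbatim}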
+
+        \begin{remark}
+            CLIP features are robust to distribution shifts (i.e., the network does not take ``shortcuts'' when classifying).
+
+            \indenttbox
+            \begin{example}
+                As lesions in x-ray images are marked by doctors, a network trained to detect lesions might actually learn to predict the marks put by the doctors.
+            \end{example}
+
+            \begin{figure}[H]
+                \centering
+                \begin{subfigure}{0.35\linewidth}
+                    \centering
+                    \includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.pdf}
+                \end{subfigure}
+                \hfill
+                \begin{subfigure}{0.6\linewidth}
+                    \centering
+                    \includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.pdf}
+                \end{subfigure}
+            \end{figure}
+        \end{remark}
+\end{description}
+
+\begin{remark}[CLIP for text-conditioned generative models]
+    CLIP can be used as a loss for a generative model to condition generation on a prompt. Given a desired prompt, the steps are the following (a minimal sketch is given after the figure):
+    \begin{enumerate}
+        \item Freeze both the generator and CLIP.
+        \item Feed the generator a random input $z$ to generate an image.
+        \item Embed the generated image with CLIP and compare it to the embedding of the prompt.
+        \item Use gradient ascent to update $z$, aiming to maximize the similarity.
+    \end{enumerate}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.pdf}
+    \end{figure}
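+
+    A minimal PyTorch-style sketch of this optimization loop, where \texttt{generator}, \texttt{clip\_image\_embed} and \texttt{clip\_text\_embed} are placeholders for a frozen generator and the CLIP encoders (not the actual APIs):
+    \begin{verbatim}
+import torch
+import torch.nn.functional as F
+
+def clip_guided_latent(generator, clip_image_embed, clip_text_embed,
+                       prompt, latent_dim=512, steps=200, lr=0.05):
+    """Gradient ascent on the latent z to maximize CLIP similarity with the prompt."""
+    z = torch.randn(1, latent_dim, requires_grad=True)       # random starting latent
+    text_emb = F.normalize(clip_text_embed(prompt), dim=-1).detach()
+    optimizer = torch.optim.Adam([z], lr=lr)                  # only z is optimized
+    for _ in range(steps):
+        image = generator(z)                                  # frozen generator
+        image_emb = F.normalize(clip_image_embed(image), dim=-1)
+        similarity = (image_emb * text_emb).sum()             # cosine similarity
+        loss = -similarity                                    # minimize -sim = ascend sim
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+    return z.detach()
+    \end{verbatim}
 \end{remark}
\ No newline at end of file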