Add ML4CV CLIP + generative models intro
@ -16,5 +16,6 @@
\include{./sections/_segmentation.tex}
\include{./sections/_depth_estimation.tex}
\include{./sections/_metric_learning.tex}
\include{./sections/_generative_models.tex}

\end{document}
@ -0,0 +1,92 @@
\chapter{Generative models}


\begin{description}
\item[Generative task] \marginnote{Generative task}
Given the training data $\{ x^{(i)} \}$, learn the distribution of the data so that a model can sample new examples:
\[ \hat{x}^{(i)} \sim p_\text{gen}(x; \matr{\theta}) \]
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/generative_task.png}
\end{figure}

\begin{remark}
Generative tasks are hard as natural images lie on a low-dimensional subspace (i.e., only a tiny subset of all possible RGB images makes sense).

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/image_manifold.png}
\end{figure}
\end{remark}
\item[Latent vector] \marginnote{Latent vector}
Low-dimensional representation that encodes an image.

\begin{example}
Facial expression depends on 42 muscles, so the different expressions of a face can be encoded with a 42-dimensional latent vector.
\end{example}

It is assumed that the factors of a latent vector are independent or mildly correlated, and can be sampled from a known distribution.
\item[Generative model] \marginnote{Generative model}
Model that takes as input a latent representation and maps it to an output image.
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/latent_for_generation.png}
\end{figure}

\begin{remark}
An ideal generative model should have the following properties:
\begin{itemize}
\item Be computationally efficient when sampling.
\item Produce high-quality samples.
\item Represent the entire training distribution.
\item Produce a plausible output from any latent input. Smooth changes to the input should be reflected in the output.
\item Have a disentangled latent space (i.e., changing a dimension of the latent space corresponds to an interpretable change in the output image).
\item Allow computing the probability of the produced images (when the model is probabilistic).
\end{itemize}
\end{remark}
\end{description}
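
To make the latent-to-image interface concrete, the following PyTorch-style sketch samples a latent vector from a standard normal distribution and decodes it into an image. The \texttt{Generator} class, its layers, and its dimensions are purely illustrative assumptions, not a specific model discussed in these notes.
\begin{verbatim}
# Minimal sketch: sample a latent vector and map it to an image.
# The architecture and dimensions are illustrative assumptions.
import torch
from torch import nn

class Generator(nn.Module):
    def __init__(self, latent_dim=42, img_size=64):
        super().__init__()
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256), nn.ReLU(),
            nn.Linear(256, 3 * img_size * img_size), nn.Sigmoid(),
        )
        self.img_size = img_size

    def forward(self, z):
        x = self.decoder(z)            # flat RGB values in [0, 1]
        return x.view(-1, 3, self.img_size, self.img_size)

generator = Generator()
z = torch.randn(1, 42)   # latent vector sampled from a known distribution
image = generator(z)     # (1, 3, 64, 64) generated image
\end{verbatim}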


\section{Metrics}

\begin{description}
\item[Expectation/Expected value] \marginnote{Expectation/Expected value}
Informally, it is the generalization of the weighted average (discrete and continuous case, respectively):
\[
\mathbb{E}_{x \sim p}[ f(\cdot) ] = \sum_{x \in \mathbb{X}} \prob{x} f(x)
\qquad
\mathbb{E}_{x \sim p}[ f(\cdot) ] = \int_{x \in \mathbb{X}} \prob{x} f(x) \,dx
\]

\begin{description}
\item[Monte Carlo approximation] \marginnote{Monte Carlo approximation}
Approximation of the expectation using $N$ i.i.d. samples drawn from $p(x)$ (see the example below):
\[ \mathbb{E}_{x \sim p}[f(\cdot)] \approx \frac{1}{N} \sum_{x_i \sim p(x)} f(x_i) \]
\end{description}
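
As a quick numerical illustration (the sample values below are arbitrarily chosen, not drawn from any dataset), consider estimating $\mathbb{E}_{x \sim \mathcal{U}(0,1)}[x^2] = \frac{1}{3}$ with $N = 3$ samples:
\[
x_1 = 0.2, \quad x_2 = 0.5, \quad x_3 = 0.9
\qquad\Rightarrow\qquad
\frac{1}{3} \left( 0.2^2 + 0.5^2 + 0.9^2 \right) = \frac{1.10}{3} \approx 0.367
\]
The estimate approaches the true value $\frac{1}{3} \approx 0.333$ as $N$ grows.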

\item[Self-information] \marginnote{Self-information}
Given a probability mass function, the self-information of an event $x$ is defined as:
\[ I(x) = -\log_b(\prob{x}) = \log_b\left( \frac{1}{\prob{x}} \right) \]
Intuitively, it can be seen as a measure of surprise.

\begin{example}
Consider the toss of a fair coin. The self-information of the outcomes is:
\[ I(\texttt{heads}) = I(\texttt{tails}) = \log_2\left( \frac{1}{0.5} \right) = 1 \]
If the coin is loaded toward tails with probabilities:
\[
\prob{\texttt{heads}} = 0.05
\qquad
\prob{\texttt{tails}} = 0.95
\]
the self-information becomes:
\[
I(\texttt{heads}) = \log_2\left( \frac{1}{0.05} \right) \approx 4.32
\qquad
I(\texttt{tails}) = \log_2\left( \frac{1}{0.95} \right) \approx 0.07
\]
\end{example}
\end{description}
@ -375,4 +375,107 @@

\begin{remark}
Empirical studies have found that the different metric learning losses all perform similarly when hyperparameters are fixed and test set feedback is avoided (i.e., test data is not leaked into training).
\end{remark}


% \begin{description}
%     \item[GrokNet]
% \end{description}



\section{Zero-shot classification}

\begin{description}
\item[Zero-shot classification] \marginnote{Zero-shot classification}
Classify images from the test set of a dataset without training on its training set.

\begin{remark}
Natural language supervision works well for zero-shot classification by connecting the representations of images and texts. An easy way to obtain image-text pairs is to use the \texttt{alt} tag of HTML images (i.e., the description of the image used by screen readers).
\end{remark}
\end{description}


\subsection{Contrastive language-image pre-training (CLIP)}

\begin{description}
\item[Contrastive language-image pre-training (CLIP)] \marginnote{Contrastive language-image pre-training (CLIP)}
Network composed of:
\begin{descriptionlist}
\item[Text encoder]
Transformer encoder where the \texttt{[EOS]} token is used as the representation of the sequence.

\item[Image encoder]
ResNet with global average pooling or ViT where the \texttt{[CLS]} token is used as the representation.
\end{descriptionlist}
A linear projection is used to match the output dimensions of the two encoders.

\begin{description}
\item[Training]
Given a batch of text-image pairs $(t_1, i_1), \dots, (t_N, i_N)$, texts and images are processed by their respective encoders. The embeddings $T_j$, $I_j$ are then compared pairwise: text and image embeddings corresponding to the same pair are considered positives, while all other combinations are negatives.
The NT-Xent loss is computed across the images by fixing a text, and vice versa.

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_clip_training.pdf}
\caption{
\parbox[t]{0.6\linewidth}{CLIP training flow. The NT-Xent loss is applied column- or row-wise in the dot product matrix.}
}
\end{figure}
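
The following PyTorch-style sketch illustrates this symmetric contrastive objective. It assumes the batch has already been passed through the projected text and image encoders; the function name, the encoder wrappers in the usage comment, and the temperature value are illustrative assumptions, not the original CLIP implementation.
\begin{verbatim}
import torch
import torch.nn.functional as F

def clip_loss(text_emb, image_emb, temperature):
    # text_emb, image_emb: (N, d) projected embeddings of a batch of pairs.
    T = F.normalize(text_emb, dim=-1)            # L2-normalize embeddings
    I = F.normalize(image_emb, dim=-1)
    logits = (T @ I.t()) / temperature           # (N, N) similarity matrix
    targets = torch.arange(T.shape[0])           # positives lie on the diagonal
    loss_t = F.cross_entropy(logits, targets)       # fix a text, contrast images (rows)
    loss_i = F.cross_entropy(logits.t(), targets)   # fix an image, contrast texts (columns)
    return (loss_t + loss_i) / 2

# Usage (text_encoder / image_encoder are hypothetical projected encoders):
# loss = clip_loss(text_encoder(texts), image_encoder(images), temperature=0.07)
\end{verbatim}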

\begin{remark}
It has been observed that a bag-of-words approach for the text encoder works better than using a transformer.
\end{remark}

\begin{remark}
As the NT-Xent loss is used, a large batch size is needed.
\end{remark}

\begin{remark}
CLIP with ViT-L/14 is the ``standard'' version. It has been pre-trained for an additional epoch on images with a higher resolution, similarly to FixRes (i.e., to deal with the different train and test resolutions).
\end{remark}

\item[Inference]
Given an image to classify, it is embedded and compared with the embeddings of prompts referencing the classes (e.g., \texttt{a photo of a [object]}). The class of the closest prompt is taken as the prediction.
\begin{figure}[H]
\centering
\includegraphics[width=0.85\linewidth]{./img/_clip_inference.pdf}
\end{figure}
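
A minimal sketch of this zero-shot inference step, under the same assumptions as the training sketch above (\texttt{image\_encoder} and \texttt{text\_encoder} are hypothetical wrappers returning projected embeddings; the prompt template is just an example):
\begin{verbatim}
import torch
import torch.nn.functional as F

@torch.no_grad()
def zero_shot_classify(image, class_names, image_encoder, text_encoder):
    # Build one prompt per class and embed everything.
    prompts = [f"a photo of a {c}" for c in class_names]
    T = F.normalize(text_encoder(prompts), dim=-1)   # (C, d) prompt embeddings
    I = F.normalize(image_encoder(image), dim=-1)    # (1, d) image embedding
    similarity = I @ T.t()                           # (1, C) cosine similarities
    return class_names[similarity.argmax(dim=-1).item()]
\end{verbatim}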
\end{description}

\begin{remark}
CLIP features are robust to distribution shifts (i.e., the network does not take ``shortcuts'' when classifying).

\indenttbox
\begin{example}
As lesions in x-ray images are often marked by doctors, a network trained to detect lesions might actually learn to detect the marks made by the doctors.
\end{example}

\begin{figure}[H]
\centering
\begin{subfigure}{0.35\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.pdf}
\end{subfigure}
\hfill
\begin{subfigure}{0.6\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.pdf}
\end{subfigure}
\end{figure}
\end{remark}
\end{description}

\begin{remark}[CLIP for text-conditioned generative models]
CLIP can be used as a loss for a generative model to condition generation on a prompt. Given a desired prompt, the steps are the following (a sketch is given after the figure below):
\begin{enumerate}
\item Freeze both the generator and CLIP.
\item Feed the generator a random input $z$ to generate an image.
\item Embed the generated image with CLIP and compare it to the embedding of the prompt.
\item Update $z$ by gradient ascent to maximize the similarity.
\end{enumerate}

\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.pdf}
\end{figure}
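
A minimal PyTorch-style sketch of this procedure, assuming hypothetical pre-trained \texttt{generator}, \texttt{image\_encoder}, and \texttt{text\_encoder} wrappers; the latent dimension, step count, and learning rate are arbitrary placeholders:
\begin{verbatim}
import torch
import torch.nn.functional as F

def clip_guided_latent(prompt, generator, image_encoder, text_encoder,
                       latent_dim=512, steps=200, lr=0.05):
    # Generator and CLIP stay frozen: only the latent z is optimized.
    with torch.no_grad():
        text_emb = F.normalize(text_encoder([prompt]), dim=-1)

    z = torch.randn(1, latent_dim, requires_grad=True)
    optimizer = torch.optim.Adam([z], lr=lr)

    for _ in range(steps):
        image = generator(z)                            # generate an image from z
        img_emb = F.normalize(image_encoder(image), dim=-1)
        similarity = (img_emb * text_emb).sum()         # cosine similarity
        loss = -similarity                              # gradient ascent on similarity
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return z.detach()
\end{verbatim}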
\end{remark}