Add ML4CV CLIP + generative models intro

2024-11-25 21:19:58 +01:00
parent 8457ccb892
commit 2236cc91fd
11 changed files with 196 additions and 0 deletions

Binary file not shown (new image, 250 KiB)

Binary file not shown (new image, 318 KiB)

Binary file not shown (new image, 270 KiB)


@@ -16,5 +16,6 @@
\include{./sections/_segmentation.tex}
\include{./sections/_depth_estimation.tex}
\include{./sections/_metric_learning.tex}
\include{./sections/_generative_models.tex}
\end{document}


@@ -0,0 +1,92 @@
\chapter{Generative models}
\begin{description}
\item[Generative task] \marginnote{Generative task}
Given the training data $\{ x^{(i)} \}$, learn the distribution of the data so that a model can sample new examples:
\[ \hat{x}^{(i)} \sim p_\text{gen}(x; \matr{\theta}) \]
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/generative_task.png}
\end{figure}
\begin{remark}
Generative tasks are hard as natural images lie on a low-dimensional manifold (i.e., only a tiny subset of all possible RGB images makes sense).
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/image_manifold.png}
\end{figure}
\end{remark}
\item[Latent vector] \marginnote{Latent vector}
Low-dimensional representation that encodes an image.
\begin{example}
Facial expression depends on 42 muscles, so the different expressions of a face can be encoded as a 42-dimensional latent vector.
\end{example}
It is assumed that the factors of a latent vector are independent or mildly correlated, and can be sampled from a known distribution.
\item[Generative model] \marginnote{Generative model}
Model that takes as input a latent representation and maps it into an output image.
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/latent_for_generation.png}
\end{figure}
\begin{remark}
An ideal generative model should have the following properties:
\begin{itemize}
\item Be computationally efficient when sampling.
\item Produce high-quality samples.
\item Represent the entire training distribution.
\item Produce a plausible output from any latent input. Smooth changes to the input should be reflected in the output.
\item Have a disentangled latent space (i.e., changing a dimension of the latent space corresponds to interpretable changes in the output image).
\item Allow computing the probability of the produced images (when the model is probabilistic).
\end{itemize}
\end{remark}
\end{description}
\section{Metrics}
\begin{description}
\item[Expectation/Expected value] \marginnote{Expectation/Expected value}
Informally, it is the generalization of the weighted average. For a discrete and a continuous random variable, respectively:
\[
\mathbb{E}_{x \sim p}[ f(x) ] = \sum_{x \in \mathbb{X}} \prob{x} f(x)
\qquad
\mathbb{E}_{x \sim p}[ f(x) ] = \int_{x \in \mathbb{X}} \prob{x} f(x) \,dx
\]
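\begin{example}
For a fair six-sided die, $\mathbb{E}[x] = \sum_{x=1}^{6} \frac{1}{6}\, x = 3.5$.
\end{example}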
\begin{description}
\item[Monte Carlo approximation] \marginnote{Monte Carlo approximation}
Approximation for expectation using $N$ i.i.d. samples drawn from $p(x)$:
\[ \mathbb{E}_{x \sim p}[f(x)] \approx \frac{1}{N} \sum_{i=1}^{N} f(x_i) \qquad x_i \sim p(x) \]
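A minimal Python sketch of this estimator (the choice of $p$ as a standard normal and $f(x) = x^2$ is illustrative; the true value is $\mathbb{E}[x^2] = 1$):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def f(x):
    return x ** 2

N = 100_000
samples = rng.standard_normal(N)  # x_i ~ p(x), here p = N(0, 1)
estimate = f(samples).mean()      # (1/N) * sum_i f(x_i)
print(estimate)                   # ~1.0 = E[x^2] for a standard normal
\end{verbatim}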
\end{description}
\item[Self-information] \marginnote{Self-information}
Given a probability mass function $\prob{\cdot}$, the self-information of an event $x$ is defined as:
\[ I(x) = -\log_b(\prob{x}) = \log_b\left( \frac{1}{\prob{x}} \right) \]
Intuitively, it can be seen as a measure of surprise.
\begin{example}
Consider the toss of a fair coin. The self-information of each outcome is:
\[ I(\texttt{heads}) = I(\texttt{tails}) = \log_2\left( \frac{1}{0.5} \right) = 1 \]
If the coin is loaded toward tails:
\[
\prob{\texttt{heads}} = 0.05
\qquad
\prob{\texttt{tails}} = 0.95
\]
the self-information becomes:
\[
I(\texttt{heads}) = \log_2\left( \frac{1}{0.05} \right) = 4.32
\qquad
I(\texttt{tails}) = \log_2\left( \frac{1}{0.95} \right) = 0.07
\]
\end{example}
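The numbers above can be reproduced with a few lines of Python (a minimal sketch using only the standard library):
\begin{verbatim}
import math

def self_information(p, base=2):
    # I(x) = -log_b(p(x)); measured in bits when base = 2
    return -math.log(p, base)

print(self_information(0.5))   # fair coin:      1.00 bit
print(self_information(0.05))  # loaded, heads:  4.32 bits
print(self_information(0.95))  # loaded, tails:  0.07 bits
\end{verbatim}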
\end{description}


@@ -375,4 +375,107 @@
\begin{remark}
Empirical studies found that the different metric learning losses all perform similarly when hyperparameters are fixed and test set feedback is avoided (i.e., test data is not leaked into training decisions).
\end{remark}
% \begin{description}
% \item[GrokNet]
% \end{description}
\section{Zero-shot classification}
\begin{description}
\item[Zero-shot classification] \marginnote{Zero-shot classification}
Classify images from the test set of a dataset without training on its training set.
\begin{remark}
Natural language supervision works well for zero-shot classification by connecting the representations of images and texts. An easy way to obtain image-text pairs is to use the \texttt{alt} attribute of HTML images (i.e., the description of the image used by screen readers).
\end{remark}
\end{description}
\subsection{Contrastive language-image pre-training (CLIP)}
\begin{description}
\item[Contrastive language-image pre-training (CLIP)] \marginnote{Contrastive language-image pre-training (CLIP)}
Network composed of:
\begin{descriptionlist}
\item[Text encoder]
Transformer encoder where the \texttt{[EOS]} token is used as the representation of the sequence.
\item[Image encoder]
ResNet with global average pooling or ViT where the \texttt{[CLS]} token is used as representation.
\end{descriptionlist}
A linear projection is used to match the embedding dimensions of the two encoders.
\begin{description}
\item[Training]
Given a batch of text-image pairs $(t_1, i_1), \dots, (t_N, i_N)$, texts and images are processed by their respective encoders. The embeddings $T_j$, $I_j$ are then compared pairwise: a text and an image embedding corresponding to the same pair form a positive, while all other combinations are negatives.
The NT-Xent loss is then computed across the images while fixing a text, and vice versa (a sketch follows the figure below).
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_clip_training.pdf}
\caption{
\parbox[t]{0.6\linewidth}{CLIP training flow. NT-Xent loss is applied column or row-wise in the dot product matrix.}
}
\end{figure}
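A minimal PyTorch-style sketch of this symmetric contrastive loss (the function and argument names are illustrative, and the temperature is fixed here whereas CLIP actually learns it as a parameter):
\begin{verbatim}
import torch
import torch.nn.functional as F

def clip_loss(img_emb, txt_emb, temperature=0.07):
    # img_emb, txt_emb: (N, d) embeddings of N matching image-text pairs
    img_emb = F.normalize(img_emb, dim=-1)
    txt_emb = F.normalize(txt_emb, dim=-1)
    logits = img_emb @ txt_emb.t() / temperature  # (N, N) similarity matrix
    targets = torch.arange(logits.shape[0])       # positives on the diagonal
    loss_img = F.cross_entropy(logits, targets)      # fix an image, contrast texts
    loss_txt = F.cross_entropy(logits.t(), targets)  # fix a text, contrast images
    return (loss_img + loss_txt) / 2
\end{verbatim}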
\begin{remark}
It has been observed that a bag-of-words approach for the text encoder works better than using a transformer.
\end{remark}
\begin{remark}
As the NT-Xent loss is contrastive, a large batch size is needed to provide enough negative pairs.
\end{remark}
\begin{remark}
CLIP with ViT-L/14 is the ``standard'' version. It has been pre-trained for one additional epoch at a higher image resolution (336 px), similarly to FixRes (i.e., to deal with the mismatch between train and test resolutions).
\end{remark}
\item[Inference]
Given an image to classify, it is embedded and compared with the embeddings of prompts referencing the classes (e.g., \texttt{a photo of a [object]}). The closest one is taken as the predicted class.
\begin{figure}[H]
\centering
\includegraphics[width=0.85\linewidth]{./img/_clip_inference.pdf}
\end{figure}
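A sketch of this zero-shot procedure, assuming an input image tensor of shape $(C, H, W)$ and generic \texttt{image\_encoder}, \texttt{text\_encoder} and \texttt{tokenize} callables from any CLIP implementation (the names are placeholders):
\begin{verbatim}
import torch
import torch.nn.functional as F

@torch.no_grad()
def zero_shot_classify(image, class_names, image_encoder, text_encoder, tokenize):
    prompts = [f"a photo of a {c}" for c in class_names]
    txt = F.normalize(text_encoder(tokenize(prompts)), dim=-1)    # (C, d)
    img = F.normalize(image_encoder(image.unsqueeze(0)), dim=-1)  # (1, d)
    sims = (img @ txt.t()).squeeze(0)                             # (C,) similarities
    return class_names[sims.argmax().item()]
\end{verbatim}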
\end{description}
\begin{remark}
CLIP features are robust to distribution shifts (i.e., the network does not take ``shortcuts'' when classifying).
\indenttbox
\begin{example}
As lesions in x-ray images are often marked by doctors, a network trained to detect lesions might actually learn to detect the doctors' marks rather than the lesions themselves.
\end{example}
\begin{figure}[H]
\centering
\begin{subfigure}{0.35\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.pdf}
\end{subfigure}
\hfill
\begin{subfigure}{0.6\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.pdf}
\end{subfigure}
\end{figure}
\end{remark}
\end{description}
\begin{remark}[CLIP for text-conditioned generative models]
CLIP can be used as a loss for a generative model to condition generation on a text prompt. Given a desired prompt, the steps are the following:
\begin{enumerate}
\item Freeze both the generator and CLIP.
\item Feed the generator a random input $z$ to generate an image.
\item Embed the generated image with CLIP and compare it to the embedding of the prompt.
\item Update $z$ by gradient ascent to maximize the similarity (see the sketch below).
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.pdf}
\end{figure}
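A PyTorch-style sketch of this optimization loop (the \texttt{generator} interface, its \texttt{latent\_dim} attribute, and the precomputed $(1, d)$ prompt embedding \texttt{text\_emb} are illustrative assumptions):
\begin{verbatim}
import torch
import torch.nn.functional as F

def optimize_latent(generator, clip_image_encoder, text_emb, steps=200, lr=0.05):
    # generator and CLIP are frozen; only the latent z is optimized
    z = torch.randn(1, generator.latent_dim, requires_grad=True)
    optim = torch.optim.Adam([z], lr=lr)
    text_emb = F.normalize(text_emb, dim=-1)
    for _ in range(steps):
        image = generator(z)                       # generate an image from z
        img_emb = F.normalize(clip_image_encoder(image), dim=-1)
        similarity = (img_emb * text_emb).sum()    # cosine similarity to the prompt
        loss = -similarity                         # minimizing -sim = gradient ascent
        optim.zero_grad()
        loss.backward()
        optim.step()
    return z.detach()
\end{verbatim}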
\end{remark}