Add ML4CV CLIP + generative models intro
@ -16,5 +16,6 @@
\include{./sections/_segmentation.tex}
\include{./sections/_depth_estimation.tex}
\include{./sections/_metric_learning.tex}
\include{./sections/_generative_models.tex}

\end{document}
@ -0,0 +1,92 @@
\chapter{Generative models}


\begin{description}
\item[Generative task] \marginnote{Generative task}
Given the training data $\{ x^{(i)} \}$, learn the distribution of the data so that a model can sample new examples:
\[ \hat{x}^{(i)} \sim p_\text{gen}(x; \matr{\theta}) \]
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/generative_task.png}
\end{figure}

\begin{remark}
Generative tasks are hard as natural images lie on a low-dimensional subspace (i.e., only a tiny subset of all possible RGB images makes sense).

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/image_manifold.png}
\end{figure}
\end{remark}
\item[Latent vector] \marginnote{Latent vector}
Low-dimensional representation that encodes an image.

\begin{example}
Facial expression depends on 42 muscles, so the different expressions of a face can be encoded with a 42-dimensional latent vector.
\end{example}

It is assumed that the factors of a latent vector are independent or mildly correlated, and can be sampled from a known distribution.
\item[Generative model] \marginnote{Generative model}
Model that takes as input a latent representation and maps it to an output image.
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/latent_for_generation.png}
\end{figure}

\begin{remark}
An ideal generative model should have the following properties:
\begin{itemize}
\item Be computationally efficient when sampling.
\item Produce high-quality samples.
\item Represent the entire training distribution.
\item Produce a plausible output from any latent input. Smooth changes to the input should be reflected in the output.
\item Have a disentangled latent space (i.e., changing a dimension of the latent space corresponds to an interpretable change in the output image).
\item Allow computing the probability of the produced images (when the model is probabilistic).
\end{itemize}
\end{remark}
\end{description}
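
To make the latent-to-image interface concrete, the following PyTorch-style sketch samples a latent vector from a standard normal distribution and decodes it into an image. The \texttt{Generator} class, its layers, and its dimensions are purely illustrative assumptions, not a specific model discussed in these notes.
\begin{verbatim}
# Minimal sketch: sample a latent vector and map it to an image.
# The architecture and dimensions are illustrative assumptions.
import torch
from torch import nn

class Generator(nn.Module):
    def __init__(self, latent_dim=42, img_size=64):
        super().__init__()
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 256), nn.ReLU(),
            nn.Linear(256, 3 * img_size * img_size), nn.Sigmoid(),
        )
        self.img_size = img_size

    def forward(self, z):
        x = self.decoder(z)            # flat RGB values in [0, 1]
        return x.view(-1, 3, self.img_size, self.img_size)

generator = Generator()
z = torch.randn(1, 42)   # latent vector sampled from a known distribution
image = generator(z)     # (1, 3, 64, 64) generated image
\end{verbatim}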


\section{Metrics}

\begin{description}
\item[Expectation/Expected value] \marginnote{Expectation/Expected value}
Informally, it is the generalization of the weighted average (discrete and continuous case, respectively):
\[
\mathbb{E}_{x \sim p}[ f(\cdot) ] = \sum_{x \in \mathbb{X}} \prob{x} f(x)
\qquad
\mathbb{E}_{x \sim p}[ f(\cdot) ] = \int_{x \in \mathbb{X}} \prob{x} f(x) \,dx
\]

\begin{description}
\item[Monte Carlo approximation] \marginnote{Monte Carlo approximation}
Approximation of the expectation using $N$ i.i.d. samples drawn from $p(x)$ (see the example below):
\[ \mathbb{E}_{x \sim p}[f(\cdot)] \approx \frac{1}{N} \sum_{x_i \sim p(x)} f(x_i) \]
\end{description}
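
As a quick numerical illustration (the sample values below are arbitrarily chosen, not drawn from any dataset), consider estimating $\mathbb{E}_{x \sim \mathcal{U}(0,1)}[x^2] = \frac{1}{3}$ with $N = 3$ samples:
\[
x_1 = 0.2, \quad x_2 = 0.5, \quad x_3 = 0.9
\qquad\Rightarrow\qquad
\frac{1}{3} \left( 0.2^2 + 0.5^2 + 0.9^2 \right) = \frac{1.10}{3} \approx 0.367
\]
The estimate approaches the true value $\frac{1}{3} \approx 0.333$ as $N$ grows.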

\item[Self-information] \marginnote{Self-information}
Given a probability mass function, the self-information of an event $x$ is defined as:
\[ I(x) = -\log_b(\prob{x}) = \log_b\left( \frac{1}{\prob{x}} \right) \]
Intuitively, it can be seen as a measure of surprise.

\begin{example}
Consider the toss of a fair coin. The self-information of the outcomes is:
\[ I(\texttt{heads}) = I(\texttt{tails}) = \log_2\left( \frac{1}{0.5} \right) = 1 \]
If the coin is loaded toward tails with probabilities:
\[
\prob{\texttt{heads}} = 0.05
\qquad
\prob{\texttt{tails}} = 0.95
\]
the self-information becomes:
\[
I(\texttt{heads}) = \log_2\left( \frac{1}{0.05} \right) \approx 4.32
\qquad
I(\texttt{tails}) = \log_2\left( \frac{1}{0.95} \right) \approx 0.07
\]
\end{example}
\end{description}
@ -375,4 +375,107 @@

\begin{remark}
Empirical studies have found that the different metric learning losses all perform similarly when hyperparameters are fixed and test set feedback is avoided (i.e., test data is not leaked into training).
\end{remark}


% \begin{description}
%     \item[GrokNet]
% \end{description}



\section{Zero-shot classification}

\begin{description}
\item[Zero-shot classification] \marginnote{Zero-shot classification}
Classify images from the test set of a dataset without training on its training set.

\begin{remark}
Natural language supervision works well for zero-shot classification by connecting the representations of images and texts. An easy way to obtain image-text pairs is to use the \texttt{alt} tag of HTML images (i.e., the description of the image used by screen readers).
\end{remark}
\end{description}


\subsection{Contrastive language-image pre-training (CLIP)}

\begin{description}
\item[Contrastive language-image pre-training (CLIP)] \marginnote{Contrastive language-image pre-training (CLIP)}
Network composed of:
\begin{descriptionlist}
\item[Text encoder]
Transformer encoder where the \texttt{[EOS]} token is used as the representation of the sequence.

\item[Image encoder]
ResNet with global average pooling or ViT where the \texttt{[CLS]} token is used as the representation.
\end{descriptionlist}
A linear projection is used to match the output dimensions of the two encoders.

\begin{description}
\item[Training]
Given a batch of text-image pairs $(t_1, i_1), \dots, (t_N, i_N)$, texts and images are processed by their respective encoders. The embeddings $T_j$, $I_j$ are then compared pairwise: text and image embeddings corresponding to the same pair are considered positives, while all other combinations are negatives.
The NT-Xent loss is computed across the images by fixing a text, and vice versa.

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_clip_training.pdf}
\caption{
\parbox[t]{0.6\linewidth}{CLIP training flow. The NT-Xent loss is applied column- or row-wise in the dot product matrix.}
}
\end{figure}
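
The following PyTorch-style sketch illustrates this symmetric contrastive objective. It assumes the batch has already been passed through the projected text and image encoders; the function name, the encoder wrappers in the usage comment, and the temperature value are illustrative assumptions, not the original CLIP implementation.
\begin{verbatim}
import torch
import torch.nn.functional as F

def clip_loss(text_emb, image_emb, temperature):
    # text_emb, image_emb: (N, d) projected embeddings of a batch of pairs.
    T = F.normalize(text_emb, dim=-1)            # L2-normalize embeddings
    I = F.normalize(image_emb, dim=-1)
    logits = (T @ I.t()) / temperature           # (N, N) similarity matrix
    targets = torch.arange(T.shape[0])           # positives lie on the diagonal
    loss_t = F.cross_entropy(logits, targets)       # fix a text, contrast images (rows)
    loss_i = F.cross_entropy(logits.t(), targets)   # fix an image, contrast texts (columns)
    return (loss_t + loss_i) / 2

# Usage (text_encoder / image_encoder are hypothetical projected encoders):
# loss = clip_loss(text_encoder(texts), image_encoder(images), temperature=0.07)
\end{verbatim}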

\begin{remark}
It has been observed that a bag-of-words approach for the text encoder works better than using a transformer.
\end{remark}

\begin{remark}
As the NT-Xent loss is used, a large batch size is needed.
\end{remark}

\begin{remark}
CLIP with ViT-L/14 is the ``standard'' version. It has been pre-trained for an additional epoch on images with a higher resolution, similarly to FixRes (i.e., to deal with the different train and test resolutions).
\end{remark}

\item[Inference]
Given an image to classify, it is embedded and compared with the embeddings of prompts referencing the classes (e.g., \texttt{a photo of a [object]}). The class of the closest prompt is taken as the prediction.
\begin{figure}[H]
\centering
\includegraphics[width=0.85\linewidth]{./img/_clip_inference.pdf}
\end{figure}
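
A minimal sketch of this zero-shot inference step, under the same assumptions as the training sketch above (\texttt{image\_encoder} and \texttt{text\_encoder} are hypothetical wrappers returning projected embeddings; the prompt template is just an example):
\begin{verbatim}
import torch
import torch.nn.functional as F

@torch.no_grad()
def zero_shot_classify(image, class_names, image_encoder, text_encoder):
    # Build one prompt per class and embed everything.
    prompts = [f"a photo of a {c}" for c in class_names]
    T = F.normalize(text_encoder(prompts), dim=-1)   # (C, d) prompt embeddings
    I = F.normalize(image_encoder(image), dim=-1)    # (1, d) image embedding
    similarity = I @ T.t()                           # (1, C) cosine similarities
    return class_names[similarity.argmax(dim=-1).item()]
\end{verbatim}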
\end{description}

\begin{remark}
CLIP features are robust to distribution shifts (i.e., the network does not take ``shortcuts'' when classifying).

\indenttbox
\begin{example}
As lesions in x-ray images are often marked by doctors, a network trained to detect lesions might actually learn to detect the marks made by the doctors.
\end{example}

\begin{figure}[H]
\centering
\begin{subfigure}{0.35\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift.pdf}
\end{subfigure}
\hfill
\begin{subfigure}{0.6\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_clip_resnet_distributional_shift_datasets.pdf}
\end{subfigure}
\end{figure}
\end{remark}
\end{description}

\begin{remark}[CLIP for text-conditioned generative models]
CLIP can be used as a loss for a generative model to condition generation on a prompt. Given a desired prompt, the steps are the following (a sketch is given after the figure below):
\begin{enumerate}
\item Freeze both the generator and CLIP.
\item Feed the generator a random input $z$ to generate an image.
\item Embed the generated image with CLIP and compare it to the embedding of the prompt.
\item Update $z$ by gradient ascent to maximize the similarity.
\end{enumerate}

\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_clip_generation_conditioning.pdf}
\end{figure}
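
A minimal PyTorch-style sketch of this procedure, assuming hypothetical pre-trained \texttt{generator}, \texttt{image\_encoder}, and \texttt{text\_encoder} wrappers; the latent dimension, step count, and learning rate are arbitrary placeholders:
\begin{verbatim}
import torch
import torch.nn.functional as F

def clip_guided_latent(prompt, generator, image_encoder, text_encoder,
                       latent_dim=512, steps=200, lr=0.05):
    # Generator and CLIP stay frozen: only the latent z is optimized.
    with torch.no_grad():
        text_emb = F.normalize(text_encoder([prompt]), dim=-1)

    z = torch.randn(1, latent_dim, requires_grad=True)
    optimizer = torch.optim.Adam([z], lr=lr)

    for _ in range(steps):
        image = generator(z)                            # generate an image from z
        img_emb = F.normalize(image_encoder(image), dim=-1)
        similarity = (img_emb * text_emb).sum()         # cosine similarity
        loss = -similarity                              # gradient ascent on similarity
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return z.detach()
\end{verbatim}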
\end{remark}