diff --git a/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example1.jpg b/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example1.jpg
new file mode 100644
index 0000000..13d9a83
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example1.jpg differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example2.jpg b/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example2.jpg
new file mode 100644
index 0000000..be50b96
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/diffusion_kernel_example2.jpg differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/diffusion_model.jpg b/src/year2/machine-learning-for-computer-vision/img/diffusion_model.jpg
new file mode 100644
index 0000000..841c8d6
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/diffusion_model.jpg differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/diffusion_model_trajectory.jpg b/src/year2/machine-learning-for-computer-vision/img/diffusion_model_trajectory.jpg
new file mode 100644
index 0000000..2ed3873
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/diffusion_model_trajectory.jpg differ
diff --git a/src/year2/machine-learning-for-computer-vision/img/jensen_inequality.jpg b/src/year2/machine-learning-for-computer-vision/img/jensen_inequality.jpg
new file mode 100644
index 0000000..7c223d3
Binary files /dev/null and b/src/year2/machine-learning-for-computer-vision/img/jensen_inequality.jpg differ
diff --git a/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
index b42544d..5e84a66 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
@@ -545,4 +545,198 @@
 \begin{remark}
     Adversarial losses can also be used in supervised problems (e.g., generate a colored version of a black-and-white image).
-\end{remark}
\ No newline at end of file
+\end{remark}
+
+
+\section{Diffusion models}
+
+\begin{description}
+    \item[Diffusion model] \marginnote{Diffusion model}
+        Architecture that generates an image by iteratively denoising a latent noise vector.
+
+        \begin{remark}
+            Empirical results show that the generation quality is generally better than that of other generative models. However, inference is slow.
+        \end{remark}
+
+    \item[Training]
+        Given an image $\matr{x}_0$, training is based on two processes (a schematic sketch in code follows the figure below):
+        \begin{description}
+            \item[Forward process]
+                The original image $\matr{x}_0$ is iteratively transformed into a latent image $\matr{x}_T$ by adding noise (i.e., the complex distribution $q(\matr{x}_0)$ of the original images is transformed into a simpler one $q(\matr{x}_T)$).
+            \item[Reverse process]
+                The latent image $\matr{x}_T$ is iteratively denoised to reconstruct the original image $\matr{x}_0$.
+        \end{description}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/diffusion_model.jpg}
+        \end{figure}
+\end{description}
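+
+\begin{remark}
+    The following is a minimal, schematic sketch (Python/NumPy, not part of the course material) of the two processes above. Only the forward process is fully specified; \texttt{denoise\_step} is a hypothetical placeholder for the learned reverse mapping introduced later in this section.
+    \begin{verbatim}
+import numpy as np
+
+def forward_process(x0, betas, rng):
+    # Fixed (not learned): repeatedly add Gaussian noise to x0 (a NumPy array).
+    x = x0
+    for beta in betas:
+        eps = rng.standard_normal(x.shape)
+        x = np.sqrt(1.0 - beta) * x + np.sqrt(beta) * eps
+    return x  # for a long enough schedule, approximately N(0, I)
+
+def reverse_process(xT, n_steps, denoise_step):
+    # Learned: repeatedly apply a denoising step (hypothetical placeholder).
+    x = xT
+    for t in reversed(range(n_steps)):
+        x = denoise_step(x, t)  # stands in for sampling p(x_{t-1} | x_t)
+    return x
+    \end{verbatim}
+\end{remark}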
+
+
+\subsection{Forward process}
+
+\begin{description}
+    \item[Forward process] \marginnote{Forward process}
+        Given an image $\matr{x}_{t-1}$, produce a noisier version of it as:
+        \[
+            \begin{gathered}
+                \matr{x}_t = \sqrt{1-\beta_t} \matr{x}_{t-1} + \sqrt{\beta_t}\matr{\varepsilon}_t \\
+                \matr{x}_t \sim q(\matr{x}_t \mid \matr{x}_{t-1}) = \mathcal{N}(\sqrt{1-\beta_t}\matr{x}_{t-1}; \beta_t\matr{I})
+            \end{gathered}
+        \]
+        where:
+        \begin{itemize}
+            \item $\matr{\varepsilon}_t \sim \mathcal{N}(0; \matr{I})$ is the noise.
+            \item $\beta_t \in [0,1)$ is a hyperparameter (the noise schedule) and represents the variance.
+            \item $\sqrt{1-\beta_t} \matr{x}_{t-1}$ is the mean.
+        \end{itemize}
+
+        \begin{remark}
+            $\sqrt{1-\beta_t} \matr{x}_{t-1}$ and $\beta_t$ are the mean and variance due to the fact that sampling a vector $\vec{x}$ from a Gaussian distribution with mean $\vec{\mu}$ and covariance matrix $\matr{\Sigma}$ is equivalent to computing:
+            \[ \vec{x} = \vec{\mu} + \matr{\Sigma}^{\frac{1}{2}}\vec{y} \qquad \text{where } \vec{y} \sim \mathcal{N}(0; \matr{I}) \]
+            If $\matr{\Sigma} = \sigma^2\matr{I}$, it holds that $\matr{\Sigma}^{\frac{1}{2}} = \sigma \matr{I}$ and we have that:
+            \[ \vec{x} = \vec{\mu} + (\sigma\matr{I})\vec{y} \]
+        \end{remark}
+
+        \begin{remark}
+            This step does not have learnable parameters.
+        \end{remark}
+
+    \item[Diffusion kernel] \marginnote{Diffusion kernel}
+        It is possible to generate the latent vector $\matr{x}_t$ at time $t$ directly from $\matr{x}_0$ as:
+        \[ \matr{x}_t = \sqrt{\prod_{i=1}^{t}(1-\beta_i)} \cdot \matr{x}_0 + \sqrt{1-\prod_{i=1}^{t}(1-\beta_i)} \cdot \matr{\varepsilon} \qquad \text{where } \matr{\varepsilon} \sim \mathcal{N}(0; \matr{I}) \]
+        By defining the intermediate constant $\alpha_t = \prod_{i=1}^{t}(1-\beta_i)$, we have that:
+        \[
+            \begin{gathered}
+                \matr{x}_t = \sqrt{\alpha_t} \matr{x}_0 + \sqrt{1-\alpha_t}\matr{\varepsilon} \\
+                \matr{x}_t \sim q(\matr{x}_t \mid \matr{x}_0) = \mathcal{N}(\sqrt{\alpha_t}\matr{x}_0; (1-\alpha_t)\matr{I})
+            \end{gathered}
+        \]
+        A numerical sketch comparing the two formulations is given at the end of this subsection.
+
+        \begin{remark}
+            As $\beta_i > 0$ in practice (so that $1-\beta_i < 1$), it holds that $\lim\limits_{t \rightarrow +\infty} \alpha_t = 0$. In other words, for a large $t = T$, only noise remains in the latent vector:
+            \[ q(\matr{x}_T \mid \matr{x}_0) = q(\matr{x}_T) = \mathcal{N}(0; \matr{I}) \]
+            This achieves the goal of transforming a complex distribution $q(\matr{x}_0)$ into a simpler one (i.e., a Gaussian).
+        \end{remark}
+
+        \begin{example}
+            Consider the 1D case where $x$ represents a pixel. By using a linear schedule for $\beta_t$ as follows:
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.9\linewidth]{./img/diffusion_kernel_example1.jpg}
+            \end{figure}
+            we obtain the following diffusion kernels for varying $t$ with $x_0 = 1$ (note that the distribution converges to $\mathcal{N}(0; 1)$):
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.9\linewidth]{./img/diffusion_kernel_example2.jpg}
+            \end{figure}
+        \end{example}

+        \begin{remark}
+            As the forward process is stochastic, the same starting pixel can be mapped to different latent values. Therefore, diffusion models work with trajectories in latent space.
+
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.6\linewidth]{./img/diffusion_model_trajectory.jpg}
+                \caption{
+                    \parbox[t]{0.6\linewidth}{
+                        Trajectories starting from $x_0 = 1$. The dashed lines mark the $\mu_t \pm 3\sigma_t$ area.
+                    }
+                }
+            \end{figure}
+        \end{remark}
+\end{description}
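+
+\begin{remark}
+    A minimal numerical sketch (Python/NumPy, not part of the course notes) of the forward process and the diffusion kernel: iterating the single-step update and sampling directly from the kernel yield the same marginal statistics. The schedule values ($T = 100$, $\beta_t$ linearly spaced between $10^{-4}$ and $0.02$) and the number of samples are illustrative assumptions.
+    \begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+T = 100
+betas = np.linspace(1e-4, 0.02, T)   # assumed linear noise schedule
+alphas = np.cumprod(1.0 - betas)     # alpha_t = prod_{i <= t} (1 - beta_i)
+
+x0, n = 1.0, 100_000                 # 1D "pixel" and number of samples
+
+# (1) iterate the single-step forward process up to t = T
+x = np.full(n, x0)
+for beta in betas:
+    x = np.sqrt(1.0 - beta) * x + np.sqrt(beta) * rng.standard_normal(n)
+
+# (2) sample x_T directly with the diffusion kernel
+x_direct = np.sqrt(alphas[-1]) * x0 \
+    + np.sqrt(1.0 - alphas[-1]) * rng.standard_normal(n)
+
+# both match the closed form N(sqrt(alpha_T) * x0, (1 - alpha_T) I)
+print(x.mean(), x_direct.mean(), np.sqrt(alphas[-1]) * x0)
+print(x.var(), x_direct.var(), 1.0 - alphas[-1])
+    \end{verbatim}
+\end{remark}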
+
+
+\subsection{Reverse process}
+
+\begin{remark}
+    In principle, one could invert the forward process by applying Bayes' rule:
+    \[ q(\matr{x}_{t-1} \mid \matr{x}_t) = q(\matr{x}_t \mid \matr{x}_{t-1}) \frac{q(\matr{x}_{t-1})}{q(\matr{x}_t)} \]
+    However, closed-form expressions for $q(\matr{x}_{t-1})$ and $q(\matr{x}_{t})$ are not available.
+
+    By exploiting the Markov property of the forward process, it is possible to compute the same conditional additionally conditioned on $\matr{x}_0$, which is available at training time:
+    \[
+        q(\matr{x}_{t-1} \mid \matr{x}_t, \matr{x}_0) =
+        q(\matr{x}_t \mid \matr{x}_{t-1}, \matr{x}_0) \frac{q(\matr{x}_{t-1} \mid \matr{x}_0)}{q(\matr{x}_t \mid \matr{x}_0)} =
+        \underbrace{q(\matr{x}_t \mid \matr{x}_{t-1})}_{\text{Forward process}}
+        \underbrace{\frac{q(\matr{x}_{t-1} \mid \matr{x}_0)}{q(\matr{x}_t \mid \matr{x}_0)}}_{\text{Diffusion kernels}}
+    \]
+    It can be shown (and verified numerically in the sketch below) that this is equivalent to:
+    \[ q(\matr{x}_{t-1} \mid \matr{x}_t, \matr{x}_0) = \mathcal{N}\left( \frac{1-\alpha_{t-1}}{1-\alpha_t}\sqrt{1-\beta_t}\matr{x}_t + \frac{\sqrt{\alpha_{t-1}}}{1-\alpha_t}\beta_t \matr{x}_0; \frac{\beta_t(1-\alpha_{t-1})}{1-\alpha_t} \matr{I} \right) \]
+    However, this formulation requires knowing $\matr{x}_0$, which is only available at training time, so it cannot be used directly for inference.
+\end{remark}
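+
+\begin{remark}
+    A minimal numerical check (Python/NumPy, not part of the course notes) of the closed form above: in the 1D case, the posterior obtained by multiplying the two Gaussians of the Bayes formulation coincides with the stated mean and variance. The schedule and the values of $t$, $x_0$ and $x_t$ are arbitrary assumptions.
+    \begin{verbatim}
+import numpy as np
+
+betas = np.linspace(1e-4, 0.02, 100)   # assumed linear noise schedule
+alphas = np.cumprod(1.0 - betas)       # alpha_t = prod_{i <= t} (1 - beta_i)
+t = 50                                 # arbitrary timestep (1-indexed)
+beta_t, a_t, a_tm1 = betas[t - 1], alphas[t - 1], alphas[t - 2]
+x0, x_t = 1.0, 0.3                     # arbitrary 1D values
+
+# closed form stated above
+var = beta_t * (1.0 - a_tm1) / (1.0 - a_t)
+mean = (np.sqrt(1.0 - beta_t) * (1.0 - a_tm1) * x_t
+        + np.sqrt(a_tm1) * beta_t * x0) / (1.0 - a_t)
+
+# same result via Bayes: product of two Gaussians in x_{t-1}:
+# q(x_t | x_{t-1}), as a function of x_{t-1}, has precision (1-beta_t)/beta_t
+# and centre x_t / sqrt(1-beta_t); q(x_{t-1} | x_0) = N(sqrt(a_{t-1}) x0, 1 - a_{t-1})
+p1, m1 = (1.0 - beta_t) / beta_t, x_t / np.sqrt(1.0 - beta_t)
+p2, m2 = 1.0 / (1.0 - a_tm1), np.sqrt(a_tm1) * x0
+var_bayes = 1.0 / (p1 + p2)
+mean_bayes = (p1 * m1 + p2 * m2) * var_bayes
+
+print(mean, mean_bayes)   # the two means coincide
+print(var, var_bayes)     # the two variances coincide
+    \end{verbatim}
+\end{remark}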
+
+\begin{description}
+    \item[Learned reverse process] \marginnote{Learned reverse process}
+        Learn a Markov chain of probabilistic mappings to reconstruct the original image $\matr{x}_0$ starting from the latent vector $\matr{x}_T$:
+        \[
+            \begin{split}
+                p(\matr{x}_T) &= \mathcal{N}(0; \matr{I}) = q(\matr{x}_T) \\
+                p(\matr{x}_{t-1} \mid \matr{x}_t) &= \mathcal{N}(\mu_t(\matr{x}_t; \matr{\theta}_t); \sigma_t\matr{I})
+            \end{split}
+        \]
+        where:
+        \begin{itemize}
+            \item $\mu_t(\matr{x}_t; \matr{\theta}_t)$ is a neural network that estimates the mean of $p(\matr{x}_{t-1} \mid \matr{x}_t)$.
+            \item $\sigma_t$ is the variance, which for simple diffusion models is predetermined rather than learned.
+        \end{itemize}
+
+        \begin{remark}
+            In general, the true reverse conditional $q(\matr{x}_{t-1} \mid \matr{x}_t)$ is not Gaussian: this only holds in the limit $\beta_t \rightarrow 0$. However, by using small $\beta_t$ and a large $T$, it can be considered approximately Gaussian, which justifies the Gaussian form chosen for $p(\matr{x}_{t-1} \mid \matr{x}_t)$.
+        \end{remark}
+
+        \begin{description}
+            \item[Training]
+                The training objective for a set of real images $\{ \matr{x}_0^{(i)} \}_{i=1}^{I}$ is:
+                \[ \matr{\theta}_1^*, \dots, \matr{\theta}_T^* = \arg\max_{\matr{\theta}_1, \dots, \matr{\theta}_T} \sum_{i=1}^{I} \log\left( p(\matr{x}_0^{(i)} \mid \matr{\theta}_1, \dots, \matr{\theta}_T) \right) \]
+                As each image is obtained as a sequence of latents, we have that:
+                \[
+                    \begin{aligned}
+                        p&(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T) \\
+                        &= p(\matr{x}_0 \mid \matr{x}_1, \dots, \matr{x}_T, \matr{\theta}_1, \dots, \matr{\theta}_T) p(\matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T)
+                        & p(x, y \mid z) = p(x \mid y, z)p(y \mid z) \\
+                        &= p(\matr{x}_0 \mid \matr{x}_1, \matr{\theta}_1) p(\matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_2, \dots, \matr{\theta}_T)
+                        & \text{Markov chain} \\
+                        &= \dots & \text{Repeat} \\
+                        &= p(\matr{x}_0 \mid \matr{x}_1, \matr{\theta}_1) \left( \prod_{t=2}^{T} p(\matr{x}_{t-1} \mid \matr{x}_t, \matr{\theta}_t) \right) p(\matr{x}_T)
+                    \end{aligned}
+                \]
+                The likelihood of $\matr{x}_0$ can then be computed through marginalization over the latent images:
+                \[ p(\matr{x}_0 \mid \matr{\theta}_1, \dots, \matr{\theta}_T) = \int p(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T) \, d\matr{x}_1 \dots d\matr{x}_T \]
+                However, in practice this approach is computationally intractable due to the high number and high dimensionality of the latent variables.
+
+                \begin{description}
+                    \item[Evidence lower bound (ELBO)] \marginnote{Evidence lower bound (ELBO)}
+                        Method to compute a lower bound of the log-likelihood. During training, this bound is maximized as a proxy for maximizing the likelihood (a small numerical check of Jensen's inequality is sketched at the end of this section).
+
+                        \begin{lemma}[Jensen's inequality]
+                            Given a concave function $f(\cdot)$ and a random variable $x$, it holds that:
+                            \[ f(\mathbb{E}_{x \sim p(x)}[x]) \geq \mathbb{E}_{x \sim p(x)}[f(x)] \]
+
+                            \begin{example}
+                                Consider the logarithm function and a discrete random variable taking positive values. It holds that:
+                                \[ \log\left( \mathbb{E}_{x \sim p(x)}[x] \right) \geq \mathbb{E}_{x \sim p(x)}[\log(x)] \Rightarrow \log\left( \sum_{x \in \mathbb{X}} p(x)x \right) \geq \sum_{x \in \mathbb{X}} \left( p(x)\log(x) \right) \]
+                            \end{example}
+
+                            \begin{figure}[H]
+                                \centering
+                                \includegraphics[width=0.4\linewidth]{./img/jensen_inequality.jpg}
+                                \caption{Visualization of Jensen's inequality}
+                            \end{figure}
+                        \end{lemma}
+
+                        The ELBO is computed as follows:
+                        \[
+                            \begin{split}
+                                \log&(p(\matr{x}_0 \mid \matr{\theta}_1, \dots, \matr{\theta}_T)) \\
+                                &= \log\left( \int p(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T) \, d\matr{x}_1 \dots d\matr{x}_T \right) \\
+                                &= \log\left( \int \frac{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)}{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)} p(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T) \, d\matr{x}_1 \dots d\matr{x}_T \right) \\
+                                &= \log\left( \mathbb{E}_{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)}\left[ \frac{p(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T)}{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)} \right] \right) \\
+                                &\geq \mathbb{E}_{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)}\left[ \log\left( \frac{p(\matr{x}_0, \matr{x}_1, \dots, \matr{x}_T \mid \matr{\theta}_1, \dots, \matr{\theta}_T)}{q(\matr{x}_1, \dots, \matr{x}_T \mid \matr{x}_0)} \right) \right]
+                                \quad \text{(Jensen's inequality)}
+                            \end{split}
+                        \]
+                        The last expression is the ELBO.
+                \end{description}
+        \end{description}
+\end{description}
\ No newline at end of file
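+
+\begin{remark}
+    A small numerical check (Python/NumPy, not part of the course notes) of the discrete form of Jensen's inequality for the logarithm used above. The values and the probability mass function are arbitrary assumptions.
+    \begin{verbatim}
+import numpy as np
+
+# discrete random variable with positive support (arbitrary values and pmf)
+values = np.array([0.5, 1.0, 2.0, 4.0])
+probs = np.array([0.1, 0.4, 0.3, 0.2])
+
+log_of_mean = np.log(np.sum(probs * values))   # log(E[x])
+mean_of_log = np.sum(probs * np.log(values))   # E[log(x)]
+
+print(log_of_mean, mean_of_log, log_of_mean >= mean_of_log)   # last value: True
+    \end{verbatim}
+\end{remark}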