diff --git a/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
index 2dcae85..b232faa 100644
--- a/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
+++ b/src/year2/machine-learning-for-computer-vision/sections/_generative_models.tex
@@ -96,7 +96,7 @@
     \item[Entropy] \marginnote{Entropy}
         Expected value of the self-information of a probability mass function:
-        \[ H(p(\cdot)) = \mathbb{E}_{x \sim p} \left[ - \log(p(\cdot)) \right] \approx -\sum_{x \in \mathbb{X}} p(x) \log(p(x)) \]
+        \[ H(p(\cdot)) = \mathbb{E}_{x \sim p} \left[ - \log(p(x)) \right] \approx -\sum_{x \in \mathbb{X}} p(x) \log(p(x)) \]
         Intuitively, it measures the average surprise of a distribution.

         \begin{example}
@@ -218,9 +218,10 @@
             \begin{split}
                 D_\text{EMD}(p || q) = \min_{\matr{P}}\left[ \sum_{i, j} \matr{P}_{i, j} |i-j| \right] \\
                 \begin{split}
-                    \text{subject to}& \sum_{i} \matr{P}_{i, j} = p(i) \,\land \\
-                    &\sum_j \matr{P}_{i,j} = q(j) \,\land \\
-                    &\matr{P}_{i,j} \geq 0
+                    \text{subject to}
+                    & \sum_{j} \matr{P}_{i, j} = p(i) \,\land \\
+                    & \sum_{i} \matr{P}_{i,j} = q(j) \,\land \\
+                    & \matr{P}_{i,j} \geq 0
                 \end{split}
             \end{split}
         \]
@@ -929,7 +930,7 @@
     \begin{description}
         \item[Generation architecture]
-            Standard U-Net or transformers to predict the noise.
+            Standard U-Net or transformer to predict the noise.

             \begin{description}
                 \item[U-Net with self-attention]
@@ -1248,7 +1249,7 @@
         \begin{split}
             \varepsilon_t^{\text{cls}}(\x_t, c; \params)
             &= \varepsilon_t(\x_t, c; \params) - w \nabla_{x_t}[ \log(p_\text{cls}(c \mid \x_t, t)) ] \\
-            &= - \big( - \varepsilon_t(\x_t, c; \params) + w \nabla_{x_t}[ \log(p_\text{cls}(c \mid \x_t, t)) ] \big)
+            % &= - \big( - \varepsilon_t(\x_t, c; \params) + w \nabla_{x_t}[ \log(p_\text{cls}(c \mid \x_t, t)) ] \big)
         \end{split}
     \]
         By applying Bayes' rule on the second term, we have that:
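The first hunk corrects the entropy definition so the expectation is taken of -log(p(x)) rather than -log(p(.)). A minimal numerical sketch of that formula follows; the helper name "entropy" and the example distributions are illustrative only and do not appear in the notes.

import numpy as np

def entropy(p):
    """Shannon entropy H(p) = E_{x~p}[-log p(x)] = -sum_x p(x) log p(x),
    matching the corrected formula in the hunk around line 96."""
    p = np.asarray(p, dtype=float)
    p = p[p > 0]                      # convention: 0 * log 0 = 0
    return -np.sum(p * np.log(p))

print(entropy([0.5, 0.5]))            # ~0.693 (log 2): two equally surprising outcomes
print(entropy([1.0, 0.0]))            # 0.0: a certain outcome carries no surprise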
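The second hunk swaps the indices in the earth mover's distance constraints so that summing the transport plan P over j recovers the row marginal p(i) and summing over i recovers the column marginal q(j). Below is a minimal sketch that solves exactly this linear program with scipy.optimize.linprog as a sanity check; the helper emd_1d and the example histograms are assumptions for illustration, not part of the notes.

import numpy as np
from scipy.optimize import linprog

def emd_1d(p, q):
    """EMD between two 1-D histograms of length n (each summing to 1):
    minimise sum_{i,j} P[i, j] * |i - j| subject to
        sum_j P[i, j] = p[i],  sum_i P[i, j] = q[j],  P[i, j] >= 0."""
    n = len(p)
    cost = np.abs(np.subtract.outer(np.arange(n), np.arange(n))).ravel()

    A_eq = np.zeros((2 * n, n * n))
    for i in range(n):
        A_eq[i, i * n:(i + 1) * n] = 1    # sum_j P[i, j] = p[i]
    for j in range(n):
        A_eq[n + j, j::n] = 1             # sum_i P[i, j] = q[j]
    b_eq = np.concatenate([p, q])

    res = linprog(cost, A_eq=A_eq, b_eq=b_eq, bounds=(0, None), method="highs")
    return res.fun

p = np.array([0.5, 0.3, 0.2])
q = np.array([0.2, 0.3, 0.5])
print(emd_1d(p, q))  # 0.6, the sum of absolute CDF differences between p and q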
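The last hunk keeps only the first form of the classifier-guided noise prediction, eps_cls(x_t, c) = eps(x_t, c) - w * grad_{x_t} log p_cls(c | x_t, t), and comments out the redundant sign-flipped rewrite. The following PyTorch sketch shows that update under stated assumptions: eps_model (the noise-prediction network) and classifier (a classifier applied to the noisy input x_t at step t) are hypothetical callables, not names from the notes.

import torch

def guided_eps(eps_model, classifier, x_t, c, t, w):
    """Classifier-guided noise prediction:
    eps_cls = eps(x_t, c, t) - w * grad_{x_t} log p_cls(c | x_t, t)."""
    x_t = x_t.detach().requires_grad_(True)

    # log p_cls(c | x_t, t) for the target classes c (shape [B])
    log_probs = torch.log_softmax(classifier(x_t, t), dim=-1)
    log_p_c = log_probs[torch.arange(x_t.shape[0]), c].sum()
    grad = torch.autograd.grad(log_p_c, x_t)[0]

    # unconditional-in-gradient forward pass of the noise predictor
    with torch.no_grad():
        eps = eps_model(x_t, c, t)
    return eps - w * grad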