Add SMM statistics

2023-10-12 20:10:51 +02:00
parent 3d4987f9bf
commit ebcc421664
3 changed files with 349 additions and 36 deletions


@@ -57,6 +57,7 @@
\theoremstyle{definition}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem*{example}{Example}
\theoremstyle{definition}
\newtheorem*{definition}{Def}


@@ -1,4 +1,12 @@
\chapter{Probability and statistics}
\begin{description}
\item[Probability]
model of a process where the underlying uncertainty is captured by random variables.
\item[Statistics]
determine the underlying process that explains an observation.
\end{description}
\section{Probability}
@@ -14,9 +22,9 @@
Set of possible results (i.e. $A$ is an event if $A \subseteq \Omega$)
\item[Probability] \marginnote{Probability}
Let $\mathcal{E}$ be the set of all the possible events (i.e. power set of $\Omega$).
The probability is a function:
\[ \prob{A}: \mathcal{E} \rightarrow [0, 1] \]
\begin{example}
Let $\Omega$ be as above.
Given an event $A = \{ (\text{T}, \text{H}), (\text{H}, \text{T}) \}$,
@@ -120,43 +128,32 @@
\end{example}
\item[Probability mass function (PMF)] \marginnote{Probability mass function (PMF)}
Given a discrete random variable $X$, its probability mass function is a function $p_X: \mathcal{T}_X \rightarrow [0, 1]$ such that:
\[ p_X(x) = \prob{X = x}, \forall x \in \mathcal{T}_X \]
A PMF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\sum_{x \in \mathcal{T}_X} p_X(x) = 1$
\item Let $A \subseteq \mathcal{T}_X$, then $\prob{X \in A} = \sum_{x \in A} p_X(x)$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PMF $p_X$.
\begin{example}
Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
Let $X = \{ \text{number of heads} \}$ be a random variable with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
The PMF is:
\[
\begin{split}
p_X(0) &= \prob{X = 0} = \frac{1}{4} \\
p_X(1) &= \prob{X = 1} = \frac{2}{4} \\
p_X(2) &= \prob{X = 2} = \frac{1}{4}
\end{split}
\]
\end{example}
\end{description}
\subsection{Continuous random variables}
@@ -172,39 +169,354 @@
\item[Probability density function (PDF)] \marginnote{Probability density function (PDF)}
Given a continuous random variable $X$,
its probability density function is a function $p_X: \mathcal{T}_X \rightarrow \mathbb{R}$ such that:
\[ \prob{X \in A} = \int_{A} p_X(x) \,dx \]
\[ \prob{a \leq X \leq b} = \int_{a}^{b} p_X(x) \,dx \]
Note that $\prob{X = a} = \prob{a \leq X \leq a} = \int_{a}^{a} p_X(x) \,dx = 0$
A PDF has the following properties:
\begin{enumerate}
\item $p_X(x) \geq 0, \forall x \in \mathcal{T}_X$
\item $\int_{x \in \mathcal{T}_X} p_X(x) \,dx = 1$
\item $\prob{X \in A} = \int_{A} p_X(x) \,dx$
\end{enumerate}
We denote with $X \sim p_X$ a random variable $X$ with PDF $p_X$.
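As a quick illustration of how a PDF is used (the interval and density below are chosen arbitrarily):
\begin{example}
Let $X$ have PDF $p_X(x) = \frac{1}{2}$ on $\mathcal{T}_X = [0, 2]$.
Then $\int_{0}^{2} p_X(x) \,dx = 1$ and, for instance:
\[ \prob{0.5 \leq X \leq 1} = \int_{0.5}^{1} \frac{1}{2} \,dx = \frac{1}{4} \]
\end{example}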
\end{description}
\section{Discrete joint distribution}
\begin{description}
\item[Univariate distribution] \marginnote{Univariate distribution}
Distribution with one random variable.
\item[Multivariate distribution] \marginnote{Multivariate distribution}
Distribution with multiple random variables.
\item[Joint probability] \marginnote{Joint probability}
Let $X$ and $Y$ be random variables respectively with target space $\mathcal{T}_X$ and $\mathcal{T}_Y$.
The joint probability of $X$ and $Y$ has target space $\mathcal{T}_{XY} = \mathcal{T}_X \times \mathcal{T}_Y$
and its PMF is:
\[ p_{XY}(x_i, y_j) = \prob{X = x_i \cap Y = y_j} \]
$p_X(x)$ and $p_Y(y)$ are the \textbf{marginal probabilities}. \marginnote{Marginal probability}
\begin{example}
Let $X$ and $Y$ be random variables respectively with five and three possible states.
\begin{center}
\includegraphics[width=0.4\textwidth]{img/_joint_probability_example.pdf}
\end{center}
We denote with:
\begin{itemize}
\item $N$ the number of events
\item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p(x_i, y_j) = \frac{n_{ij}}{N}$)
\item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column
\item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row
\end{itemize}
The marginal probabilities are:\\
\begin{minipage}{.48\linewidth}
\centering
\[ p(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
\[ p(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
\end{minipage}
The conditional probabilities can be computed as:
\[ \prob{Y = y_j \vert X = x_i} = \frac{p(x_i, y_j)}{p(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
\[ \prob{X = x_i \vert Y = y_j} = \frac{p(x_i, y_j)}{p(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
\end{example}
\end{description}
\section{Rules of probability}
\subsection{Sum rule}
\marginnote{Sum rule\\Marginalization property}
Given two random variables $X$ and $Y$, the sum rule states that:
\[
p(\bm{x}) =
\begin{cases}
\sum_{\bm{y} \in \mathcal{T}_Y} p(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
\int_{\mathcal{T}_Y} p(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
\end{cases}
\]
The sum rule relates the joint distribution and a marginal distribution.
More generally, the sum rule can be applied to any subset of the random variables of a joint distribution.
Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$,
the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\backslash i} \]
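As an illustration of marginalization (a deliberately simple setup):
\begin{example}
Let $X$ and $Y$ be the outcomes of two independent fair coin tosses,
so that $p(x, y) = \frac{1}{4}$ for every pair $(x, y) \in \{\text{H}, \text{T}\}^2$.
Summing out $Y$ gives the marginal PMF of $X$:
\[ p(x) = \sum_{y \in \{\text{H}, \text{T}\}} p(x, y) = \frac{1}{4} + \frac{1}{4} = \frac{1}{2}, \forall x \in \{\text{H}, \text{T}\} \]
\end{example}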
\subsection{Product rule}
\marginnote{Product rule}
\[ p(\bm{x}, \bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) = p(\bm{x} \vert \bm{y}) p(\bm{y}) \]
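Continuing the two-coin illustration above, the product rule recovers the joint PMF from a conditional and a marginal:
\[ p(x, y) = p(y \vert x) p(x) = \frac{1}{2} \cdot \frac{1}{2} = \frac{1}{4} \]
since (by assumption) the first toss gives no information about the second.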
\section{Bayes' theorem}
\begin{theorem}
\marginnote{Bayes' theorem}
Given two random variables $X$ and $Y$:
\[
\overbrace{p(\bm{x} \vert \bm{y})}^{\mathclap{\text{posterior}}} =
\frac
{ \overbrace{p(\bm{y} \vert \bm{x})}^{\mathclap{\text{likelihood }}} \overbrace{p(\bm{x})}^{\mathclap{\text{ prior}}} }
{\underbrace{p(\bm{y})}_{\mathclap{\text{evidence}}}}
\]
where:
\begin{descriptionlist}
\item[Prior] \marginnote{Prior}
is the prior knowledge of the unobserved data $\bm{x}$.
\item[Likelihood] \marginnote{Likelihood}
describes the relation between $\bm{x}$ and $\bm{y}$.
\item[Posterior] \marginnote{Posterior}
represents the quantity of interest (i.e. knowledge on $\bm{x}$ after observing $\bm{y}$).
\item[Evidence/Marginal likelihood] \marginnote{Evidence/Marginal likelihood}
normalizes the posterior. It does not depend on $\bm{x}$ (i.e. it is constant w.r.t. $\bm{x}$) and is defined as:
\[ p(\bm{y}) = \int p(\bm{y} \vert \bm{x}) p(\bm{x}) \,d\bm{x} \]
\end{descriptionlist}
\end{theorem}
\begin{proof}
This is a direct consequence of the product rule:
\[
p(\bm{x} \vert \bm{y}) p(\bm{y}) = p(\bm{y} \vert \bm{x}) p(\bm{x}) \iff
p(\bm{x} \vert \bm{y}) = \frac{p(\bm{y} \vert \bm{x}) p(\bm{x})}{p(\bm{y})}
\]
\end{proof}
Note: sometimes, instead of the full posterior, the maximum is considered (with loss of information):
\[ \max_x p(x \vert y) = \max_x \frac{p(y \vert x) p(x)}{\underbrace{p(y)}_{\mathclap{\text{constant}}}} = \max_x p(y \vert x) p(x) \]
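A worked instance of Bayes' theorem (all numbers below are made up, only to show how prior, likelihood and evidence combine):
\begin{example}
A test for a condition has likelihoods $p(\text{pos} \vert \text{cond}) = 0.9$ and
$p(\text{pos} \vert \text{no cond}) = 0.1$, and the prior is $p(\text{cond}) = 0.01$.
Then:
\[
p(\text{cond} \vert \text{pos}) =
\frac{0.9 \cdot 0.01}{\underbrace{0.9 \cdot 0.01 + 0.1 \cdot 0.99}_{p(\text{pos})}} \approx 0.083
\]
where the evidence $p(\text{pos})$ is obtained with the sum and product rules.
\end{example}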
\section{Statistics}
\begin{description}
\item[Statistic] \marginnote{Statistic}
A statistic of a random variable is a deterministic function of it.
\end{description}
\subsection{Mean}
\begin{description}
\item[Expected value (univariate)] \marginnote{Expected value (univariate)}
Given a function $g$ of a random variable $X \sim p(x)$,
its expected value is:
\[
\mathbb{E}_X[g(x)] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} g(x)p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} g(x)p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
\item[Expected value (multivariate)] \marginnote{Expected value (multivariate)}
A multivariate random variable $X$ can be seen as
a vector of univariate random variables $\begin{pmatrix} X_1, \dots, X_D \end{pmatrix}^T$.
Its expected value can be computed element-wise as:
\[
\mathbb{E}_X[g(\bm{x})] =
\begin{pmatrix} \mathbb{E}_{X_1}[g(x_1)] \\ \vdots \\ \mathbb{E}_{X_D}[g(x_D)] \end{pmatrix} \in \mathbb{R}^D
\]
\item[Mean] \marginnote{Mean}
Given a random variable $X \sim p(x)$,
the mean of $X$ is its expected value with $g$ defined as the identity:
\[
\mathbb{E}_X[x] =
\begin{cases}
\sum_{x \in \mathcal{T}_X} x \cdot p(x) & \text{if } X \text{ is discrete} \\
\int_{\mathcal{T}_X} x \cdot p(x) \,dx & \text{if } X \text{ is continuous} \\
\end{cases}
\]
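As a quick check of the definition (a standard example):
\begin{example}
For a fair six-sided die, $\mathcal{T}_X = \{1, \dots, 6\}$ and $p(x) = \frac{1}{6}$, so:
\[ \mathbb{E}_X[x] = \sum_{x=1}^{6} x \cdot \frac{1}{6} = \frac{21}{6} = 3.5 \]
\end{example}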
\end{description}
\subsection{Variance}
\begin{description}
\item[Covariance (univariate)] \marginnote{Covariance (univariate)}
Given two univariate random variables $X$ and $Y$, their covariance is:
\[ \text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[(x - \mathbb{E}_X[x])(y - \mathbb{E}_Y[y])] \]
\begin{lemma}
$\text{Cov}_{XY}[x, y] = \mathbb{E}_{XY}[xy] - \mathbb{E}_{X}[x]\mathbb{E}_{Y}[y]$
\end{lemma}
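\begin{proof}
A sketch: expand the product and use the linearity of the expected value.
\[
\begin{split}
\text{Cov}_{XY}[x, y] &= \mathbb{E}_{XY}[xy - x\mathbb{E}_Y[y] - y\mathbb{E}_X[x] + \mathbb{E}_X[x]\mathbb{E}_Y[y]] \\
&= \mathbb{E}_{XY}[xy] - \mathbb{E}_X[x]\mathbb{E}_Y[y] - \mathbb{E}_X[x]\mathbb{E}_Y[y] + \mathbb{E}_X[x]\mathbb{E}_Y[y] \\
&= \mathbb{E}_{XY}[xy] - \mathbb{E}_X[x]\mathbb{E}_Y[y]
\end{split}
\]
\end{proof}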
\item[Variance (univariate)] \marginnote{Variance (univariate)}
The variance of a univariate random variable is given by:
\[ \mathbb{V}_X[x] = \text{Cov}_X[x, x] \]
Its square root is the standard deviation $\sigma(x)$.
\item[Covariance (multivariate)] \marginnote{Covariance (multivariate)}
Given two multivariate random variables
$X$ and $Y$ with states $\bm{x} \in \mathbb{R}^D$ and $\bm{y} \in \mathbb{R}^E$,
their covariance is:
\[
\text{Cov}_{XY}[\bm{x}, \bm{y}] = \text{Cov}_{XY}[\bm{y}, \bm{x}]^T =
\mathbb{E}_{XY}[\bm{xy}^T] - \mathbb{E}_{X}[\bm{x}]\mathbb{E}_{Y}[\bm{y}]^T \in \mathbb{R}^{D \times E}
\]
\item[Variance (multivariate)] \marginnote{Variance (multivariate)}
Given a multivariate random variable $X$ with
states $\bm{x} \in \mathbb{R}^D$ and mean vector $\bm{\mu} \in \mathbb{R}^D$.
Its variance is given by:
\[
\begin{split}
\mathbb{V}_X[\bm{x}] &= \text{Cov}_X[\bm{x}, \bm{x}] \\
&= \mathbb{E}_X[\bm{xx}^T] - \mathbb{E}_X[\bm{x}]\mathbb{E}_X[\bm{x}]^T \\
&=
\begin{pmatrix}
\text{Cov}[x_1, x_1] & \text{Cov}[x_1, x_2] & \cdots & \text{Cov}[x_1, x_D] \\
\text{Cov}[x_2, x_1] & \text{Cov}[x_2, x_2] & \cdots & \text{Cov}[x_2, x_D] \\
\vdots & \vdots & \ddots & \vdots \\
\text{Cov}[x_D, x_1] & \text{Cov}[x_D, x_2] & \cdots & \text{Cov}[x_D, x_D] \\
\end{pmatrix} \in \mathbb{R}^{D \times D}
\end{split}
\]
This matrix is called the covariance matrix and is symmetric and positive semidefinite.
\item[Correlation] \marginnote{Correlation}
Given two random variables $X$ and $Y$, their correlation is:
\[ \text{corr}[x, y] = \frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}} \in [-1, 1] \]
\begin{itemize}
\item When $\text{corr}[x, y] \rightarrow +1$, $x$ and $y$ are expected to grow together.
\item When $\text{corr}[x, y] \rightarrow -1$, $x$ grows when $y$ decreases and vice versa.
\item When $\text{corr}[x, y] \rightarrow 0$, $x$ and $y$ are not correlated.
\end{itemize}
\end{description}
\subsection{Empirical mean and variance}
In practice, it is not always possible to compute statistics on the real population.
Empirical observations can be made on a (finite) subset of the real population,
modelled as a finite number of identically distributed random variables $X_1, \dots, X_N$.
\begin{description}
\item[Empirical mean] \marginnote{Empirical mean}
\[ \bar{x} = \frac{1}{N} \sum_{n=1}^{N}x_n \]
\item[Empirical variance] \marginnote{Empirical variance}
\[ \sigma^2 = \frac{1}{N} \sum_{n=1}^{N}(x_n - \bar{x})^2 \]
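A tiny numerical illustration (observations chosen arbitrarily):
\begin{example}
For the observations $x_1 = 2$, $x_2 = 4$, $x_3 = 6$:
\[ \bar{x} = \frac{2 + 4 + 6}{3} = 4
\qquad
\sigma^2 = \frac{(2-4)^2 + (4-4)^2 + (6-4)^2}{3} = \frac{8}{3} \]
\end{example}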
\end{description}
\section{Random variables properties}
\subsection{Manipulations}
\begin{itemize}
\item $\mathbb{E}[\bm{x} + \bm{y}] = \mathbb{E}[\bm{x}] + \mathbb{E}[\bm{y}]$
\marginnote{Manipulations of random variables}
\item $\mathbb{E}[\bm{x} - \bm{y}] = \mathbb{E}[\bm{x}] - \mathbb{E}[\bm{y}]$
\item $\mathbb{V}[\bm{x} + \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] + \text{Cov}[\bm{x}, \bm{y}] + \text{Cov}[\bm{y}, \bm{x}]$
\item $\mathbb{V}[\bm{x} - \bm{y}] = \mathbb{V}[\bm{x}] + \mathbb{V}[\bm{y}] - \text{Cov}[\bm{x}, \bm{y}] - \text{Cov}[\bm{y}, \bm{x}]$
\end{itemize}
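A sketch of where the variance identities come from, in the univariate case (where $\text{Cov}[x, y] = \text{Cov}[y, x]$):
\[
\begin{split}
\mathbb{V}[x + y] &= \mathbb{E}\left[ \big( (x - \mathbb{E}[x]) + (y - \mathbb{E}[y]) \big)^2 \right] \\
&= \mathbb{E}[(x - \mathbb{E}[x])^2] + \mathbb{E}[(y - \mathbb{E}[y])^2] + 2\,\mathbb{E}[(x - \mathbb{E}[x])(y - \mathbb{E}[y])] \\
&= \mathbb{V}[x] + \mathbb{V}[y] + 2\,\text{Cov}[x, y]
\end{split}
\]
The case of $\bm{x} - \bm{y}$ is analogous, with the sign of the cross terms flipped.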
\subsection{Statistical independence}
\marginnote{Statistical independence}
Two random variables $X$ and $Y$ are statistically independent iff:
\[ p(\bm{x}, \bm{y}) = p(\bm{x})p(\bm{y}) \]
\begin{theorem}
If $X$ and $Y$ are statistically independent, then:
\begin{itemize}
\item $p(\bm{x} \vert \bm{y}) = p(\bm{x})$ and $p(\bm{y} \vert \bm{x}) = p(\bm{y})$
\item $\mathbb{V}_{XY}[\bm{x} + \bm{y}] = \mathbb{V}_X[\bm{x}] + \mathbb{V}_Y[\bm{y}]$
\item $\text{Cov}_{XY}[\bm{x}, \bm{y}] = \nullvec$
\end{itemize}
\end{theorem}
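A minimal illustration:
\begin{example}
Toss a fair coin ($X$) and roll a fair die ($Y$), assuming all $12$ joint outcomes are equally likely.
Then, for every pair $(x, y)$:
\[ p(x, y) = \frac{1}{12} = \frac{1}{2} \cdot \frac{1}{6} = p(x)p(y) \]
so $X$ and $Y$ are statistically independent.
\end{example}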
\subsection{Conditional independence}
\marginnote{Conditional independence}
Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
\[ p(\bm{x}, \bm{y} \vert \bm{z}) = p(\bm{x} \vert \bm{z}) p(\bm{y} \vert \bm{z}) \, \forall \bm{z} \in \mathcal{T}_Z \]
\subsection{Inner product}
\marginnote{Inner product of random variables}
Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
\[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
The covariance matrix is symmetric, positive definite.
Moreover, we have that:
\begin{itemize}
\item $\Vert X \Vert = \sqrt{\langle X, X \rangle} = \sqrt{\text{Cov}[x, x]} = \sqrt{\mathbb{V}[x]} = \sigma[x]$
\item
$\cos\theta = \frac{\langle X, Y \rangle}{\Vert X \Vert \cdot \Vert Y \Vert} =
\frac{\text{Cov}[x, y]}{\sqrt{\mathbb{V}[x]\mathbb{V}[y]}}$, where $\theta$ is the angle between $X$ and $Y$.
\item $X \perp Y \iff \langle X, Y \rangle = 0 \iff \text{Cov}[x, y] = 0 \iff X \text{ and } Y \text{ uncorrelated}$
\end{itemize}
\section{Common distributions}
\subsection{Discrete random variables}
\begin{descriptionlist}
\item[Uniform distribution] \marginnote{Uniform distribution}
Given a discrete random variable $X$ with $\#(\mathcal{T}_X) = N$,
$X$ has a uniform distribution if:
\[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
\item[Poisson distribution] \marginnote{Poisson distribution}
Given a discrete random variable $X$ with mean $\lambda$,
$X$ has a Poisson distribution if:
\[ p_X(x) = e^{-\lambda} \frac{\lambda^x}{x!}, \forall x \in \mathcal{T}_X \]
A Poisson distribution has $\mathbb{E}[x] = \lambda$ and $\mathbb{V}[x] = \lambda$.
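As a numerical illustration (the value of $\lambda$ is arbitrary):
\begin{example}
With $\lambda = 2$:
\[ p_X(0) = e^{-2} \approx 0.135
\qquad
p_X(1) = 2e^{-2} \approx 0.271 \]
\end{example}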
\end{descriptionlist}
\subsection{Continuous random variables}
\begin{descriptionlist}
\item[Continuous uniform distribution] \marginnote{Continuous uniform distribution}
Given a continuous random variable $X$ with $\mathcal{T}_X = [a, b]$,
$X$ has a continuous uniform distribution if:
\[ p_X(x) = \frac{1}{b-a}, \forall x \in \mathcal{T}_X \]
\item[Normal distribution] \marginnote{Normal distribution}
Given a continuous random variable $X$ and the parameters $\mu$ (mean) and $\sigma^2$ (variance),
$X$ has a normal distribution if:
\[ p_X(x) = \frac{1}{\sigma \sqrt{2\pi}} e^{\frac{-(x-\mu)^2}{2\sigma^2}} , \forall x \in \mathcal{T}_X\]
In the multivariate case, it is defined as:
\[
p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}, \matr{\Sigma}) =
(2\pi)^{-\frac{D}{2}} \vert \matr{\Sigma} \vert^{-\frac{1}{2}} e^{(-\frac{1}{2}(\bm{x} - \bm{\mu})^T\matr{\Sigma}^{-1}(\bm{x}-\bm{\mu}))}
\in \mathbb{R}
\]
where $\bm{\mu}$ is the mean vector and $\matr{\Sigma}$ the covariance matrix.
\begin{description}
\item[Standard normal distribution] \marginnote{Standard normal distribution}
Normal distribution with $\mu = 0$ and $\sigma = 1$ (univariate) or
$\bm{\mu} = \nullvec$ and $\matr{\Sigma} = \matr{I}$ (multivariate).
\end{description}
\begin{figure}[ht]
\centering
\includegraphics[width=0.40\textwidth]{img/normal_distribution.png}
\caption{Normal distributions and standard normal distribution}
\end{figure}
\begin{theorem}[Linearity]
\marginnote{Gaussian sum and linear transformations}
Let $X$ and $Y$ be independent Gaussian random variables with
$p(\bm{x}) = \mathcal{N}(\bm{x} \vert \bm{\mu}_x, \matr{\Sigma}_x)$ and
$p(\bm{y}) = \mathcal{N}(\bm{y} \vert \bm{\mu}_y, \matr{\Sigma}_y)$.
It holds that:
\[ p(a\bm{x} + b\bm{y}) = \mathcal{N}(a\bm{\mu}_x + b\bm{\mu}_y, a^2\matr{\Sigma}_x + b^2\matr{\Sigma}_y) \]
\end{theorem}
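As a univariate illustration (values chosen arbitrarily), with $a = b = 1$, $X \sim \mathcal{N}(1, 4)$ and $Y \sim \mathcal{N}(2, 9)$:
\[ X + Y \sim \mathcal{N}(1 + 2, 4 + 9) = \mathcal{N}(3, 13) \]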
\end{descriptionlist}