Fix typos <noupdate>

2026-02-04 07:41:43 +01:00 · 2023-12-30 13:39:00 +01:00
parent aad5d7b029
commit e171ef313a
8 changed files with 131 additions and 105 deletions
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
@ -6,7 +6,7 @@

 \begin{description}
    \item[Measure error] \marginnote{Measure error}
-        Precision of the measurement instrument.
+        Precision of the measuring instrument.

    \item[Arithmetic error] \marginnote{Arithmetic error}
        Propagation of rounding errors in each step of an algorithm.
@ -37,7 +37,7 @@ Let $x$ be a value and $\hat{x}$ its approximation. Then:
        Note that, out of context, the absolute error is meaningless.
    \item[Relative error] 
        \[
-            E_{a} = \frac{\hat{x} - x}{x} 
+            E_{r} = \frac{\hat{x} - x}{x} 
            \marginnote{Relative error}
        \] 
 \end{descriptionlist}
@ -148,7 +148,7 @@ In alternative, $\varepsilon_{\text{mach}}$ can be defined as the smallest repre
 \subsection{IEEE standard}
 IEEE 754 defines two floating-point formats:
 \begin{descriptionlist}
-    \item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{float32}
+    \item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{\texttt{float32}}
        \begin{center}
            \small
            \begin{tabular}{|c|c|c|}
@ -158,7 +158,7 @@ IEEE 754 defines two floating-point formats:
            \end{tabular}
        \end{center}

-    \item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{float64}
+    \item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1024, 1023)$. \marginnote{\texttt{float64}}
        \begin{center}
            \small
            \begin{tabular}{|c|c|c|}
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex
@ -11,11 +11,13 @@ Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in

    \item[Local minimum] \marginnote{Local minimum}
        $\vec{x}^* \in \mathbb{R}^N$ is a local minimum of $f$ iff:
-        \[ f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
+        \[ \exists \varepsilon > 0 \text{ s.t. }
+            f(\vec{x}^*) \leq f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
        
    \item[Strict local minimum] \marginnote{Strict local minimum}
        $\vec{x}^* \in \mathbb{R}^N$ is a strict local minimum of $f$ iff:
-        \[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]
+        \[ \exists \varepsilon > 0 \text{ s.t. }
+            f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N: \Vert \vec{x} - \vec{x}^* \Vert < \varepsilon \]

    \item[Global minimum] \marginnote{Global minimum}
        $\vec{x}^* \in \mathbb{R}^N$ is a global minimum of $f$ iff:
@ -26,7 +28,7 @@ Let $f: \mathbb{R}^N \rightarrow \mathbb{R}$ be continuous and differentiable in
        \[ f(\vec{x}^*) < f(\vec{x}) \text{ } \forall \vec{x} \in \mathbb{R}^N \]
 \end{descriptionlist}

-Note that $\max f(x) = \min -f(x)$.
+Note that $\max \{ f(x) \} = -\min \{ -f(x) \}$.


 \subsection{Optimality conditions}
@ -52,7 +54,7 @@ As the second order condition requires to compute the Hessian matrix, which is e

 \marginnote{Descent methods}
 Descent methods are iterative methods that have the property:
-\[ f(\vec{x}_k) < f(\vec{x}_{k+1}) \]
+\[ f(\vec{x}_k) < f(\vec{x}_{k-1}) \]

 The iteration is defined as:
 \[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
@ -107,7 +109,7 @@ Note: descent methods usually converge to a local minimum.
        but it can be proved that this does not guarantee convergence.
    
    \item[Backtracking procedure] \marginnote{Backtracking procedure}
-        $\alpha_k$ is chose such that it respects the Wolfe condition\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
+        $\alpha_k$ is chosen such that it respects the Wolfe condition\footnote{\url{https://en.wikipedia.org/wiki/Wolfe_conditions}}:
        \begin{lstlisting}[mathescape=true, belowskip = -0.8\baselineskip]
            def backtracking($\tau$, $c_1$):
                $\alpha_k$ = 1 # Initial guess
@ -121,7 +123,7 @@ Note: descent methods usually converge to a local minimum.

 \subsection{Stopping condition}
 \marginnote{Stopping condition}
-We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, $\nabla f(\vec{x}_k) \approx \nullvec$.
+We can stop iterating when $\vec{x}_k \approx \vec{x}^*$, that is, when $\nabla f(\vec{x}_k) \approx \nullvec$.
 We can verify this by checking the norm of the gradient against a tolerance $\tau$:
 \begin{descriptionlist}
    \item[Absolute condition] $\Vert \nabla f(x_k) \Vert_2 < \tau$ 
@ -152,7 +154,7 @@ A generic gradient-like method can then be defined as:
        it may cause numerical instabilities or bad results.
        Heuristics can be used to select an adequate starting point.

-    \item[Flag regions and local optima] \marginnote{Flag regions and local optima}
+    \item[Flat regions and local optima] \marginnote{Flat regions and local optima}
        Flat regions slow down the learning speed,
        while a local optimum causes the method to converge to a poor solution.
        \begin{figure}[ht]
@ -164,7 +166,7 @@ A generic gradient-like method can then be defined as:
    \item[Differential curvature]
        Different magnitudes of the partial derivatives may cause the problem of
        vanishing and exploding gradient. \marginnote{Vanishing gradient\\Exploding gradient}
-        This causes the learning process to require more iterations to correct the direction.
+        This causes the learning process to require more iterations to adjust the direction.

        In practice, as the gradient of complex functions is only an instantaneous direction of best decrease and
        does not represent the direction to the minimum in the long term, 
@ -254,7 +256,7 @@ A generic gradient-like method can then be defined as:


 \subsection{Properties}
-\marginnote{Convex properties}
+% \marginnote{Convex properties}
 \begin{itemize}
    \item $\text{if } f \text{ convex} \Rightarrow \text{any local minimum of } f \text{ is also global}$
    \item $\text{if } f \text{ strictly convex} \Rightarrow \text{the global minimum of } f \text{ is unique}$
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
@ -324,9 +324,9 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
 \end{theorem}


-\begin{theorem}[Spectral theorem] \marginnote{Spectral theorem}
+\begin{theorem}[Spectral theorem] \label{th:spectral_theorem} \marginnote{Spectral theorem}
    Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
-    Its eigenvectors form a orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
+    Its eigenvectors form an orthonormal basis and its eigenvalues are all in $\mathbb{R}$.
 \end{theorem}


--- a/src/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
@ -45,16 +45,22 @@ where:
 \section{Square linear systems}
 \marginnote{Square linear system}
 A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
-has an unique solution iff one of the following conditions is satisfied:
+has a unique solution iff one of the following conditions is satisfied:
 \begin{enumerate}
    \item $\matr{A}$ is non-singular (invertible)
    \item $\text{rank}(\matr{A}) = n$ (full rank)
-    \item $\matr{A}\vec{x}$ admits only the solution $\vec{x} = \nullvec$
+    \item $\matr{A}\vec{x}$ only admits the solution $\vec{x} = \nullvec$
 \end{enumerate}

 The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
 \[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
 However, this approach requires to compute the inverse of a matrix, which has a time complexity of $O(n^3)$.
+Therefore, numerical methods are usually better suited.
+The two main families of methods are:
+\begin{itemize}
+    \item Direct methods.
+    \item Iterative methods.
+\end{itemize}



@ -70,15 +76,19 @@ The most common approach consists in factorizing the matrix $\matr{A}$.
 Given a square linear system $\matr{A}\vec{x} = \vec{b}$, 
 the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
 \begin{itemize}
-    \item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix
-    \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix
+    \item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix.
+    \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix.
 \end{itemize}
 %
-The system can be decomposed to:
+The system can be decomposed into:
 \[
    \begin{split}
        \matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
-            & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \vec{b}
+            & \iff 
+            \begin{cases}
+                \matr{L}\vec{y} = \vec{b} \\
+                \vec{y} = \matr{U}\vec{x} 
+            \end{cases}
    \end{split}
 \]
 To find the solution, it is sufficient to solve in order:
@ -89,7 +99,7 @@ To find the solution, it is sufficient to solve in order:

 The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
 $O(\frac{n^3}{3})$ is the time complexity of the LU factorization. 
-$O(n^2)$ is the complexity to directly solving a system with a triangular matrix (forward or backward substitutions).
+$O(n^2)$ is the complexity to directly solve a system with a triangular matrix (forward or backward substitutions).


 \subsection{Gaussian factorization with pivoting}
@ -97,15 +107,19 @@ $O(n^2)$ is the complexity to directly solving a system with a triangular matrix
 During the computation of $\matr{A} = \matr{L}\matr{U}$ 
 (using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}), 
 a division by 0 may occur.
-A method to prevent this problem (and to lower the algorithmic error) is to change the order of the rows of $\matr{A}$ before decomposing it.
+A method to prevent this problem (and to lower the algorithmic error (i.e. overflows)) is to change the order of the rows of $\matr{A}$ before decomposing it.
 This is achieved by using a permutation matrix $\matr{P}$, which is obtained as a permutation of the identity matrix.

 The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
-The system can be decomposed to:
+The system can be decomposed into:
 \[
    \begin{split}
        \matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
-            & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \matr{P}\vec{b}
+            & \iff 
+            \begin{cases}
+                \matr{L}\vec{y} = \matr{P}\vec{b} \\
+                \vec{y} = \matr{U}\vec{x}
+            \end{cases}
    \end{split}
 \]

@ -117,7 +131,7 @@ The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can


 \subsection{Cholesky factorization}
-Given a symmetric definite positive matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
+Given a symmetric positive definite matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
 It is possible to decompose $\matr{A}$ as:
 \[ \matr{A} = \matr{L}\matr{L}^T \]
 where $\matr{L}$ is lower triangular.
@ -183,7 +197,7 @@ This problem is independent from the algorithm and is estimated using exact arit
 Given a system $\matr{A}\vec{x} = \vec{b}$, we perturbate $\matr{A}$ and/or $\vec{b}$ and study the inherited error.
 For instance, if we perturbate $\vec{b}$, we obtain the following system:
 \[ \matr{A}\tilde{\vec{x}} = (\vec{b} + \Delta\vec{b}) \]
-After finding $\tilde{\vec{x}}$, we can compute the inherited error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
+After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.

 By comparing $\left\Vert \frac{\Delta\vec{x}}{\vec{x}} \right\Vert$ and $\left\Vert \frac{\Delta\vec{b}}{\vec{b}} \right\Vert$, 
 we can compute the error introduced by the perturbation.
@ -201,4 +215,28 @@ Otherwise it is \textbf{well-conditioned}. \marginnote{Well-conditioned}


 \section{Linear least squares problem}
-See \Cref{sec:lls}.
+
+A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$ 
+does not generally have a solution.
+\marginnote{Linear least squares}
+Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
+\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
+In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
+This problem is usually formulated as:
+\[ 
+    \tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
+\]
+It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
+\begin{descriptionlist}
+    \item[$\text{rank}(\matr{A}) = n$] 
+        The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
+        \marginnote{Normal equation}
+        It is found by solving the normal equation:
+        \[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
+        $\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization.
+    
+    \item[$\text{rank}(\matr{A}) < n$]
+        The system admits infinitely many solutions.
+        Of all the solutions $S$, we are interested in the one with minimum norm:
+        \[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
+\end{descriptionlist}
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
@ -98,8 +98,8 @@ The parameters are determined as the most likely to predict the correct label gi
 \begin{description}
    \item[Negative log-likelihood] \marginnote{Negative log-likelihood}
        \sloppy
-        Given a random variable $\bm{x}$, a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$
-        and a predictor, the negative log-likelihood of $\bm{x}$ is:
+        Given a random variable $\bm{x}$ and a probability density $p_\vec{\uptheta}(\bm{x})$ parametrized by $\vec{\uptheta}$, 
+        the negative log-likelihood of $\bm{x}$ is:
        \[ \mathcal{L}_{\bm{x}}(\vec{\uptheta}) = -\log p_\vec{\uptheta}(\bm{x}) \]
        Note that:
        \begin{itemize}
@ -118,7 +118,7 @@ The parameters are determined as the most likely to predict the correct label gi
        Moreover, as the dataset is identically distributed, 
        each $p_\vec{\uptheta}(y_n \vert \bm{x}_n)$ of the product has the same distribution.

-        By applying the logarithm, we have that the negative log-likelihood of a i.i.d. dataset is define as:
+        By applying the logarithm, we have that the negative log-likelihood of an i.i.d. dataset is defined as:
        \[ \mathcal{L}(\vec{\uptheta}) = -\sum_{n=1}^{N} \log p_\vec{\uptheta}(y_n \vert \bm{x}_n) \]
        and to find good parameters $\vec{\uptheta}$, we solve the problem:
        \[ 
@ -173,7 +173,7 @@ The parameters are determined as the most likely to predict the correct label gi
                \caption{When the parameters are bad, the label will be far from the mean}
            \end{subfigure}

-            \caption{Geometric interpretation of the Gaussian likelihood. (not sure if this is correct)}
+            \caption{Geometric interpretation of the Gaussian likelihood}
        \end{figure}
 \end{description}

@ -191,7 +191,7 @@ By applying the Bayes' theorem, the problem becomes:
    \begin{split}
        \min_{\vec{\uptheta} \in \mathbb{R}^D} 
            -\frac{p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta})}{\underbrace{p(\vec{y} \vert \matr{X})}_{\mathclap{\text{constant}}}} &=
-        \min_{\vec{\uptheta} \in \mathbb{R}^D} -p(Y \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
+        \min_{\vec{\uptheta} \in \mathbb{R}^D} -p(\vec{y} \vert \matr{X}, \vec{\uptheta}) p(\vec{\uptheta}) \\
        &= \min_{\vec{\uptheta} \in \mathbb{R}^D} \{ -\log p(\vec{y} \vert \matr{X}, \vec{\uptheta}) -\log p(\vec{\uptheta}) \}
    \end{split}
 \]
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex
@ -10,6 +10,8 @@ then $\matr{A} \in \mathbb{R}^{n \times n}$ can be decomposed into:
 where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and 
 $\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.

+Note that a symmetric matrix can always be decomposed (\Cref{th:spectral_theorem}).
+


 \section{Singular value decomposition}
@ -40,10 +42,10 @@ The singular value decomposition (SVD) of $\matr{A}$ is always possible and has
 where:
 \begin{itemize}
    \item 
-        $\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix with columns $\vec{u}_i$ called left-singular vectors.
+        $\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix whose columns $\vec{u}_i$ are called left-singular vectors.
    
    \item 
-        $\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix with columns $\vec{v}_i$ called right-singular vectors.
+        $\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix whose columns $\vec{v}_i$ are called right-singular vectors.
    
    \item 
        $\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a matrix with $\matr{\Sigma}_{i,j} = 0$ (i.e. diagonal if it was a square matrix) and
@ -79,8 +81,8 @@ For $\matr{A}^T\matr{A}$, we can compute:
 \]
 As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
 \begin{itemize}
-    \item The diagonal of $\matr{\Sigma}^2$ (i.e. the square of the singular values of $A$) are the eigenvalues of $\matr{A}^T\matr{A}$
-    \item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$
+    \item The diagonal entries of $\matr{\Sigma}^2$ (i.e. the squares of the singular values of $\matr{A}$) are the eigenvalues of $\matr{A}^T\matr{A}$.
+    \item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
 \end{itemize}

 The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
@ -99,7 +101,8 @@ We can compute the 2-norm as:
 \[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
 \[ 
    \Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} = 
-    \sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} = \sqrt{\max\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2}\}} = \frac{1}{\sigma_r}
+    \sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} = 
+    \sqrt{\max \left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2} \right\}} = \frac{1}{\sigma_r}
 \]
 Furthermore, we can compute the condition number of $\matr{A}$ as:
 \[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} \]
@ -126,7 +129,7 @@ By considering only the first $k < r$ singular values, we can obtain a rank-$k$
        \hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2 
    \]
 \end{theorem}
-In other words, among all the possible projections, $\hat{\matr{A}}(k)$ is the closer one to $\matr{A}$.
+In other words, among all the possible projections, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$.
 Moreover, the error of the rank-$k$ approximation is:
 \[
    \Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 = 
@ -152,32 +155,15 @@ Therefore, the compression factor is given by: \marginnote{Compression factor}


 \subsection{Application: Linear least squares problem} \label{sec:lls}
-A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$ 
-does not generally have a solution.
-\marginnote{Linear least squares}
-Therefore, instead of finding the exact solution, it is possible to search for a $\tilde{\vec{x}}$ such that:
-\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
-In other words, we aim to find a $\tilde{\vec{x}}$ that is close enough to solve the system.
-This problem is usually formulated as:
+Given a least squares problem:
 \[ 
    \tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
 \]
-It always admits a solution and, depending on $\text{rank}(\matr{A})$, there two possible cases:
-\begin{descriptionlist}
-    \item[$\text{rank}(\matr{A}) = n$] 
-        The solution is unique for each $b \in \mathbb{R}^m$.
-        \marginnote{Normal equation}
-        It is found by solving the normal equation:
-        \[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
-        $\matr{A}^T\matr{A}$ is symmetric definite positive and the system can be solved using the Cholesky factorization.
-    
-    \item[$\text{rank}(\matr{A}) < n$] \marginnote{Least squares using SVD}
-        The system admits infinite solutions.
+When $\text{rank}(\matr{A}) < n$, the system admits infinitely many solutions.
 Of all the solutions $S$, we are interested in the one with minimum norm:
 \[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
 This problem can be solved using SVD:
 \[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
-\end{descriptionlist}


 \subsection{Application: Polynomial interpolation}
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_probability.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_probability.tex
@ -3,9 +3,9 @@

 \begin{description}
    \item[Probability]
-        model of a process where the underlying uncertainty is captured by random variables.
+        Model of a process where the underlying uncertainty is captured by random variables.
    \item[Statistics] 
-        determine the underlying process that explains an observation.
+        Determines the underlying process that explains an observation.
 \end{description}


@ -23,7 +23,7 @@

    \item[Probability] \marginnote{Probability}
        Let $\mathcal{E}$ be the set of all the possible events (i.e. power set of $\Omega$).
-        The probability is a function:
+        The probability of an event is a function:
        \[ \prob{A}: \mathcal{E} \rightarrow [0, 1] \]
        \begin{example}
            Let $\Omega$ be as above.
@ -115,14 +115,14 @@
        \begin{example}
            A coin is tossed twice.

-            The random variable is $X(\omega) = \{ \text{number of heads} \}$.
+            Given the random variable $X(\omega) = \{ \text{number of heads} \}$.
            We have that $\mathcal{T}_X = \{ 0, 1, 2 \}$, therefore $X$ is discrete.
        \end{example}

        \begin{example}
            Roll a die until 6 comes out.

-            The random variable is $Y(\omega) = \{ \text{number of rolls before 6} \}$.
+            Given the random variable $Y(\omega) = \{ \text{number of rolls before 6} \}$.
            We have that $\mathcal{T}_Y = \{ 1, 2, \dots \} = \mathbb{N} \smallsetminus \{0\}$, 
            therefore $Y$ is discrete as $\mathcal{T}_Y$ is a countable set.
        \end{example}
@ -143,7 +143,7 @@
        \begin{example}
            Let $\Omega = \{ (\text{T}, \text{T}), (\text{T}, \text{H}), (\text{H}, \text{T}), (\text{H}, \text{H}) \}$.
            Given a random variable $X = \{ \text{number of heads} \}$ with $\mathcal{T}_X = \{ 0, 1, 2 \}$.
-            The PMF is:
+            Its PMF is:
            \[
                \begin{split}
                    p_X &= \prob{X = 0} = \frac{1}{4} \\
@ -160,7 +160,7 @@
 \begin{description}
    \item[Continuous random variable] \marginnote{Continuous random variable}
        A random variable $X$ is continuous if its target space $\mathcal{T}_X$ is uncountably infinite (i.e. a subset of $\mathbb{R}$).
-        Usually, $\mathcal{T}_X$ is an interval or union of intervals.
+        Usually, $\mathcal{T}_X$ is an interval or a union of intervals.

        \begin{example}
            Given a random variable $Z = \{ \text{Time before the arrival of a client} \}$.
@ -210,25 +210,25 @@
            \end{center}
            We denote with:
            \begin{itemize}
-                \item $N$ the number of events
-                \item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p(x, y) = n_{ij}$)
-                \item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column
-                \item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row
+                \item $N$ the number of events.
+                \item $n_{ij}$ the number of events with state $X=x_i$ and $Y=y_j$ (i.e. $p_{XY}(x_i, y_j) = \frac{n_{ij}}{N}$).
+                \item $c_i = \sum_{j=1}^{3} n_{ij}$ the sum of the $i$-th column.
+                \item $r_j = \sum_{i=1}^{5} n_{ij}$ the sum of the $j$-th row.
            \end{itemize}

            The marginal probabilities are:\\
            \begin{minipage}{.48\linewidth}
                \centering
-                \[ p(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
+                \[ p_X(x_i) = \prob{X = x_i} = \frac{c_i}{N} \]
            \end{minipage}
            \begin{minipage}{.48\linewidth}
                \centering
-                \[ p(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
+                \[ p_Y(y_j) = \prob{Y = y_j} = \frac{r_j}{N} \]
            \end{minipage}

            The conditional probabilities can be computed as:
-            \[ \prob{Y = y_j \vert X = x_i} = \frac{p(x_i, y_i)}{p(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
-            \[ \prob{X = x_i \vert Y = y_j} = \frac{p(x_i, y_i)}{p(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
+            \[ \prob{Y = y_j \vert X = x_i} = \frac{p_{XY}(x_i, y_j)}{p_X(x_i)} = \frac{n_{ij}/N}{c_i/N} = \frac{n_{ij}}{c_i} \]
+            \[ \prob{X = x_i \vert Y = y_j} = \frac{p_{XY}(x_i, y_j)}{p_Y(y_j)} = \frac{n_{ij}/N}{r_j/N} = \frac{n_{ij}}{r_j} \]
        \end{example}
 \end{description}

@ -240,18 +240,18 @@
 \marginnote{Sum rule\\Marginalization property}
 Given $X$ and $Y$ random variables. The sum rule states that:
 \[
-    p(\bm{x}) =
+    p_X(\bm{x}) =
    \begin{cases}
-        \sum_{\bm{y} \in \mathcal{T}_Y} p(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
-        \int_{\mathcal{T}_Y} p(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
+        \sum_{\bm{y} \in \mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) & \text{if } \bm{y} \text{ discrete} \\
+        \int_{\mathcal{T}_Y} p_{XY}(\bm{x}, \bm{y}) \,d\bm{y} & \text{if } \bm{y} \text{ continuous}
    \end{cases}
 \]

-The sum rule relates the joint distribution and a marginal distribution.
+The sum rule relates the joint distribution and the marginal distribution.
 In fact, the sum rule can be applied to any subset of the random variables of a joint distribution.
 Given $\bm{x} = \begin{pmatrix} x_1, \dots, x_D \end{pmatrix}^T$, 
 the marginal w.r.t. $x_i$ can be obtained by integrating/summing out all random variables except $x_i$:
-\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\backslash i} \]
+\[ p(x_i) = \int p(x_1, \dots, x_D) \,d\bm{x}_{\smallsetminus i} \]

 \subsection{Product rule}
 \marginnote{Product rule}
@ -302,7 +302,7 @@ Note: sometimes, instead of the full posterior, the maximum is considered (with

 \begin{description}
    \item[Statistic] \marginnote{Statistic}
-        A statistic of a random variable is a deterministic function of it. 
+        A statistic of a random variable is a deterministic function defined on it. 
 \end{description}


@ -447,7 +447,7 @@ Two random variables $X$ and $Y$ are conditionally independent given $Z$ iff:
 \marginnote{Inner product of random variables}
 Given two zero mean random variables $X$ and $Y$, their inner product is defined as:
 \[ \left\langle X, Y \right\rangle = \text{Cov}[x, y] \]
-The covariance matrix is symmetric, positive definite.
+The covariance matrix is symmetric positive definite.

 Moreover, we have that:
 \begin{itemize}
@ -465,7 +465,7 @@ Moreover, we have that:
 \subsection{Discrete random variables}
 \begin{descriptionlist}
    \item[Uniform distribution] \marginnote{Uniform distribution}
-        Given a discrete random variable $X$ with $\#(\mathcal{T}_X) = N$,
+        Given a discrete random variable $X$ with $\vert \mathcal{T}_X \vert = N$,
        $X$ has a uniform distribution if:
        \[ p_X(x) = \frac{1}{N}, \forall x \in \mathcal{T}_X \]
    
--- a/src/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex
+++ b/src/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex
@ -49,11 +49,11 @@
        Let $f: \mathbb{R}^n \rightarrow \mathbb{R}$ and $\vec{g}$ a vector of $n$ functions $g_i: \mathbb{R}^m \rightarrow \mathbb{R}$:
        \[
            \frac{\partial}{\partial \vec{x}} (f \circ \vec{g})(\vec{x}) = 
-                \frac{\partial}{\partial \vec{x}} (f(\vec{g}(\vec{x}))) =
+                \frac{\partial}{\partial \vec{x}} \Big( f(\vec{g}(\vec{x})) \Big) =
                \frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial \vec{x}}
        \]

-        More precisely, considering a $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables 
+        For instance, consider a function $f: \mathbb{R}^2 \rightarrow \mathbb{R}$ of two variables 
        $g_1(t), g_2(t): \mathbb{R} \rightarrow \mathbb{R}$ that are functions of $t$. 
        The gradient of $f$ with respect to $t$ is:
        \[
@ -71,7 +71,7 @@
        the second matrix contains in the $i$-th row the gradient of $g_i$.

        Therefore, if $g_i$ are in turn multivariate functions $g_1(s, t), g_2(s, t): \mathbb{R}^2 \rightarrow \mathbb{R}$,
-        the chain rule can be applies as:
+        the chain rule can be applied as follows:
        \[
            \frac{\text{d}f}{\text{d}(s, t)} = 
            \begin{pmatrix}
@ -96,26 +96,26 @@
        \end{example}

        \begin{example}
-            Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ g)(t)$ where:
-            \[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(\vec{x}) = \exp(x_1 x_2^2) \]
+            Let $h: \mathbb{R} \rightarrow \mathbb{R}$ be defined as $h(t) = (f \circ \vec{g})(t) = f(\vec{g}(t))$ where:
+            \[ f: \mathbb{R}^2 \rightarrow \mathbb{R} \text{ is defined as } f(g_1, g_2) = \exp(g_1 g_2^2) \]
            \[ 
-                g: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as } 
-                \vec{g}(t) = \begin{pmatrix} x_1 \\ x_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
+                \vec{g}: \mathbb{R} \rightarrow \mathbb{R}^2 \text{ is defined as } 
+                \vec{g}(t) = \begin{pmatrix} g_1 \\ g_2 \end{pmatrix} = \begin{pmatrix}t \cos(t) \\ t \sin(t) \end{pmatrix}
            \]
            The gradient of $h$ with respect to $t$ can be computed as:
            \[
                \frac{\text{d} h}{\text{d} t} =
                    \frac{\partial f}{\partial \vec{g}} \frac{\partial \vec{g}}{\partial t} =
                    \begin{pmatrix}
-                        \frac{\partial f}{\partial x_1} & \frac{\partial f}{\partial x_2}
+                        \frac{\partial f}{\partial g_1} & \frac{\partial f}{\partial g_2}
                    \end{pmatrix}
                    \begin{pmatrix}
-                        \frac{\partial x_1}{\partial t} \\ \frac{\partial x_2}{\partial t}
+                        \frac{\partial g_1}{\partial t} \\ \frac{\partial g_2}{\partial t}
                    \end{pmatrix}
            \]
            \[
                = 
-                \begin{pmatrix} \exp(x_1 x_2^2)x_2^2 & 2\exp(x_1 x_2^2)x_1 x_2 \end{pmatrix}
+                \begin{pmatrix} \exp(g_1 g_2^2)g_2^2 & 2\exp(g_1 g_2^2)g_1 g_2 \end{pmatrix}
                \begin{pmatrix} \cos(t) + (-t\sin(t)) \\ \sin(t) + t\cos(t) \end{pmatrix}
            \]
        \end{example}
@ -210,7 +210,7 @@ We can more compactly denote a neural network with input $\vec{x}$ and $K$ layer
        \vec{f}_i &= \sigma_i(\matr{A}_{i-1} \vec{f}_{i-1} + \vec{b}_{i-1}) \text{ } i=1, \dots, K
    \end{split}
 \]
-Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimizes the squared loss:
+Given the ground truth $\vec{y}$, we want to find the parameters $\matr{A}_j$ and $\vec{b}_j$ that minimize the squared loss:
 \[ L(\vec{\uptheta}) = \Vert \vec{y} - \vec{f}_K(\vec{\uptheta}, \vec{x}) \Vert^2 \]
 where $\vec{\uptheta} = \{ \matr{A}_{0}, \vec{b}_{0}, \dots, \matr{A}_{K-1}, \vec{b}_{K-1} \}$ are the parameters of each layer.
 This can be done by using the chain rule to compute the partial derivatives of $L$ with respect to the parameters $\vec{\uptheta}_j = \{ \matr{A}_j, \vec{b}_j \}$:
@ -260,12 +260,12 @@ In other words, each intermediate variable is expressed as an elementary functio
 The derivatives of $f$ can then be computed step-by-step going backwards as:
 \[ \frac{\partial f}{\partial x_D} = 1 \text{, as by definition } f = x_D \]
 \[ 
-    \frac{\partial f}{\partial x_i} = \sum_{\forall x_j: x_i \in \text{Pa}(x_j)} \frac{\partial f}{\partial x_j} \frac{\partial x_j}{\partial x_i}
-        = \sum_{\forall x_j: x_i \in \text{Pa}(x_j)} \frac{\partial f}{\partial x_j} \frac{\partial g_j}{\partial x_i}
+    \frac{\partial f}{\partial x_i} = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial x_c}{\partial x_i}
+        = \sum_{\forall x_c: x_i \in \text{Pa}(x_c)} \frac{\partial f}{\partial x_c} \frac{\partial g_c}{\partial x_i}
 \]
-where $\text{Pa}(x_j)$ is the set of parent nodes of $x_j$ in the graph.
+where $\text{Pa}(x_c)$ is the set of parent nodes of $x_c$ in the graph.
 In other words, to compute the partial derivative of $f$ w.r.t. $x_i$, 
-we apply the chain rule by first computing 
+we apply the chain rule by computing 
 the partial derivative of $f$ w.r.t. the variables following $x_i$ in the graph (as the computation goes backwards).

 Automatic differentiation is applicable to all functions that can be expressed as a computational graph and 
@ -327,8 +327,8 @@ Note that backpropagation is a special case of automatic differentiation.
    \begin{minipage}{.5\linewidth}
        \[
            \begin{split}
-                \frac{\partial f}{\partial d} &= \text{ already known (previous step)} \\
-                \frac{\partial f}{\partial e} &= \text{ already known (previous step)} \\
+                \frac{\partial f}{\partial d} &= \text{ known (previous step)} \\
+                \frac{\partial f}{\partial e} &= \text{ known (previous step)} \\
                \frac{\partial f}{\partial c} &= 
                    \frac{\partial f}{\partial d}\frac{\partial d}{\partial c} + \frac{\partial f}{\partial e}\frac{\partial e}{\partial c} \\
            \end{split}