Mirror of https://github.com/NotXia/unibo-ai-notes.git
Fix typos
@@ -12,7 +12,7 @@
 Propagation of rounding errors in each step of an algorithm.
 
 \item[Truncation error] \marginnote{Truncation error}
-Approximating an infinite procedure into a finite number of iterations.
+Approximating an infinite procedure to a finite number of iterations.
 
 \item[Inherent error] \marginnote{Inherent error}
 Caused by the finite representation of the data (floating-point).
@@ -30,16 +30,16 @@
 Let $x$ be a value and $\hat{x}$ its approximation. Then:
 \begin{descriptionlist}
 \item[Absolute error]
-\begin{equation}
+\[
 E_{a} = \hat{x} - x
 \marginnote{Absolute error}
-\end{equation}
+\]
 Note that, out of context, the absolute error is meaningless.
 \item[Relative error]
-\begin{equation}
+\[
 E_{r} = \frac{\hat{x} - x}{x}
 \marginnote{Relative error}
-\end{equation}
+\]
 \end{descriptionlist}
 
 
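As a quick illustration of the two definitions above (a snippet of mine, not part of the commit):

```python
# Absolute vs. relative error of an approximation x_hat of x.
x = 1 / 3
x_hat = 0.333

E_a = x_hat - x        # absolute error: meaningless without the scale of x
E_r = (x_hat - x) / x  # relative error: puts E_a in proportion to x

print(E_a)  # ~ -3.3e-04
print(E_r)  # ~ -1.0e-03
```

The same absolute error would be negligible for $x \approx 10^6$ and catastrophic for $x \approx 10^{-6}$, which is why the relative error is the meaningful quantity.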
@@ -48,9 +48,9 @@ Let $x$ be a value and $\hat{x}$ its approximation. Then:
 
 Let $\beta \in \mathbb{N}_{> 1}$ be the base.
 Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
-\begin{equation} \label{eq:finnum_b_representation}
-x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots d_n\beta^{-n})\beta^p
-\end{equation}
+\[ \label{eq:finnum_b_representation}
+x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots + d_n\beta^{-n})\beta^p
+\]
 where:
 \begin{itemize}
 \item $0 \leq d_i \leq \beta-1$
@@ -59,9 +59,9 @@ where:
 \end{itemize}
 %
 \Cref{eq:finnum_b_representation} can be represented using the normalized scientific notation as: \marginnote{Normalized scientific notation}
-\begin{equation}
+\[
 x = \pm (0.d_1d_2\dots) \beta^p
-\end{equation}
+\]
 where $0.d_1d_2\dots$ is the \textbf{mantissa} and $\beta^p$ the \textbf{exponent}. \marginnote{Mantissa\\Exponent}
 
 
@@ -73,11 +73,13 @@ A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the paramete
 \item $t$: precision (number of digits in the mantissa)
 \item $[L, U]$: range of the exponent
 \end{itemize}
-%
+
 Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
 \begin{eqnarray}
 x = \pm (0.d_1d_2 \dots d_t) \beta^p & L \leq p \leq U
 \end{eqnarray}
+We denote with $\texttt{fl}(x)$ the representation of $x \in \mathbb{R}$ in a given floating-point system.
 
 \begin{example}
 In $\mathcal{F}(10, 5, -3, 3)$, $x=12.\bar{3}$ is represented as:
 \begin{equation*}
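A minimal Python sketch of such a toy system (the `fl` helper and its flush-to-zero underflow are my assumptions for illustration, not code from the notes):

```python
import math

def fl(x, beta=10, t=5, L=-3, U=3):
    """Round x to t mantissa digits in base beta: toy model of F(beta, t, L, U)."""
    if x == 0:
        return 0.0
    p = math.floor(math.log(abs(x), beta)) + 1  # exponent so the mantissa lies in [1/beta, 1)
    if p < L:
        return 0.0                              # underflow: flushed to zero
    if p > U:
        raise OverflowError("exponent above U")
    m = round(x / beta**p, t)                   # keep t mantissa digits (rounding)
    return m * beta**p

print(fl(12 + 1/3))  # 12.333 -> mantissa 0.12333, exponent 10^2
```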
@@ -101,21 +103,20 @@ It must be noted that there is an underflow area around 0.
 \end{figure}
 
 
-\subsection{Numbers representation}
+\subsection{Number representation}
 Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
 \begin{descriptionlist}
 \item[Exact representation]
 if $p \in [L, U]$ and $d_i=0$ for $i>t$.
 
-\item[Approximation]
+\item[Approximation] \marginnote{Truncation\\Rounding}
 if $p \in [L, U]$ but $d_i$ may not be 0 for $i>t$.
 In this case, the representation is obtained by truncating or rounding the value.
-\marginnote{Truncation\\Rounding}
 
-\item[Underflow]
-if $p < L$. In this case, the values is approximated as 0.
+\item[Underflow] \marginnote{Underflow}
+if $p < L$. In this case, the value is approximated to 0.
 
-\item[Overflow]
+\item[Overflow] \marginnote{Overflow}
 if $p > U$. In this case, an exception is usually raised.
 \end{descriptionlist}
 
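IEEE-754 doubles show the same three regimes, except that overflow yields `inf` rather than an exception (a snippet of mine, not part of the commit):

```python
print(1e-320 * 1e-10)  # 0.0: exponent below L, underflow to zero
print(1e308 * 10)      # inf: exponent above U (doubles return inf instead of raising)
```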
@@ -179,16 +180,17 @@ Let:
 %
 To compute $x \oplus y$, a machine:
 \begin{enumerate}
-\item Calculates $x + y$ in a high precision register (still approximated, but more precise than the storing system)
+\item Calculates $x + y$ in a high precision register
+(still approximated, but more precise than the floating-point system used to store the result)
 \item Stores the result as $\texttt{fl}(x + y)$
 \end{enumerate}
 
 A floating-point operation causes a small rounding error:
-\begin{equation}
+\[
 \left\vert \frac{(x \oplus y) - (x + y)}{x+y} \right\vert < \varepsilon_{\text{mach}}
-\end{equation}
+\]
 %
-Although, some operations may be subject to the \textbf{cancellation} problem which causes information loss.
+However, some operations may be subject to the \textbf{cancellation} problem which causes information loss.
 \marginnote{Cancellation}
 \begin{example}
 Given $x = 1$ and $y = 1 \cdot 10^{-16}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, L, U)$.\\
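The example is easy to reproduce with IEEE-754 doubles, which behave roughly like $\mathcal{F}(2, 53, -1021, 1024)$ (my snippet):

```python
x, y = 1.0, 1e-16
print(x + y == x)   # True: y is smaller than half a unit in the last place of x
print((x + y) - x)  # 0.0 instead of 1e-16: the information carried by y is lost
```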
@@ -16,7 +16,8 @@ A vector space has the following properties:
 \item Addition is commutative and associative
 \item A null vector exists: $\exists \nullvec \in V$ s.t. $\forall \vec{u} \in V: \nullvec + \vec{u} = \vec{u} + \nullvec = \vec{u}$
 \item An identity element for scalar multiplication exists: $\forall \vec{u} \in V: 1\vec{u} = \vec{u}$
-\item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$
+\item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$.\\
+$\vec{a}$ is denoted as $-\vec{u}$.
 \item Distributive properties:
 \[ \forall \alpha \in \mathbb{R}, \forall \vec{u}, \vec{w} \in V: \alpha(\vec{u} + \vec{w}) = \alpha \vec{u} + \alpha \vec{w} \]
 \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha + \beta)\vec{u} = \alpha \vec{u} + \beta \vec{u} \]
@@ -24,7 +25,7 @@ A vector space has the following properties:
 \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha \beta)\vec{u} = \alpha (\beta \vec{u}) \]
 \end{enumerate}
 %
-A subset $U \subseteq V$ of a vector space $V$, is a \textbf{subspace} iff $U$ is a vector space.
+A subset $U \subseteq V$ of a vector space $V$ is a \textbf{subspace} iff $U$ is a vector space.
 \marginnote{Subspace}
 
 
@@ -95,7 +96,7 @@ The norm of a vector is a function: \marginnote{Vector norm}
 such that for each $\lambda \in \mathbb{R}$ and $\vec{x}, \vec{y} \in \mathbb{R}^n$:
 \begin{itemize}
 \item $\Vert \vec{x} \Vert \geq 0$
-\item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = 0$
+\item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = \nullvec$
 \item $\Vert \lambda \vec{x} \Vert = \vert \lambda \vert \cdot \Vert \vec{x} \Vert$
 \item $\Vert \vec{x} + \vec{y} \Vert \leq \Vert \vec{x} \Vert + \Vert \vec{y} \Vert$
 \end{itemize}
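These axioms are easy to check numerically with NumPy (an illustration of mine, not part of the diff):

```python
import numpy as np

x = np.array([3.0, -4.0])
y = np.array([1.0, 2.0])

print(np.linalg.norm(x))                                # 5.0: Euclidean norm, >= 0
print(np.linalg.norm(-2 * x) == 2 * np.linalg.norm(x))  # True: absolute homogeneity
print(np.linalg.norm(x + y) <= np.linalg.norm(x) + np.linalg.norm(y))  # True: triangle inequality
```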
@@ -110,7 +111,7 @@ Common norms are:
 \end{descriptionlist}
 %
 In general, different norms tend to maintain the same proportion.
-In some cases, unbalanced results may be given when comparing different norms.
+In some cases, unbalanced results may be obtained when comparing different norms.
 \begin{example}
 Let $\vec{x} = (1, 1000)$ and $\vec{y} = (999, 1000)$. Their norms are:
 \begin{center}
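The example can be reproduced with NumPy (my snippet): the $\infty$-norms of the two vectors coincide, while the 1-norms differ by a factor of about 2.

```python
import numpy as np

x = np.array([1.0, 1000.0])
y = np.array([999.0, 1000.0])

for p in (1, 2, np.inf):
    print(p, np.linalg.norm(x, p), np.linalg.norm(y, p))
# 1    1001.0   1999.0   -> very different
# 2   ~1000.0  ~1413.5   -> different
# inf  1000.0   1000.0   -> identical
```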
@@ -130,7 +131,7 @@ The norm of a matrix is a function: \marginnote{Matrix norm}
 such that for each $\lambda \in \mathbb{R}$ and $\matr{A}, \matr{B} \in \mathbb{R}^{m \times n}$:
 \begin{itemize}
 \item $\Vert \matr{A} \Vert \geq 0$
-\item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \bar{0}$
+\item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \matr{0}$
 \item $\Vert \lambda \matr{A} \Vert = \vert \lambda \vert \cdot \Vert \matr{A} \Vert$
 \item $\Vert \matr{A} + \matr{B} \Vert \leq \Vert \matr{A} \Vert + \Vert \matr{B} \Vert$
 \end{itemize}
@@ -141,7 +142,7 @@ Common norms are:
 $\Vert \matr{A} \Vert_2 = \sqrt{ \rho(\matr{A}^T\matr{A}) }$,\\
 where $\rho(\matr{X})$ is the largest absolute value of the eigenvalues of $\matr{X}$ (spectral radius).
 
-\item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$
+\item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$ (i.e. max sum of the columns in absolute value)
 
 \item[Frobenius norm] $\Vert \matr{A} \Vert_F = \sqrt{ \sum_{i=1}^{m} \sum_{j=1}^{n} a_{i,j}^2 }$
 \end{descriptionlist}
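The three norms, and the spectral-radius definition of the 2-norm, can be cross-checked with NumPy (an illustration of mine):

```python
import numpy as np

A = np.array([[1.0, -2.0],
              [3.0,  4.0]])

print(np.linalg.norm(A, 2))                        # spectral norm
print(np.sqrt(np.linalg.eigvalsh(A.T @ A).max()))  # same value: sqrt(rho(A^T A))
print(np.linalg.norm(A, 1))                        # 6.0: max column sum in absolute value
print(np.linalg.norm(A, 'fro'))                    # sqrt(1+4+9+16) ~ 5.477
```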
@@ -210,12 +211,12 @@ Common norms are:
 \end{enumerate}
 
 \item[Orthogonal basis] \marginnote{Orthogonal basis}
-Given an $n$-dimensional vector space $V$ and a basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$.
+Given a $n$-dimensional vector space $V$ and a basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$.
 $\beta$ is an orthogonal basis if:
 \[ \vec{b}_i \perp \vec{b}_j \text{ for } i \neq j \text{ (i.e.} \left\langle \vec{b}_i, \vec{b}_j \right\rangle = 0 \text{)} \]
 
 \item[Orthonormal basis] \marginnote{Orthonormal basis}
-Given an $n$-dimensional vector space $V$ and an orthogonal basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$.
+Given a $n$-dimensional vector space $V$ and an orthogonal basis $\beta = \{ \vec{b}_1, \dots, \vec{b}_n \}$ of $V$.
 $\beta$ is an orthonormal basis if:
 \[ \Vert \vec{b}_i \Vert_2 = 1 \text{ (or} \left\langle \vec{b}_i, \vec{b}_i \right\rangle = 1 \text{)} \]
 
@@ -267,7 +268,7 @@ and is found by minimizing the distance between $\pi_U(\vec{x})$ and $\vec{x}$.
 
 Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
 $\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
-with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if \marginnote{Eigenvector}
+with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if: \marginnote{Eigenvector}
 \[ \matr{A}\vec{x} = \lambda\vec{x} \]
 
 It is equivalent to say that:
@@ -295,7 +296,7 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
 
 \begin{description}
 \item[Eigenspace] \marginnote{Eigenspace}
-Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalues $\lambda$.
+Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalue $\lambda$.
 This set is a subspace of $\mathbb{R}^n$.
 
 \item[Eigenspectrum] \marginnote{Eigenspectrum}
@@ -306,7 +307,7 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
 \begin{description}
 \item[Geometric multiplicity] \marginnote{Geometric multiplicity}
 Given an eigenvalue $\lambda$ of a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
-The geometric multiplicity of $\lambda$ is the number of linearly independent eigenvectors associated with $\lambda$.
+The geometric multiplicity of $\lambda$ is the number of linearly independent eigenvectors associated to $\lambda$.
 \end{description}
 
 
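A small NumPy check of the definition $\matr{A}\vec{x} = \lambda\vec{x}$ (my snippet, not from the notes):

```python
import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])

lam, X = np.linalg.eig(A)   # eigenvalues and eigenvectors (columns of X)
print(lam)                  # e.g. [3. 1.]
for i in range(len(lam)):
    print(np.allclose(A @ X[:, i], lam[i] * X[:, i]))  # True: A x = lambda x
```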
@@ -54,7 +54,7 @@ has an unique solution iff one of the following conditions is satisfied:
 
 The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
 \[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
-However this approach requires to compute the inverse of a matrix, which has a time complexity of $O(n^3)$.
+However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.
 
 
 
@@ -74,21 +74,23 @@ the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} =
 \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix
 \end{itemize}
 %
-As directly solving a system with a triangular matrix has complexity $O(n^2)$ (forward or backward substitutions),
-the system can be decomposed to:
-\begin{equation}
+The system can be decomposed to:
+\[
 \begin{split}
 \matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
 & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \vec{b}
 \end{split}
-\end{equation}
+\]
 To find the solution, it is sufficient to solve in order:
 \begin{enumerate}
 \item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
 \item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
 \end{enumerate}
 
-The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$
+The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
+$O(\frac{n^3}{3})$ is the time complexity of the LU factorization.
+$O(n^2)$ is the complexity of directly solving a system with a triangular matrix (forward or backward substitutions).
 
 
 \subsection{Gaussian factorization with pivoting}
 \marginnote{Gaussian factorization with pivoting}
@@ -100,12 +102,12 @@ This is achieved by using a permutation matrix $\matr{P}$, which is obtained as
 
 The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
 The system can be decomposed to:
-\begin{equation}
+\[
 \begin{split}
 \matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
 & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \matr{P}\vec{b}
 \end{split}
-\end{equation}
+\]
 
 An alternative formulation (which is what \texttt{SciPy} uses)
 is defined as:
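A sketch of the whole procedure with SciPy, which implements Gaussian factorization with partial pivoting (the matrix and right-hand side are made up for illustration):

```python
import numpy as np
from scipy.linalg import lu_factor, lu_solve

A = np.array([[2.0, 1.0],
              [4.0, 3.0]])
b = np.array([3.0, 7.0])

lu, piv = lu_factor(A)        # O(n^3/3): LU factorization with partial pivoting
x = lu_solve((lu, piv), b)    # O(n^2): forward then backward substitution
print(x)                      # [1. 1.]
print(np.allclose(A @ x, b))  # True
```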
@@ -132,7 +134,7 @@ The two most common families of iterative methods are:
 compute the sequence as:
 \[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
 where $\matr{B}$ is called iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system.
-The time complexity per iteration $O(n^2)$.
+The time complexity per iteration is $O(n^2)$.
 
 \item[Gradient-like methods] \marginnote{Gradient-like methods}
 have the form:
@@ -142,20 +144,20 @@ The two most common families of iterative methods are:
 
 \subsection{Stopping criteria}
 \marginnote{Stopping criteria}
-One ore more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
+One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
 The most common approaches are:
 \begin{descriptionlist}
 \item[Residual based]
 The algorithm is terminated when the current solution is close enough to the exact solution.
 The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
-Given a tolerance $\varepsilon$, the algorithm stops when:
+Given a tolerance $\varepsilon$, the algorithm may stop when:
 \begin{itemize}
-\item $\Vert \vec{r}_k \Vert \leq \varepsilon$
-\item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$
+\item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute)
+\item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative)
 \end{itemize}
 
 \item[Update based]
-The algorithm is terminated when the change between iterations is very small.
+The algorithm is terminated when the difference between iterations is very small.
 Given a tolerance $\tau$, the algorithm stops when:
 \[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
 \end{descriptionlist}
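A sketch of a stationary method (Jacobi) combining both stopping criteria; everything here (function name, tolerances, test system) is my own illustration, not from the notes:

```python
import numpy as np

def jacobi(A, b, eps=1e-10, tau=1e-12, max_iter=10_000):
    D = np.diag(A)               # diagonal of A
    R = A - np.diagflat(D)       # off-diagonal part: A = D + R
    x = np.zeros_like(b)
    for _ in range(max_iter):
        x_new = (b - R @ x) / D  # one step of x_k = B x_{k-1} + d
        if np.linalg.norm(b - A @ x_new) / np.linalg.norm(b) <= eps:  # relative residual
            return x_new
        if np.linalg.norm(x_new - x) <= tau:                          # update based
            return x_new
        x = x_new
    return x

A = np.array([[4.0, 1.0],
              [2.0, 5.0]])       # strictly diagonally dominant: Jacobi converges
b = np.array([6.0, 12.0])
print(jacobi(A, b))              # ~ [1. 2.]
```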
@@ -183,5 +185,5 @@ Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as:
 \[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \]
 
 A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned}
-(i.e. small perturbation on the input causes large changes in the output).
+(i.e. a small perturbation of the input causes a large change of the output).
 Otherwise it is \textbf{well-conditioned}. \marginnote{Well-conditioned}
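NumPy exposes $K(\matr{A})$ directly; the Hilbert matrix is a classic ill-conditioned example (a snippet of mine):

```python
import numpy as np
from scipy.linalg import hilbert

print(np.linalg.cond(np.eye(4)))   # 1.0: as well-conditioned as it gets
print(np.linalg.cond(hilbert(8)))  # ~1.5e10: ill-conditioned
```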