From 736ef140106e02c14e0f40ffc87930ecf8278bd9 Mon Sep 17 00:00:00 2001
From: NotXia <35894453+NotXia@users.noreply.github.com>
Date: Sun, 24 Sep 2023 18:05:19 +0200
Subject: [PATCH] Fix typos

---
 .../sections/_finite_numbers.tex   | 44 ++++++++++---------
 .../sections/_linear_algebra.tex   | 23 +++++-----
 .../sections/_linear_systems.tex   | 32 +++++++-------
 3 files changed, 52 insertions(+), 47 deletions(-)

diff --git a/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex b/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
index 1c33f8f..105c13f 100644
--- a/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
+++ b/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
@@ -12,7 +12,7 @@
         Propagation of rounding errors in each step of an algorithm.

     \item[Truncation error] \marginnote{Truncation error}
-        Approximating an infinite procedure into a finite number of iterations.
+        Approximating an infinite procedure with a finite number of iterations.

     \item[Inherent error] \marginnote{Inherent error}
         Caused by the finite representation of the data (floating-point).
@@ -30,16 +30,16 @@
 Let $x$ be a value and $\hat{x}$ its approximation. Then:
 \begin{descriptionlist}
     \item[Absolute error]
-        \begin{equation}
+        \[
             E_{a} = \hat{x} - x
             \marginnote{Absolute error}
-        \end{equation}
+        \]
         Note that, out of context, the absolute error is meaningless.

     \item[Relative error]
-        \begin{equation}
-            E_{a} = \frac{\hat{x} - x}{x}
+        \[
+            E_{r} = \frac{\hat{x} - x}{x}
             \marginnote{Relative error}
-        \end{equation}
+        \]
 \end{descriptionlist}
@@ -48,9 +48,9 @@ Let $x$ be a value and $\hat{x}$ its approximation. Then:

 Let $\beta \in \mathbb{N}_{> 1}$ be the base.
 Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
 \begin{equation} \label{eq:finnum_b_representation}
-    x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots d_n\beta^{-n})\beta^p
+    x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots + d_n\beta^{-n})\beta^p
 \end{equation}
 where:
 \begin{itemize}
     \item $0 \leq d_i \leq \beta-1$
@@ -59,9 +59,9 @@ where:
 \end{itemize}
 %
 \Cref{eq:finnum_b_representation} can be represented using the normalized scientific notation as: \marginnote{Normalized scientific notation}
-\begin{equation}
+\[
     x = \pm (0.d_1d_2\dots) \beta^p
-\end{equation}
+\]
 where $0.d_1d_2\dots$ is the \textbf{mantissa} and $\beta^p$ the \textbf{exponent}.
 \marginnote{Mantissa\\Exponent}

@@ -73,11 +73,13 @@ A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the paramete
     \item $t$: precision (number of digits in the mantissa)
     \item $[L, U]$: range of the exponent
 \end{itemize}
-%
+
 Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
 \begin{eqnarray}
     x = \pm (0.d_1d_2 \dots d_t) \beta^p & L \leq p \leq U
 \end{eqnarray}
+We denote by $\texttt{fl}(x)$ the representation of $x \in \mathbb{R}$ in a given floating-point system.
+
 \begin{example}
     In $\mathcal{F}(10, 5, -3, 3)$, $x=12.\bar{3}$ is represented as:
     \begin{equation*}
@@ -101,21 +103,20 @@ It must be noted that there is an underflow area around 0.
 \end{figure}

-\subsection{Numbers representation}
+\subsection{Number representation}

 Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
 \begin{descriptionlist}
     \item[Exact representation]
         if $p \in [L, U]$ and $d_i=0$ for $i>t$.
- \item[Approximation] + \item[Approximation] \marginnote{Truncation\\Rounding} if $p \in [L, U]$ but $d_i$ may not be 0 for $i>t$. In this case, the representation is obtained by truncating or rounding the value. - \marginnote{Truncation\\Rounding} - \item[Underflow] - if $p < L$. In this case, the values is approximated as 0. + \item[Underflow] \marginnote{Underflow} + if $p < L$. In this case, the value is approximated to 0. - \item[Overflow] + \item[Overflow] \marginnote{Overflow} if $p > U$. In this case, an exception is usually raised. \end{descriptionlist} @@ -179,16 +180,17 @@ Let: % To compute $x \oplus y$, a machine: \begin{enumerate} - \item Calculates $x + y$ in a high precision register (still approximated, but more precise than the storing system) + \item Calculates $x + y$ in a high precision register + (still approximated, but more precise than the floating-point system used to store the result) \item Stores the result as $\texttt{fl}(x + y)$ \end{enumerate} A floating-point operation causes a small rounding error: -\begin{equation} +\[ \left\vert \frac{(x \oplus y) - (x + y)}{x+y} \right\vert < \varepsilon_{\text{mach}} -\end{equation} +\] % -Although, some operations may be subject to the \textbf{cancellation} problem which causes information loss. +However, some operations may be subject to the \textbf{cancellation} problem which causes information loss. \marginnote{Cancellation} \begin{example} Given $x = 1$ and $y = 1 \cdot 10^{-16}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, U, L)$.\\ diff --git a/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex b/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex index b24506f..84245c8 100644 --- a/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex +++ b/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex @@ -16,7 +16,8 @@ A vector space has the following properties: \item Addition is commutative and associative \item A null vector exists: $\exists \nullvec \in V$ s.t. $\forall \vec{u} \in V: \nullvec + \vec{u} = \vec{u} + \nullvec = \vec{u}$ \item An identity element for scalar multiplication exists: $\forall \vec{u} \in V: 1\vec{u} = \vec{u}$ - \item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$ + \item Each vector has its opposite: $\forall \vec{u} \in V, \exists \vec{a} \in V: \vec{a} + \vec{u} = \vec{u} + \vec{a} = \nullvec$.\\ + $\vec{a}$ is denoted as $-\vec{u}$. \item Distributive properties: \[ \forall \alpha \in \mathbb{R}, \forall \vec{u}, \vec{w} \in V: \alpha(\vec{u} + \vec{w}) = \alpha \vec{u} + \alpha \vec{w} \] \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha + \beta)\vec{u} = \alpha \vec{u} + \beta \vec{u} \] @@ -24,7 +25,7 @@ A vector space has the following properties: \[ \forall \alpha, \beta \in \mathbb{R}, \forall \vec{u} \in V: (\alpha \beta)\vec{u} = \alpha (\beta \vec{u}) \] \end{enumerate} % -A subset $U \subseteq V$ of a vector space $V$, is a \textbf{subspace} iff $U$ is a vector space. +A subset $U \subseteq V$ of a vector space $V$ is a \textbf{subspace} iff $U$ is a vector space. 
 \marginnote{Subspace}
@@ -95,7 +96,7 @@ The norm of a vector is a function: \marginnote{Vector norm}
 such that for each $\lambda \in \mathbb{R}$ and $\vec{x}, \vec{y} \in \mathbb{R}^n$:
 \begin{itemize}
     \item $\Vert \vec{x} \Vert \geq 0$
-    \item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = 0$
+    \item $\Vert \vec{x} \Vert = 0 \iff \vec{x} = \nullvec$
     \item $\Vert \lambda \vec{x} \Vert = \vert \lambda \vert \cdot \Vert \vec{x} \Vert$
     \item $\Vert \vec{x} + \vec{y} \Vert \leq \Vert \vec{x} \Vert + \Vert \vec{y} \Vert$
 \end{itemize}
@@ -110,7 +111,7 @@ Common norms are:
 \end{descriptionlist}
 %
 In general, different norms tend to maintain the same proportion.
-In some cases, unbalanced results may be given when comparing different norms.
+In some cases, unbalanced results may be obtained when comparing different norms.
 \begin{example}
     Let $\vec{x} = (1, 1000)$ and $\vec{y} = (999, 1000)$. Their norms are:
     \begin{center}
@@ -130,7 +131,7 @@ The norm of a matrix is a function: \marginnote{Matrix norm}
 such that for each $\lambda \in \mathbb{R}$ and $\matr{A}, \matr{B} \in \mathbb{R}^{m \times n}$:
 \begin{itemize}
     \item $\Vert \matr{A} \Vert \geq 0$
-    \item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \bar{0}$
+    \item $\Vert \matr{A} \Vert = 0 \iff \matr{A} = \matr{0}$
     \item $\Vert \lambda \matr{A} \Vert = \vert \lambda \vert \cdot \Vert \matr{A} \Vert$
     \item $\Vert \matr{A} + \matr{B} \Vert \leq \Vert \matr{A} \Vert + \Vert \matr{B} \Vert$
 \end{itemize}
@@ -141,7 +142,7 @@ Common norms are:
         $\Vert \matr{A} \Vert_2 = \sqrt{ \rho(\matr{A}^T\matr{A}) }$,\\
         where $\rho(\matr{X})$ is the largest absolute value of the eigenvalues of $\matr{X}$ (spectral radius).

-    \item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$
+    \item[1-norm] $\Vert \matr{A} \Vert_1 = \max_{1 \leq j \leq n} \sum_{i=1}^{m} \vert a_{i,j} \vert$ (i.e. the maximum absolute column sum)

     \item[Frobenius norm] $\Vert \matr{A} \Vert_F = \sqrt{ \sum_{i=1}^{m} \sum_{j=1}^{n} a_{i,j}^2 }$
 \end{descriptionlist}
@@ -267,7 +268,7 @@ and is found by minimizing the distance between $\pi_U(\vec{x})$ and $\vec{x}$.
 Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$, $\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
-with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if \marginnote{Eigenvector}
+with corresponding eigenvector $\vec{x} \in \mathbb{R}^n \smallsetminus \{ \nullvec \}$ if: \marginnote{Eigenvector}
 \[ \matr{A}\vec{x} = \lambda\vec{x} \]

 It is equivalent to say that:
@@ -295,7 +296,7 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
 \begin{description}
     \item[Eigenspace] \marginnote{Eigenspace}
-        Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated to an eigenvalues $\lambda$.
+        Set of all the eigenvectors of $\matr{A} \in \mathbb{R}^{n \times n}$ associated with an eigenvalue $\lambda$.
         This set is a subspace of $\mathbb{R}^n$.

     \item[Eigenspectrum] \marginnote{Eigenspectrum}

diff --git a/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
index e27a75b..ac54000 100644
--- a/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
+++ b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
@@ -54,7 +54,7 @@ has an unique solution iff one of the following conditions is satisfied:

 The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
 \[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
-However this approach requires to compute the inverse of a matrix, which has a time complexity of $O(n^3)$.
+However, this approach requires computing the inverse of a matrix, which has a time complexity of $O(n^3)$.



@@ -74,21 +74,23 @@ the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} =
     \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix
 \end{itemize}
 %
-As directly solving a system with a triangular matrix has complexity $O(n^2)$ (forward or backward substitutions),
-the system can be decomposed to:
-\begin{equation}
+The system can be decomposed to:
+\[
     \begin{split}
         \matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
                                   & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \vec{b}
     \end{split}
-\end{equation}
+\]
 To find the solution, it is sufficient to solve in order:
 \begin{enumerate}
     \item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
     \item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
 \end{enumerate}
-The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$
+The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.\\
+$O(\frac{n^3}{3})$ is the time complexity of the LU factorization.
+$O(n^2)$ is the complexity of directly solving a system with a triangular matrix (forward or backward substitution).
+ \subsection{Gaussian factorization with pivoting} \marginnote{Gaussian factorization with pivoting} @@ -100,12 +102,12 @@ This is achieved by using a permutation matrix $\matr{P}$, which is obtained as The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$. The system can be decomposed to: -\begin{equation} +\[ \begin{split} \matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\ & \iff \vec{y} = \matr{U}\vec{x} \text{ \& } \matr{L}\vec{y} = \matr{P}\vec{b} \end{split} -\end{equation} +\] An alternative formulation (which is what \texttt{SciPy} uses) is defined as: @@ -132,7 +134,7 @@ The two most common families of iterative methods are: compute the sequence as: \[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \] where $\matr{B}$ is called iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system. - The time complexity per iteration $O(n^2)$. + The time complexity per iteration is $O(n^2)$. \item[Gradient-like methods] \marginnote{Gradient-like methods} have the form: @@ -142,20 +144,20 @@ The two most common families of iterative methods are: \subsection{Stopping criteria} \marginnote{Stopping criteria} -One ore more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite). +One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite). The most common approaches are: \begin{descriptionlist} \item[Residual based] The algorithm is terminated when the current solution is close enough to the exact solution. The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$. - Given a tolerance $\varepsilon$, the algorithm stops when: + Given a tolerance $\varepsilon$, the algorithm may stop when: \begin{itemize} - \item $\Vert \vec{r}_k \Vert \leq \varepsilon$ - \item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ + \item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute) + \item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative) \end{itemize} \item[Update based] - The algorithm is terminated when the change between iterations is very small. + The algorithm is terminated when the difference between iterations is very small. Given a tolerance $\tau$, the algorithm stops when: \[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \] \end{descriptionlist} @@ -183,5 +185,5 @@ Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as: \[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \] A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned} -(i.e. small perturbation on the input causes large changes in the output). +(i.e. a small perturbation of the input causes a large change of the output). Otherwise it is \textbf{well-conditioned}. \marginnote{Well-conditioned} \ No newline at end of file
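
The sketches below accompany the sections touched by the patch; they are illustrative Python/NumPy examples (a tooling choice the notes do not prescribe), not additional hunks. The first one mirrors the cancellation example from _finite_numbers.tex (x = 1, y = 1e-16), using IEEE 754 double precision in place of the generic F(beta, t, L, U) system; all variable names are made up for illustration.

import numpy as np

# IEEE 754 double precision plays the role of F(beta, t, L, U) here.
eps = np.finfo(np.float64).eps        # machine epsilon, about 2.22e-16

# Mirrors the notes' example: y is too small relative to x to survive rounding.
x, y = 1.0, 1e-16
print(x + y == x)                     # True: fl(x + y) = x, the small addend is absorbed

# Cancellation: subtracting nearly equal numbers loses significant digits.
a, b = 1.0 + 1e-8, 1.0
exact = 1e-8
print(abs((a - b) - exact) / exact)   # relative error orders of magnitude above eps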
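The vector norm comparison in _linear_algebra.tex (x = (1, 1000) against y = (999, 1000)) and the matrix norms listed there (2-norm, 1-norm as the maximum absolute column sum, Frobenius norm) map directly onto numpy.linalg.norm; a minimal check with a made-up matrix A:

import numpy as np

x = np.array([1.0, 1000.0])
y = np.array([999.0, 1000.0])
for p in (2, 1, np.inf):
    print(p, np.linalg.norm(x, p), np.linalg.norm(y, p))

A = np.array([[1.0, -2.0],
              [3.0,  4.0]])
print(np.linalg.norm(A, 1))       # maximum absolute column sum: |-2| + |4| = 6
print(np.linalg.norm(A, 2))       # sqrt(rho(A^T A)), the spectral norm
print(np.linalg.norm(A, 'fro'))   # Frobenius norm: sqrt(1 + 4 + 9 + 16)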
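For the orthogonal projection pi_U(x) discussed at the end of the same file (the point of the subspace U closest to x), a sketch that assumes U is given through a basis matrix B whose columns span it:

import numpy as np

# Columns of B form a basis of a 2-dimensional subspace U of R^3 (made-up values).
B = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [0.0, 1.0]])
x = np.array([1.0, 2.0, 3.0])

# pi_U(x) = B (B^T B)^{-1} B^T x, the minimizer of ||u - x||_2 over u in U.
P = B @ np.linalg.inv(B.T @ B) @ B.T
x_proj = P @ x

# The residual x - pi_U(x) is orthogonal to every basis vector of U.
print(np.allclose(B.T @ (x - x_proj), 0.0))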
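The eigenvalue definition patched above (A x = lambda x with x != 0) can be verified numerically; numpy.linalg.eig returns the eigenvalues together with one eigenvector per column, and the characteristic polynomial yields the same eigenvalues:

import numpy as np

A = np.array([[2.0, 1.0],
              [1.0, 2.0]])

eigvals, eigvecs = np.linalg.eig(A)
for lam, v in zip(eigvals, eigvecs.T):       # columns of eigvecs are eigenvectors
    print(lam, np.allclose(A @ v, lam * v))  # checks A v = lambda v

# Eigenvalues are the roots of det(A - lambda I) = 0.
print(np.roots(np.poly(A)))                  # same values, up to ordering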
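_linear_systems.tex contrasts solving A x = b through the inverse (O(n^3) and numerically fragile) with the pivoted LU factorization, and mentions SciPy. A sketch of the SciPy route (scipy.linalg.lu follows the A = P L U convention; lu_factor and lu_solve carry out the forward and backward substitutions), on a made-up test system:

import numpy as np
from scipy.linalg import lu, lu_factor, lu_solve

rng = np.random.default_rng(0)
A = rng.standard_normal((4, 4))
b = rng.standard_normal(4)

# Gaussian factorization with partial pivoting.
P, L, U = lu(A)
print(np.allclose(A, P @ L @ U))

# Solve A x = b through the factorization instead of forming the inverse.
lu_piv = lu_factor(A)               # compact LU plus pivot indices
x = lu_solve(lu_piv, b)
print(np.allclose(A @ x, b))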
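A stationary iterative method x_k = B x_{k-1} + d together with the two stopping criteria from the notes (relative residual and update norm) can be sketched as follows; the Jacobi splitting B = I - D^{-1} A is one concrete choice of iteration matrix, not the only one the notes allow:

import numpy as np

def jacobi(A, b, tol=1e-10, max_iter=10_000):
    """Stationary iteration x_k = B x_{k-1} + d with the Jacobi splitting."""
    D = np.diag(np.diag(A))
    B = np.eye(A.shape[0]) - np.linalg.solve(D, A)   # B = I - D^{-1} A
    d = np.linalg.solve(D, b)                        # d = D^{-1} b
    x = np.zeros_like(b)
    for k in range(max_iter):
        x_new = B @ x + d
        residual = np.linalg.norm(b - A @ x_new) / np.linalg.norm(b)  # residual-based
        update = np.linalg.norm(x_new - x)                            # update-based
        x = x_new
        if residual <= tol or update <= tol:
            break
    return x, k + 1

# Strictly diagonally dominant matrix, so the Jacobi iteration converges.
A = np.array([[4.0, 1.0, 0.0],
              [1.0, 5.0, 2.0],
              [0.0, 2.0, 6.0]])
b = np.array([1.0, 2.0, 3.0])
x, iterations = jacobi(A, b)
print(iterations, np.allclose(A @ x, b))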
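Finally, the condition number K(A) = ||A|| * ||A^{-1}|| and its meaning (a small perturbation of the input causing a large change in the output) can be observed on a nearly singular matrix; the values below are made up for illustration:

import numpy as np

A = np.array([[1.0, 1.0],
              [1.0, 1.0001]])        # nearly singular, hence ill-conditioned

# K(A) = ||A|| * ||A^{-1}||, here in the 2-norm; np.linalg.cond computes the same value.
K = np.linalg.norm(A, 2) * np.linalg.norm(np.linalg.inv(A), 2)
print(K, np.linalg.cond(A, 2))

# A tiny perturbation of b produces a large relative change in the solution x.
b = np.array([2.0, 2.0001])
x = np.linalg.solve(A, b)
x_pert = np.linalg.solve(A, b + np.array([0.0, 1e-5]))
print(np.linalg.norm(x_pert - x) / np.linalg.norm(x))   # about 0.1 for a 1e-5 perturbation of b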