From 8e85303b9470f7d3a3abac17352a16304fac912e Mon Sep 17 00:00:00 2001
From: NotXia <35894453+NotXia@users.noreply.github.com>
Date: Sat, 23 Sep 2023 19:52:51 +0200
Subject: [PATCH] Add SMM linear systems

---
 .../main.tex                     |   3 +-
 .../sections/_linear_systems.tex | 187 ++++++++++++++++++
 2 files changed, 189 insertions(+), 1 deletion(-)
 create mode 100644 statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex

diff --git a/statistical-and-mathematical-methods-for-ai/main.tex b/statistical-and-mathematical-methods-for-ai/main.tex
index 1f6ea18..5599f2e 100644
--- a/statistical-and-mathematical-methods-for-ai/main.tex
+++ b/statistical-and-mathematical-methods-for-ai/main.tex
@@ -20,7 +20,8 @@ linktoc=all
 }
 
-\setlist[description]{labelindent=\parindent} % Indents `description`
+\setlist[description]{labelindent=1em} % Indents `description`
+\setlength{\parindent}{0pt}
 
 \renewcommand*{\marginfont}{\color{gray}\footnotesize}
 
 \theoremstyle{definition}
diff --git a/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
new file mode 100644
index 0000000..8220289
--- /dev/null
+++ b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
@@ -0,0 +1,187 @@
+\section{Linear systems}
+
+A linear system:
+\begin{equation*}
+    \begin{cases}
+        a_{1,1}x_1 + a_{1,2}x_2 + \dots + a_{1,n}x_n = b_1\\
+        a_{2,1}x_1 + a_{2,2}x_2 + \dots + a_{2,n}x_n = b_2\\
+        \hspace*{7em} \vdots \\
+        a_{m,1}x_1 + a_{m,2}x_2 + \dots + a_{m,n}x_n = b_m\\
+    \end{cases}
+\end{equation*}
+can be represented as:
+\[ \matr{A}\vec{x} = \vec{b} \]
+where:
+\[
+    \matr{A} =
+    \begin{pmatrix}
+        a_{1,1} & a_{1,2} & \cdots & a_{1,n} \\
+        a_{2,1} & a_{2,2} & \cdots & a_{2,n} \\
+        \vdots  & \vdots  & \ddots & \vdots  \\
+        a_{m,1} & a_{m,2} & \cdots & a_{m,n}
+    \end{pmatrix} \in \mathbb{R}^{m \times n}
+    \hspace*{2em}
+    %
+    \vec{x} =
+    \begin{pmatrix}
+        x_1 \\
+        x_2 \\
+        \vdots \\
+        x_n
+    \end{pmatrix} \in \mathbb{R}^n
+    \hspace*{2em}
+    %
+    \vec{b} =
+    \begin{pmatrix}
+        b_1 \\
+        b_2 \\
+        \vdots \\
+        b_m
+    \end{pmatrix} \in \mathbb{R}^m
+\]
+
+
+
+\subsection{Square linear systems}
+\marginnote{Square linear system}
+A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$
+has a unique solution iff any of the following equivalent conditions holds:
+\begin{enumerate}
+    \item $\matr{A}$ is non-singular (invertible)
+    \item $\text{rank}(\matr{A}) = n$ (full rank)
+    \item $\matr{A}\vec{x} = \nullvec$ admits only the solution $\vec{x} = \nullvec$
+\end{enumerate}
+
+The solution can be algebraically determined as \marginnote{Algebraic solution to linear systems}
+\[ \matr{A}\vec{x} = \vec{b} \iff \vec{x} = \matr{A}^{-1}\vec{b} \]
+However, this approach requires computing the inverse of a matrix, which has time complexity $O(n^3)$ and is numerically less accurate than solving the system directly.
+
+
+
+\subsection{Direct methods}
+\marginnote{Direct methods}
+Direct methods compute the solution of a linear system in a finite number of steps.
+Compared to iterative methods, they are more precise but more computationally expensive.
+
+The most common approach consists of factorizing the matrix $\matr{A}$.
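+For instance, \texttt{NumPy}'s \texttt{linalg.solve} follows this strategy (it relies on a factorization rather than on $\matr{A}^{-1}$).
+A minimal sketch, with arbitrary illustrative values:
+\begin{verbatim}
+import numpy as np
+
+A = np.array([[4.0, 1.0],
+              [2.0, 3.0]])
+b = np.array([5.0, 5.0])
+
+x = np.linalg.solve(A, b)     # factorization-based direct solve (preferred)
+x_inv = np.linalg.inv(A) @ b  # explicit inverse: slower and less accurate
+\end{verbatim}
+Both compute $\vec{x} = (1, 1)^T$, but only the former avoids building $\matr{A}^{-1}$.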
+
+\subsubsection{Gaussian factorization}
+\marginnote{Gaussian factorization\\(LU decomposition)}
+Given a square linear system $\matr{A}\vec{x} = \vec{b}$,
+the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
+\begin{itemize}
+    \item $\matr{L} \in \mathbb{R}^{n \times n}$ is a lower triangular matrix
+    \item $\matr{U} \in \mathbb{R}^{n \times n}$ is an upper triangular matrix
+\end{itemize}
+%
+As directly solving a system with a triangular matrix has complexity $O(n^2)$ (forward or backward substitution),
+the system can be decomposed into:
+\begin{equation}
+    \begin{split}
+        \matr{A}\vec{x} = \vec{b} & \iff \matr{LU}\vec{x} = \vec{b} \\
+        & \iff \vec{y} = \matr{U}\vec{x} \text{ and } \matr{L}\vec{y} = \vec{b}
+    \end{split}
+\end{equation}
+To find the solution, it is sufficient to solve, in order:
+\begin{enumerate}
+    \item $\matr{L}\vec{y} = \vec{b}$ (solved w.r.t. $\vec{y}$)
+    \item $\vec{y} = \matr{U}\vec{x}$ (solved w.r.t. $\vec{x}$)
+\end{enumerate}
+
+The factorization costs about $\frac{n^3}{3}$ operations and each triangular solve costs $O(n^2)$, so the overall complexity is $O(n^3)$.
+
+\subsubsection{Gaussian factorization with pivoting}
+\marginnote{Gaussian factorization with pivoting}
+During the computation of $\matr{A} = \matr{L}\matr{U}$
+(using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
+a division by zero may occur.
+A method to prevent this problem (and to improve numerical stability) is to change the order of the rows of $\matr{A}$ before decomposing it.
+This is achieved by using a permutation matrix $\matr{P}$, obtained by permuting the rows of the identity matrix.
+
+The permuted system becomes $\matr{P}\matr{A}\vec{x} = \matr{P}\vec{b}$ and the factorization is obtained as $\matr{P}\matr{A} = \matr{L}\matr{U}$.
+The system can be decomposed into:
+\begin{equation}
+    \begin{split}
+        \matr{P}\matr{A}\vec{x} = \matr{P}\vec{b} & \iff \matr{L}\matr{U}\vec{x} = \matr{P}\vec{b} \\
+        & \iff \vec{y} = \matr{U}\vec{x} \text{ and } \matr{L}\vec{y} = \matr{P}\vec{b}
+    \end{split}
+\end{equation}
+
+An alternative formulation (the one \texttt{SciPy} uses)
+is defined as:
+\[ \matr{A} = \matr{P}\matr{L}\matr{U} \iff \matr{P}^T\matr{A} = \matr{L}\matr{U} \]
+Note that $\matr{P}$ is orthogonal, so $\matr{P}^T = \matr{P}^{-1}$.
+The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can be found as above.
+
+
+
+\subsection{Iterative methods}
+\marginnote{Iterative methods}
+Iterative methods solve a linear system by computing a sequence that converges to the exact solution.
+Compared to direct methods, they are less precise but computationally faster and better suited for large systems.
+
+The overall idea is to build a sequence of vectors $\vec{x}_k$
+that converges to the exact solution $\vec{x}^*$:
+\[ \lim_{k \rightarrow \infty} \vec{x}_k = \vec{x}^* \]
+Generally, the first vector $\vec{x}_0$ is given (or guessed). Subsequent vectors are computed from the previous iteration
+as $\vec{x}_k = g(\vec{x}_{k-1})$.
+
+The two most common families of iterative methods are:
+\begin{description}
+    \item[Stationary methods] \marginnote{Stationary methods}
+    compute the sequence as:
+    \[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
+    where $\matr{B}$ is called the iteration matrix and $\vec{d}$ is computed from the $\vec{b}$ vector of the system.
+    The time complexity per iteration is $O(n^2)$.
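+    A classical instance is the \textbf{Jacobi method}: splitting $\matr{A} = \matr{D} + \matr{R}$, where $\matr{D}$ contains the diagonal of $\matr{A}$ and $\matr{R}$ the off-diagonal entries, it iterates:
+    \[ \vec{x}_k = \matr{D}^{-1}(\vec{b} - \matr{R}\vec{x}_{k-1}) \]
+    which corresponds to $\matr{B} = -\matr{D}^{-1}\matr{R}$ and $\vec{d} = \matr{D}^{-1}\vec{b}$.
+    A sufficient condition for convergence (for any $\vec{x}_0$) is that $\matr{A}$ is strictly diagonally dominant.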
+
+    \item[Gradient-like methods] \marginnote{Gradient-like methods}
+    have the form:
+    \[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
+    where $\alpha_{k-1} \in \mathbb{R}$ is the step size and the vector $\vec{p}_{k-1}$ is called the direction.
+\end{description}
+
+\subsubsection{Stopping criteria}
+\marginnote{Stopping criteria}
+One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
+The most common approaches are:
+\begin{description}
+    \item[Residual based]
+    The algorithm is terminated when the current solution is close enough to the exact solution.
+    The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
+    Given a tolerance $\varepsilon$, the algorithm stops when:
+    \begin{itemize}
+        \item $\Vert \vec{r}_k \Vert \leq \varepsilon$ (absolute criterion)
+        \item $\frac{\Vert \vec{r}_k \Vert}{\Vert \vec{b} \Vert} \leq \varepsilon$ (relative criterion)
+    \end{itemize}
+
+    \item[Update based]
+    The algorithm is terminated when the change between iterations is very small.
+    Given a tolerance $\tau$, the algorithm stops when:
+    \[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
+\end{description}
+As the sequence is truncated, iterative methods inevitably introduce a truncation error.
+
+
+
+\subsection{Condition number}
+The inherent error causes inaccuracies during the resolution of a system.
+This problem is independent of the algorithm and is studied assuming exact arithmetic.
+
+Given a system $\matr{A}\vec{x} = \vec{b}$, we perturb $\matr{A}$ and/or $\vec{b}$ and study the resulting error.
+For instance, if we perturb $\vec{b}$, we obtain the following system:
+\[ \matr{A}\tilde{\vec{x}} = \vec{b} + \Delta\vec{b} \]
+After finding $\tilde{\vec{x}}$, we can compute the inherent error as $\Delta\vec{x} = \tilde{\vec{x}} - \vec{x}$.
+
+By comparing the relative errors $\frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert}$ and $\frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}$,
+we can measure the error introduced by the perturbation.
+It can be shown that:
+\[
+    \frac{\Vert \Delta\vec{x} \Vert}{\Vert \vec{x} \Vert} \leq
+    \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \cdot \frac{\Vert \Delta\vec{b} \Vert}{\Vert \vec{b} \Vert}
+\]
+Finally, we can define the \textbf{condition number} of a matrix $\matr{A}$ as: \marginnote{Condition number}
+\[ K(\matr{A}) = \Vert \matr{A} \Vert \cdot \Vert \matr{A}^{-1} \Vert \]
+
+A system is \textbf{ill-conditioned} if $K(\matr{A})$ is large \marginnote{Ill-conditioned}
+(i.e. small perturbations of the input cause large changes in the output).
+Otherwise, it is \textbf{well-conditioned}. \marginnote{Well-conditioned}
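+
+As a concrete illustration (values chosen arbitrarily to make the effect visible), consider, in the $\infty$-norm:
+\[
+    \matr{A} =
+    \begin{pmatrix}
+        1 & 1 \\
+        1 & 1.0001
+    \end{pmatrix}
+    \hspace*{2em}
+    \matr{A}^{-1} =
+    \begin{pmatrix}
+        10001 & -10000 \\
+        -10000 & 10000
+    \end{pmatrix}
+\]
+so that $K_\infty(\matr{A}) = 2.0001 \cdot 20001 \approx 4 \cdot 10^4$.
+With $\vec{b} = (2, 2.0001)^T$, the solution is $\vec{x} = (1, 1)^T$;
+perturbing $\vec{b}$ to $(2, 2.0002)^T$ yields $\tilde{\vec{x}} = (0, 2)^T$.
+A relative perturbation of about $5 \cdot 10^{-5}$ on $\vec{b}$ thus produces a relative error of $1$ on $\vec{x}$, in accordance with the bound above.
\ No newline at end of file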