From 40090bfa771c16c8bf174291972ed7f22b250bc5 Mon Sep 17 00:00:00 2001
From: NotXia <35894453+NotXia@users.noreply.github.com>
Date: Sun, 24 Sep 2023 11:48:01 +0200
Subject: [PATCH] Update document style

---
 .../main.tex                     | 26 +++++++-------
 .../sections/_finite_numbers.tex | 36 +++++++++----------
 .../sections/_linear_algebra.tex | 34 +++++++++---------
 .../sections/_linear_systems.tex | 24 ++++++-------
 4 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/statistical-and-mathematical-methods-for-ai/main.tex b/statistical-and-mathematical-methods-for-ai/main.tex
index 952e5b8..7f7d4c0 100644
--- a/statistical-and-mathematical-methods-for-ai/main.tex
+++ b/statistical-and-mathematical-methods-for-ai/main.tex
@@ -1,5 +1,5 @@
-\documentclass[11pt]{article}
-\usepackage[margin=3cm, lmargin=2cm, rmargin=4cm, marginparwidth=3cm]{geometry}
+\documentclass[11pt]{scrreprt}
+\usepackage{geometry}
 \usepackage{graphicx, xcolor}
 \usepackage{amsmath, amsfonts, amssymb, amsthm, mathtools, bm}
 \usepackage{hyperref}
@@ -7,20 +7,16 @@
 \usepackage[all]{hypcap} % Links hyperref to object top and not caption
 \usepackage[inline]{enumitem}
 \usepackage{marginnote}
+\usepackage[bottom]{footmisc}
 
-\title{Statistical and Mathematical Methods for Artificial Intelligence}
-\date{2023 -- 2024}
+\geometry{ margin=3cm, lmargin=2cm, rmargin=4cm, marginparwidth=3cm }
+\hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black, linktoc=all }
 
-\hypersetup{
-    colorlinks,
-    citecolor=black,
-    filecolor=black,
-    linkcolor=black,
-    urlcolor=black,
-    linktoc=all
+\NewDocumentEnvironment{descriptionlist}{}{%
+    \begin{description}[labelindent=1em]
+}{
+    \end{description}%
 }
-
-\setlist[description]{labelindent=1em} % Indents `description`
 \setlength{\parindent}{0pt}
 \renewcommand*{\marginfont}{\color{gray}\footnotesize}
@@ -36,6 +32,10 @@
 \newcommand{\matr}[1]{{\bm{#1}}}
 
+
+\title{Statistical and Mathematical Methods for Artificial Intelligence}
+\date{2023 -- 2024}
+
 \begin{document}
 \newgeometry{margin=3cm}
 \makeatletter
diff --git a/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex b/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
index 51c9bdc..1c33f8f 100644
--- a/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
+++ b/statistical-and-mathematical-methods-for-ai/sections/_finite_numbers.tex
@@ -1,8 +1,8 @@
-\section{Finite numbers}
+\chapter{Finite numbers}
 
 
 
-\subsection{Sources of error}
+\section{Sources of error}
 \begin{description}
     \item[Measurement error] \marginnote{Measurement error}
@@ -25,10 +25,10 @@
 
 
-\subsection{Error measurement}
+\section{Error measurement}
 Let $x$ be a value and $\hat{x}$ its approximation. Then:
-\begin{description}
+\begin{descriptionlist}
     \item[Absolute error]
     \begin{equation}
         E_{a} = \hat{x} - x
@@ -40,11 +40,11 @@ Let $x$ be a value and $\hat{x}$ its approximation. Then:
         E_{r} = \frac{\hat{x} - x}{x}
         \marginnote{Relative error}
     \end{equation}
-\end{description}
+\end{descriptionlist}
 
 
 
-\subsection{Representation in base \texorpdfstring{$\beta$}{B}}
+\section{Representation in base \texorpdfstring{$\beta$}{B}}
 Let $\beta \in \mathbb{N}_{> 1}$ be the base.
 Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
@@ -66,7 +66,7 @@ where $0.d_1d_2\dots$ is the \textbf{mantissa} and $\beta^p$ the \textbf{exponen
 
 
-\subsection{Floating-point}
+\section{Floating-point}
 A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the parameters: \marginnote{Floating-point}
 \begin{itemize}
     \item $\beta$: base
@@ -86,7 +86,7 @@ Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized fo
 \end{example}
 
 
-\subsubsection{Numbers distribution}
+\subsection{Numbers distribution}
 Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the total number of representable values is:
 \begin{equation*}
     2(\beta-1) \beta^{t-1} (U-L+1)+1
 \end{equation*}
 Note that there is an underflow region around 0.
 \end{figure}
 
@@ -101,9 +101,9 @@
 
-\subsubsection{Numbers representation}
+\subsection{Numbers representation}
 Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
-\begin{description}
+\begin{descriptionlist}
     \item[Exact representation] if $p \in [L, U]$ and $d_i=0$ for $i>t$.
@@ -117,16 +117,16 @@ Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation
     \item[Overflow] if $p > U$. In this case, an exception is usually raised.
-\end{description}
+\end{descriptionlist}
 
 
-\subsubsection{Machine precision}
+\subsection{Machine precision}
 Machine precision $\varepsilon_{\text{mach}}$ determines the accuracy of a floating-point system. \marginnote{Machine precision}
 Depending on the approximation approach, machine precision can be computed as:
-\begin{description}
+\begin{descriptionlist}
     \item[Truncation] $\varepsilon_{\text{mach}} = \beta^{1-t}$
     \item[Rounding] $\varepsilon_{\text{mach}} = \frac{1}{2}\beta^{1-t}$
-\end{description}
+\end{descriptionlist}
 Therefore, rounding results in more accurate representations.
 $\varepsilon_{\text{mach}}$ is the smallest distance among the representable numbers (\Cref{fig:finnum_eps}).
@@ -143,9 +143,9 @@ In alternative, $\varepsilon_{\text{mach}}$ can be defined as the smallest repre
 \end{equation*}
 
 
-\subsubsection{IEEE standard}
+\subsection{IEEE standard}
 IEEE 754 defines two floating-point formats:
-\begin{description}
+\begin{descriptionlist}
     \item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -128, 127)$. \marginnote{float32}
     \begin{center} \small
@@ -165,12 +165,12 @@ IEEE 754 defines two floating-point formats:
     \hline
     \end{tabular}
     \end{center}
-\end{description}
+\end{descriptionlist}
 
 As the first digit of the mantissa is always 1, it does not need to be stored.
 Moreover, special configurations are reserved to represent \texttt{Inf} and \texttt{NaN}.
 
 
-\subsubsection{Floating-point arithmetic}
+\subsection{Floating-point arithmetic}
 Let:
 \begin{itemize}
     \item $+: \mathbb{R} \times \mathbb{R} \rightarrow \mathbb{R}$ be an operation on real numbers.
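The machine-precision definitions in the hunk above lend themselves to a quick numerical check. The following is a minimal sketch (Python with NumPy assumed; not code from the notes, names are illustrative) that finds the smallest power of two still distinguishable from 1 in float64:

    import numpy as np

    # Smallest power of two eps with fl(1 + eps) > 1: below this
    # threshold, 1.0 + eps rounds back to 1.0 and the loop stops.
    eps = 1.0
    while 1.0 + eps / 2.0 > 1.0:
        eps /= 2.0

    print(eps)                       # 2.220446049250313e-16 = 2**(-52)
    print(np.finfo(np.float64).eps)  # NumPy reports the same spacing at 1.0

The value $2^{-52}$ is $\beta^{1-t}$ with $\beta = 2$ and $t = 53$ (the float64 parameters), i.e. the truncation formula above; the rounding variant is half of that.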
diff --git a/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex b/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
index 7671c99..b24506f 100644
--- a/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
+++ b/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
@@ -1,7 +1,7 @@
-\section{Linear algebra}
+\chapter{Linear algebra}
 
 
-\subsection{Vector space}
+\section{Vector space}
 A \textbf{vector space} over $\mathbb{R}$ is a nonempty set $V$, whose elements are called vectors, with two operations: \marginnote{Vector space}
@@ -28,7 +28,7 @@ A subset $U \subseteq V$ of a vector space $V$, is a \textbf{subspace} iff $U$ i
 \marginnote{Subspace}
 
 
-\subsubsection{Basis}
+\subsection{Basis}
 \marginnote{Basis}
 Let $V$ be a vector space of dimension $n$.
 A basis $\beta = \{ \vec{v}_1, \dots, \vec{v}_n \}$ of $V$ is a set of $n$ linearly independent vectors of $V$.\\
@@ -41,7 +41,7 @@ The canonical basis of a vector space is a basis where each vector represents a
     The canonical basis $\beta$ of $\mathbb{R}^3$ is $\beta = \{ (1, 0, 0), (0, 1, 0), (0, 0, 1) \}$
 \end{example}
 
-\subsubsection{Dot product}
+\subsection{Dot product}
 The dot product of two vectors $\vec{x}, \vec{y} \in \mathbb{R}^n$ is defined as: \marginnote{Dot product}
 \begin{equation*}
     \left\langle \vec{x}, \vec{y} \right\rangle = \vec{x}^T \vec{y} = \sum_{i=1}^{n} x_i y_i
 \end{equation*}
 
 
-\subsection{Matrix}
+\section{Matrix}
 This is a {\tiny(very formal definition of)} matrix: \marginnote{Matrix}
 \begin{equation*}
     \matr{A} =
     \begin{pmatrix}
         a_{11} & \dots  & a_{1n} \\
         \vdots & \ddots & \vdots \\
         a_{m1} & \dots  & a_{mn}
     \end{pmatrix}
 \end{equation*}
 
-\subsubsection{Invertible matrix}
+\subsection{Invertible matrix}
 A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is invertible (non-singular) if: \marginnote{Non-singular matrix}
 \begin{equation*}
     \exists \matr{B} \in \mathbb{R}^{n \times n}: \matr{AB} = \matr{BA} = \matr{I}
 \end{equation*}
 where $\matr{I}$ is the identity matrix. $\matr{B}$ is denoted as $\matr{A}^{-1}$.
 
-\subsubsection{Kernel}
+\subsection{Kernel}
 The null space (kernel) of a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ is a subspace such that: \marginnote{Kernel}
 \begin{equation*}
     \text{Ker}(\matr{A}) = \{ \vec{x} \in \mathbb{R}^n : \matr{A}\vec{x} = \nullvec \}
 \end{equation*}
 \begin{theorem}
     A square matrix $\matr{A}$ with $\text{\normalfont Ker}(\matr{A}) = \{\nullvec\}$ is non-singular.
 \end{theorem}
 
-\subsubsection{Similar matrices} \marginnote{Similar matrices}
+\subsection{Similar matrices} \marginnote{Similar matrices}
 Two matrices $\matr{A}$ and $\matr{D}$ are \textbf{similar} if there exists an invertible matrix $\matr{P}$ such that:
 \[ \matr{D} = \matr{P}^{-1} \matr{A} \matr{P} \]
 
 
-\subsection{Norms}
+\section{Norms}
 
-\subsubsection{Vector norms}
+\subsection{Vector norms}
 The norm of a vector is a function: \marginnote{Vector norm}
 \begin{equation*}
     \Vert \cdot \Vert: \mathbb{R}^n \rightarrow \mathbb{R}
 \end{equation*}
@@ -122,7 +122,7 @@ In some cases, unbalanced results may be given when comparing different norms.
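+% Illustrative values (an added example, not from the original notes):
+For instance, with $\vec{x} = (1, 1)^T$ and $\vec{y} = (1.5, 0)^T$,
+$\Vert \vec{x} \Vert_1 = 2 > 1.5 = \Vert \vec{y} \Vert_1$
+but $\Vert \vec{x} \Vert_\infty = 1 < 1.5 = \Vert \vec{y} \Vert_\infty$:
+the 1-norm ranks $\vec{x}$ as the larger vector, while the $\infty$-norm ranks $\vec{y}$ larger.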
 \end{example}
 
 
-\subsubsection{Matrix norms}
+\subsection{Matrix norms}
 The norm of a matrix is a function: \marginnote{Matrix norm}
 \begin{equation*}
     \Vert \cdot \Vert: \mathbb{R}^{m \times n} \rightarrow \mathbb{R}
 \end{equation*}
@@ -148,7 +148,7 @@ Common norms are:
 
 
-\subsection{Symmetric, positive definite matrices}
+\section{Symmetric, positive definite matrices}
 \begin{description}
     \item[Symmetric matrix] \marginnote{Symmetric matrix}
@@ -176,7 +176,7 @@ Common norms are:
 
 
-\subsection{Orthogonality}
+\section{Orthogonality}
 \begin{description}
     \item[Angle between vectors] \marginnote{Angle between vectors}
     The angle $\omega$ between two vectors $\vec{x}$ and $\vec{y}$ can be obtained from:
@@ -239,7 +239,7 @@ Common norms are:
 
 
-\subsection{Projections}
+\section{Projections}
 Projections are methods to map high-dimensional data into a lower-dimensional space while minimizing the compression loss.\\
 \marginnote{Orthogonal projection}
@@ -250,7 +250,7 @@ In other words, applying $\pi$ multiple times gives the same result (i.e. idempo
 $\pi$ can be expressed as a transformation matrix $\matr{P}_\pi$ such that:
 \[ \matr{P}_\pi^2 = \matr{P}_\pi \]
 
-\subsubsection{Projection onto general subspaces} \marginnote{Projection onto subspace basis}
+\subsection{Projection onto general subspaces} \marginnote{Projection onto subspace basis}
 To project a vector $\vec{x} \in \mathbb{R}^n$ into a lower-dimensional subspace $U \subseteq \mathbb{R}^n$, it is possible to use the basis of $U$.\\
 %
@@ -263,7 +263,7 @@ and is found by minimizing the distance between $\pi_U(\vec{x})$ and $\vec{x}$.
 
 
-\subsection{Eigenvectors and eigenvalues}
+\section{Eigenvectors and eigenvalues}
 Given a square matrix $\matr{A} \in \mathbb{R}^{n \times n}$, $\lambda \in \mathbb{C}$ is an eigenvalue of $\matr{A}$ \marginnote{Eigenvalue}
@@ -328,7 +328,7 @@ we can prove that $\forall c \in \mathbb{R} \smallsetminus \{0\}:$ $c\vec{x}$ is
 \end{theorem}
 
 
-\subsubsection{Diagonalizability}
+\subsection{Diagonalizability}
 \marginnote{Diagonalizable matrix}
 A matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is diagonalizable if it is similar to a diagonal matrix $\matr{D} \in \mathbb{R}^{n \times n}$:
 \[ \exists \matr{P} \in \mathbb{R}^{n \times n} \text{ s.t. } \matr{P} \text{ invertible and } \matr{D} = \matr{P}^{-1}\matr{A}\matr{P} \]
diff --git a/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
index 8220289..e27a75b 100644
--- a/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
+++ b/statistical-and-mathematical-methods-for-ai/sections/_linear_systems.tex
@@ -1,4 +1,4 @@
-\section{Linear systems}
+\chapter{Linear systems}
 A linear system:
 \begin{equation*}
@@ -42,7 +42,7 @@ where:
 
 
-\subsection{Square linear systems}
+\section{Square linear systems}
 \marginnote{Square linear system}
 A square linear system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{n \times n}$ and $\vec{x}, \vec{b} \in \mathbb{R}^n$ has a unique solution iff one of the following conditions is satisfied:
@@ -58,14 +58,14 @@ However this approach requires to compute the inverse of a matrix, which has a t
 
 
-\subsection{Direct methods}
+\section{Direct methods}
 \marginnote{Direct methods}
 Direct methods compute the solution of a linear system in a finite number of steps.
 Compared to iterative methods, they are more precise but more expensive.
 The most common approach is to factorize the matrix $\matr{A}$.
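The factor-and-substitute workflow of the next two subsections can be sketched in a few lines. This is a minimal illustration, not code from the notes: SciPy is assumed, the 3x3 system is made up, and scipy.linalg.lu_factor applies the partial pivoting discussed below:

    import numpy as np
    from scipy.linalg import lu_factor, lu_solve

    # A made-up non-singular system A x = b.
    A = np.array([[4.0, 3.0, 0.0],
                  [6.0, 3.0, 1.0],
                  [0.0, 1.0, 5.0]])
    b = np.array([1.0, 2.0, 3.0])

    lu, piv = lu_factor(A)      # LU factorization: O(n^3/3) flops
    x = lu_solve((lu, piv), b)  # forward + backward substitution: O(n^2) each

    print(np.allclose(A @ x, b))  # True

Once the factorization is available, it can be reused for several right-hand sides at only $O(n^2)$ per solve.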
-\subsubsection{Gaussian factorization}
+\subsection{Gaussian factorization}
 \marginnote{Gaussian factorization\\(LU decomposition)}
 Given a square linear system $\matr{A}\vec{x} = \vec{b}$, the matrix $\matr{A} \in \mathbb{R}^{n \times n}$ is factorized into $\matr{A} = \matr{L}\matr{U}$ such that:
@@ -90,7 +90,7 @@ To find the solution, it is sufficient to solve in order:
 The overall complexity is $O(\frac{n^3}{3}) + 2 \cdot O(n^2) = O(\frac{n^3}{3})$.
 
 
-\subsubsection{Gaussian factorization with pivoting}
+\subsection{Gaussian factorization with pivoting}
 \marginnote{Gaussian factorization with pivoting}
 During the computation of $\matr{A} = \matr{L}\matr{U}$ (using Gaussian elimination\footnote{\url{https://en.wikipedia.org/wiki/LU\_decomposition\#Using\_Gaussian\_elimination}}),
@@ -115,7 +115,7 @@ The solution to the system ($\matr{P}^T\matr{A}\vec{x} = \matr{P}^T\vec{b}$) can
 
 
-\subsection{Iterative methods}
+\section{Iterative methods}
 \marginnote{Iterative methods}
 Iterative methods solve a linear system by computing a sequence that converges to the exact solution.
 Compared to direct methods, they are less precise but computationally faster and better suited to large systems.
@@ -127,7 +127,7 @@ Generally, the first vector $\vec{x}_0$ is given (or guessed). Subsequent vector
 as $\vec{x}_k = g(\vec{x}_{k-1})$.
 The two most common families of iterative methods are:
-\begin{description}
+\begin{descriptionlist}
     \item[Stationary methods] \marginnote{Stationary methods}
     compute the sequence as:
     \[ \vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d} \]
@@ -138,13 +138,13 @@ The two most common families of iterative methods are:
     have the form:
     \[ \vec{x}_k = \vec{x}_{k-1} + \alpha_{k-1}\vec{p}_{k-1} \]
     where $\alpha_{k-1} \in \mathbb{R}$ and the vector $\vec{p}_{k-1}$ is called the direction.
-\end{description}
+\end{descriptionlist}
 
 
-\subsubsection{Stopping criteria}
+\subsection{Stopping criteria}
 \marginnote{Stopping criteria}
 One or more stopping criteria are needed to determine when to truncate the sequence (as it is theoretically infinite).
 The most common approaches are:
-\begin{description}
+\begin{descriptionlist}
     \item[Residual based]
     The algorithm is terminated when the current solution is close enough to the exact solution.
     The residual at iteration $k$ is computed as $\vec{r}_k = \vec{b} - \matr{A}\vec{x}_k$.
@@ -158,12 +158,12 @@ The most common approaches are:
     The algorithm is terminated when the change between iterations is very small.
     Given a tolerance $\tau$, the algorithm stops when:
     \[ \Vert \vec{x}_{k} - \vec{x}_{k-1} \Vert \leq \tau \]
-\end{description}
+\end{descriptionlist}
 
 Since the sequence is truncated, iterative methods inevitably introduce a truncation error.
 
 
-\subsection{Condition number}
+\section{Condition number}
 Inherent error causes inaccuracies when solving a system.
 This problem is independent of the algorithm and is estimated using exact arithmetic.
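To tie the iterative-methods material together, here is a minimal sketch of a stationary scheme with both stopping criteria. The notes do not commit to a specific method; Jacobi is used below as a standard instance of $\vec{x}_k = \matr{B}\vec{x}_{k-1} + \vec{d}$, and the test system is made up:

    import numpy as np

    def jacobi(A, b, x0, tol=1e-10, max_iter=500):
        """Stationary iteration x_k = B x_{k-1} + d with the Jacobi
        splitting B = -D^(-1) (A - D), d = D^(-1) b, where D = diag(A)."""
        D = np.diag(A)          # diagonal entries of A
        R = A - np.diagflat(D)  # off-diagonal part of A
        x = np.asarray(x0, dtype=float)
        for _ in range(max_iter):
            x_new = (b - R @ x) / D
            # Residual-based stop: x_new already solves the system well enough.
            if np.linalg.norm(b - A @ x_new) <= tol:
                return x_new
            # Increment-based stop: the iterates barely change any more.
            if np.linalg.norm(x_new - x) <= tol:
                return x_new
            x = x_new
        return x

    # Strictly diagonally dominant matrix, so the Jacobi iteration converges.
    A = np.array([[4.0, 1.0],
                  [2.0, 5.0]])
    b = np.array([1.0, 2.0])
    print(jacobi(A, b, np.zeros(2)))  # ~[0.16666667, 0.33333333]

Both tests use a single tolerance $\tau$ for brevity; in practice the residual test is often taken relative, e.g. $\Vert \vec{r}_k \Vert / \Vert \vec{b} \Vert \leq \tau$, with $\vec{r}_k$ as defined above.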