\section{Finite numbers}

\subsection{Sources of error}
\begin{description}
    \item[Measurement error] \marginnote{Measurement error} Limited precision of the measuring instrument.
    \item[Arithmetic error] \marginnote{Arithmetic error} Propagation of rounding errors through the steps of an algorithm.
    \item[Truncation error] \marginnote{Truncation error} Approximation of an infinite procedure by a finite number of iterations.
    \item[Inherent error] \marginnote{Inherent error} Caused by the finite representation of the data (floating-point).
    \begin{figure}[h]
        \centering
        \includegraphics[width=0.6\textwidth]{img/_inherent_error.pdf}
        \caption{Inherent error visualization}
    \end{figure}
\end{description}

\subsection{Error measurement}
Let $x$ be a value and $\hat{x}$ its approximation. Then:
\begin{description}
    \item[Absolute error]
    \begin{equation}
        E_a = \hat{x} - x \marginnote{Absolute error}
    \end{equation}
    Note that, without knowing the magnitude of $x$, the absolute error is meaningless.
    \item[Relative error] (for $x \neq 0$)
    \begin{equation}
        E_r = \frac{\hat{x} - x}{x} \marginnote{Relative error}
    \end{equation}
\end{description}

\subsection{Representation in base \texorpdfstring{$\beta$}{B}}
Let $\beta \in \mathbb{N}_{> 1}$ be the base. Each $x \in \mathbb{R} \smallsetminus \{0\}$ can be uniquely represented as:
\begin{equation} \label{eq:finnum_b_representation}
    x = \texttt{sign}(x) \cdot (d_1\beta^{-1} + d_2\beta^{-2} + \dots)\beta^p
\end{equation}
where:
\begin{itemize}
    \item $0 \leq d_i \leq \beta-1$
    \item $d_1 \neq 0$
    \item there is no index $i$ such that $d_j = \beta-1$ for all $j \geq i$
\end{itemize}
%
\Cref{eq:finnum_b_representation} can be written in normalized scientific notation as: \marginnote{Normalized scientific notation}
\begin{equation}
    x = \pm (0.d_1d_2\dots) \beta^p
\end{equation}
where $0.d_1d_2\dots$ is the \textbf{mantissa} and $\beta^p$ the \textbf{exponent}. \marginnote{Mantissa\\Exponent}

\subsection{Floating-point}
A floating-point system $\mathcal{F}(\beta, t, L, U)$ is defined by the parameters: \marginnote{Floating-point}
\begin{itemize}
    \item $\beta$: base
    \item $t$: precision (number of digits of the mantissa)
    \item $[L, U]$: range of the exponent
\end{itemize}
%
Each $x \in \mathcal{F}(\beta, t, L, U)$ can be represented in its normalized form:
\begin{equation}
    x = \pm (0.d_1d_2 \dots d_t) \beta^p \qquad L \leq p \leq U
\end{equation}

\begin{example}
    In $\mathcal{F}(10, 5, -3, 3)$, $x = 12.\bar{3}$ is represented as:
    \begin{equation*}
        \texttt{fl}(x) = +0.12333 \cdot 10^2
    \end{equation*}
\end{example}

\subsubsection{Numbers distribution}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the total amount of representable numbers (including zero) is:
\begin{equation*}
    2(\beta-1) \beta^{t-1} (U-L+1) + 1
\end{equation*}
%
Representable numbers are denser towards the lower bound of the exponent and sparser towards the upper bound. Note that there is an underflow area around 0: no non-zero number smaller in magnitude than $\beta^{L-1}$ is representable.
\begin{figure}[h]
    \centering
    \includegraphics[width=0.8\textwidth]{img/floatingpoint_range.png}
    \caption{Floating-point numbers in $\mathcal{F}(2, 3, -1, 2)$}
\end{figure}
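Both the counting formula and the distribution of the numbers can be checked empirically. The following Python sketch (a minimal illustration; the helper \texttt{representable} is ours, not a library function) enumerates all elements of the example system $\mathcal{F}(2, 3, -1, 2)$:
\begin{verbatim}
from itertools import product

def representable(beta=2, t=3, L=-1, U=2):
    """All numbers of F(beta, t, L, U), plus zero."""
    nums = {0.0}
    for p in range(L, U + 1):
        for d1 in range(1, beta):            # normalized: d1 != 0
            for rest in product(range(beta), repeat=t - 1):
                m = sum(d * beta ** -(i + 1)
                        for i, d in enumerate((d1,) + rest))
                nums.add(m * beta ** p)      # positive numbers
                nums.add(-m * beta ** p)     # negative numbers
    return sorted(nums)

xs = representable()
# 2(beta-1) beta^(t-1) (U-L+1) + 1 = 2*1*4*4 + 1 = 33
assert len(xs) == 33
# Underflow gap: the smallest positive number is 2^(L-1) = 0.25
assert min(x for x in xs if x > 0) == 0.25
\end{verbatim}
Printing the gaps between consecutive elements of \texttt{xs} shows them growing with the exponent, matching the figure above.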
\subsubsection{Numbers representation}
Given a floating-point system $\mathcal{F}(\beta, t, L, U)$, the representation of $x \in \mathbb{R}$ can result in:
\begin{description}
    \item[Exact representation] if $p \in [L, U]$ and $d_i = 0$ for $i > t$.
    \item[Approximation] if $p \in [L, U]$ but $d_i \neq 0$ for some $i > t$. In this case, the representation is obtained by truncating or rounding the value. \marginnote{Truncation\\Rounding}
    \item[Underflow] if $p < L$. In this case, the value is approximated as 0.
    \item[Overflow] if $p > U$. In this case, an exception is usually raised.
\end{description}

\subsubsection{Machine precision}
Machine precision $\varepsilon_{\text{mach}}$ determines the accuracy of a floating-point system. \marginnote{Machine precision} Depending on the approximation approach, it can be computed as:
\begin{description}
    \item[Truncation] $\varepsilon_{\text{mach}} = \beta^{1-t}$
    \item[Rounding] $\varepsilon_{\text{mach}} = \frac{1}{2}\beta^{1-t}$
\end{description}
Therefore, rounding results in more accurate representations. $\varepsilon_{\text{mach}}$ corresponds to the distance between 1 and the next larger representable number (\Cref{fig:finnum_eps}).
\begin{figure}[h]
    \centering
    \includegraphics[width=0.2\textwidth]{img/machine_eps.png}
    \caption{Visualization of $\varepsilon_{\text{mach}}$ in $\mathcal{F}(2, 3, -1, 2)$}
    \label{fig:finnum_eps}
\end{figure}
%
Alternatively, $\varepsilon_{\text{mach}}$ can be defined as the smallest representable number such that:
\begin{equation*}
    \texttt{fl}(1 + \varepsilon_{\text{mach}}) > 1.
\end{equation*}

\subsubsection{IEEE standard}
IEEE 754 defines two main floating-point formats:
\begin{description}
    \item[Single precision] Stored in 32 bits. Represents the system $\mathcal{F}(2, 24, -126, 127)$. \marginnote{float32}
    \begin{center} \small
        \begin{tabular}{|c|c|c|}
            \hline
            1 (sign) & 8 (exponent) & 23 (mantissa) \\
            \hline
        \end{tabular}
    \end{center}
    \item[Double precision] Stored in 64 bits. Represents the system $\mathcal{F}(2, 53, -1022, 1023)$. \marginnote{float64}
    \begin{center} \small
        \begin{tabular}{|c|c|c|}
            \hline
            1 (sign) & 11 (exponent) & 52 (mantissa) \\
            \hline
        \end{tabular}
    \end{center}
\end{description}
As the first digit of the mantissa is always 1, it does not need to be stored. Moreover, special exponent configurations are reserved to represent \texttt{Inf} and \texttt{NaN}.

\subsubsection{Floating-point arithmetic}
Let:
\begin{itemize}
    \item $+: \mathbb{R} \times \mathbb{R} \rightarrow \mathbb{R}$ be an operation on real numbers.
    \item $\oplus: \mathcal{F} \times \mathcal{F} \rightarrow \mathcal{F}$ be the corresponding operation in a floating-point system.
\end{itemize}
%
To compute $x \oplus y$, a machine:
\begin{enumerate}
    \item Calculates $x + y$ in a high-precision register (still approximated, but more precise than the storage system)
    \item Stores the result as $\texttt{fl}(x + y)$
\end{enumerate}
A floating-point operation causes a small rounding error:
\begin{equation}
    \left\vert \frac{(x \oplus y) - (x + y)}{x + y} \right\vert < \varepsilon_{\text{mach}}
\end{equation}
%
However, some operations may be subject to the \textbf{cancellation} problem, which causes information loss. \marginnote{Cancellation}
\begin{example}
    Given $x = 1$ and $y = 1 \cdot 10^{-16}$, we want to compute $x + y$ in $\mathcal{F}(10, 16, L, U)$.
    \begin{equation*}
        \begin{split}
            z & = \texttt{fl}(x) + \texttt{fl}(y) \\
            & = 0.1 \cdot 10^1 + 0.1 \cdot 10^{-15} \\
            & = (0.1 + 0.\overbrace{0\dots0}^{\mathclap{16\text{ zeros}}}1) \cdot 10^1 \\
            & = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}}1 \cdot 10^1
        \end{split}
    \end{equation*}
    Since only $t = 16$ digits can be stored, $\texttt{fl}(z) = 0.1\overbrace{0\dots0}^{\mathclap{15\text{ zeros}}} \cdot 10^1 = 1 = x$: the contribution of $y$ is completely lost.
\end{example}
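The alternative definition of $\varepsilon_{\text{mach}}$ can be verified directly on the hardware \texttt{float64} format. A minimal Python sketch (assuming IEEE 754 double precision, as on virtually all modern hardware):
\begin{verbatim}
import sys

# Halve eps until 1 + eps is rounded back to 1: the final value
# is the gap between 1 and the next representable float64,
# i.e. beta^(1-t) = 2^(1-53) = 2^-52.
eps = 1.0
while 1.0 + eps / 2 > 1.0:
    eps /= 2

print(eps)                     # 2.220446049251313e-16 = 2**-52
print(sys.float_info.epsilon)  # same value, as reported by Python
\end{verbatim}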
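The absorption in the example above behaves the same way in \texttt{float64}, whose 53 binary digits correspond to roughly 16 significant decimal digits:
\begin{verbatim}
x, y = 1.0, 1e-16

# y is below eps_mach relative to x, so it is lost entirely
# when the sum is rounded back to 53 binary digits.
print(x + y == x)   # True: fl(x + y) == fl(x)
print((x + y) - x)  # 0.0: no trace of y survives
\end{verbatim}
Note that \texttt{y} by itself is perfectly representable; the information is lost only when two operands of very different magnitudes are added.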