\chapter{Matrix decomposition}
\section{Eigendecomposition}
\marginnote{Eigendecomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{n \times n}$.
If the eigenvectors of $\matr{A}$ form a basis of $\mathbb{R}^n$,
then $\matr{A}$ can be decomposed into:
\[ \matr{A} = \matr{P}\matr{D}\matr{P}^{-1} \]
where $\matr{P} \in \mathbb{R}^{n \times n}$ contains the eigenvectors of $\matr{A}$ as its columns and
$\matr{D}$ is a diagonal matrix whose diagonal contains the eigenvalues of $\matr{A}$.
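As a small worked example (with an arbitrarily chosen matrix), take
$\matr{A} = \begin{pmatrix} 2 & 1 \\ 1 & 2 \end{pmatrix}$,
whose eigenvalues are $3$ and $1$ with eigenvectors $(1, 1)^T$ and $(1, -1)^T$:
\[
    \matr{A} =
    \underbrace{\begin{pmatrix} 1 & 1 \\ 1 & -1 \end{pmatrix}}_{\matr{P}}
    \underbrace{\begin{pmatrix} 3 & 0 \\ 0 & 1 \end{pmatrix}}_{\matr{D}}
    \underbrace{\frac{1}{2}\begin{pmatrix} 1 & 1 \\ 1 & -1 \end{pmatrix}}_{\matr{P}^{-1}}
\]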
\section{Singular value decomposition}
\marginnote{Singular value decomposition}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r \in [0, \min\{m, n\}]$.
The singular value decomposition (SVD) of $\matr{A}$ is always possible and has the form:
\[
    \matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
\]
\[
    =
    \begin{pmatrix}
        \begin{pmatrix} \\ \vec{u}_1 \\ \\ \end{pmatrix} &
        \dots &
        \begin{pmatrix} \\ \vec{u}_m \\ \\ \end{pmatrix}
    \end{pmatrix}
    \begin{pmatrix}
        \sigma_1 & 0 & 0 \\
        0 & \ddots & 0 \\
        0 & 0 & \sigma_{\min\{m, n\}} \\
    \end{pmatrix}
    \begin{pmatrix}
        \begin{pmatrix} & \vec{v}_1 & \end{pmatrix} \\
        \vdots \\
        \begin{pmatrix} & \vec{v}_n & \end{pmatrix} \\
    \end{pmatrix}
\]
where:
\begin{itemize}
    \item
        $\matr{U} \in \mathbb{R}^{m \times m}$ is an orthogonal matrix with columns $\vec{u}_i$ called left-singular vectors.

    \item
        $\matr{V} \in \mathbb{R}^{n \times n}$ is an orthogonal matrix with columns $\vec{v}_i$ called right-singular vectors.

    \item
        $\matr{\Sigma} \in \mathbb{R}^{m \times n}$ is a matrix with $\matr{\Sigma}_{i,j} = 0$ for $i \neq j$ (i.e. diagonal if it were a square matrix) and
        the singular values $\sigma_i, i = 1 \dots \min\{m, n\}$ on the diagonal.
        By convention $\sigma_1 \geq \sigma_2 \geq \dots \geq \sigma_r \geq 0$.
        Note that $\sigma_j = 0$ for $(r + 1) \leq j \leq \min\{m, n\}$
        (i.e. singular values at indices after $\text{rank}(\matr{A})$ are always 0).
\end{itemize}
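As an illustration of the shape of $\matr{\Sigma}$ (for hypothetical dimensions $m = 3$, $n = 2$), the extra rows below the diagonal are filled with zeros:
\[
    \matr{\Sigma} =
    \begin{pmatrix}
        \sigma_1 & 0 \\
        0 & \sigma_2 \\
        0 & 0 \\
    \end{pmatrix}
    \in \mathbb{R}^{3 \times 2}
\]
while for $m < n$ the extra zeros appear as additional columns to the right.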
\marginnote{Singular value equation}
We can also represent SVD as a \textbf{singular value equation}, which resembles the eigenvalue equation:
\[ \matr{A}\vec{v}_i = \sigma_i\vec{u}_i \text{ for } i = 1, \dots, r \]
This is derived from:
\[
    \matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T
    \iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}\matr{V}^T\matr{V}
    \iff \matr{A}\matr{V} = \matr{U}\matr{\Sigma}
\]
by reading the equality $\matr{A}\matr{V} = \matr{U}\matr{\Sigma}$ column by column:
the $i$-th column of $\matr{A}\matr{V}$ is $\matr{A}\vec{v}_i$ and the $i$-th column of $\matr{U}\matr{\Sigma}$ is $\sigma_i\vec{u}_i$.
\subsection{Singular values and eigenvalues}
\marginnote{Eigendecomposition of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$}
Given $\matr{A} \in \mathbb{R}^{m \times n}$, we can obtain the eigenvalues and eigenvectors
of $\matr{A}^T\matr{A}$ and $\matr{A}\matr{A}^T$ through SVD.

For $\matr{A}^T\matr{A}$, we can compute:
\[
    \begin{split}
        \matr{A}^T\matr{A} & = (\matr{U}\matr{\Sigma}\matr{V}^T)^T(\matr{U}\matr{\Sigma}\matr{V}^T) \text{ using } (\matr{A}\matr{B})^T = \matr{B}^T\matr{A}^T \\
        & = (\matr{V}\matr{\Sigma}^T\matr{U}^T)(\matr{U}\matr{\Sigma}\matr{V}^T) \\
        & = \matr{V}\matr{\Sigma}^T\matr{\Sigma}\matr{V}^T \\
        & = \matr{V}\matr{\Sigma}^2\matr{V}^T
    \end{split}
\]
where $\matr{\Sigma}^2$ is a shorthand for the $n \times n$ diagonal matrix $\matr{\Sigma}^T\matr{\Sigma}$.
As $\matr{V}$ is orthogonal ($\matr{V}^T = \matr{V}^{-1}$), we can apply the eigendecomposition theorem:
\begin{itemize}
    \item The diagonal entries of $\matr{\Sigma}^2$ (i.e. the squared singular values of $\matr{A}$) are the eigenvalues of $\matr{A}^T\matr{A}$.
    \item The columns of $\matr{V}$ (right-singular vectors) are the eigenvectors of $\matr{A}^T\matr{A}$.
\end{itemize}

The same process holds for $\matr{A}\matr{A}^T$. In this case, the columns of $\matr{U}$ (left-singular vectors) are the eigenvectors.
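For completeness, spelling out the analogous computation for $\matr{A}\matr{A}^T$ (same reasoning, with the roles of $\matr{U}$ and $\matr{V}$ swapped):
\[
    \matr{A}\matr{A}^T
    = (\matr{U}\matr{\Sigma}\matr{V}^T)(\matr{U}\matr{\Sigma}\matr{V}^T)^T
    = \matr{U}\matr{\Sigma}\matr{V}^T\matr{V}\matr{\Sigma}^T\matr{U}^T
    = \matr{U}\matr{\Sigma}\matr{\Sigma}^T\matr{U}^T
\]
so the eigenvalues of $\matr{A}\matr{A}^T$ are again the squared singular values (padded with zeros when $m > n$).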
\subsection{Singular values and 2-norm}
Given a symmetric matrix $\matr{A} \in \mathbb{R}^{n \times n}$,
we have that $\matr{A}^T\matr{A} = \matr{A}^2 = \matr{A}\matr{A}^T$ (as $\matr{A}^T = \matr{A}$).

The eigenvalues of $\matr{A}^2$ are $\lambda_1^2, \dots, \lambda_n^2$, where $\lambda_i$ are the eigenvalues of $\matr{A}$.
Equivalently, the eigenvalues of $\matr{A}^2$ are the squared singular values of $\matr{A}$: $\lambda_i^2 = \sigma_i^2$.
Moreover, if $\matr{A}$ is invertible, the eigenvalues of $\matr{A}^{-1}$ are $\frac{1}{\lambda_1}, \dots, \frac{1}{\lambda_n}$.

\marginnote{2-norm using SVD}
We can compute the 2-norm as:
\[ \Vert \matr{A} \Vert_2 = \sqrt{\rho(\matr{A}^T\matr{A})} = \sqrt{\rho(\matr{A}^2)} = \sqrt{\max\{\sigma_1^2, \dots, \sigma_r^2\}} = \sigma_1 \]
\[
    \Vert \matr{A}^{-1} \Vert_2 = \sqrt{\rho((\matr{A}^{-1})^T(\matr{A}^{-1}))} =
    \sqrt{\rho((\matr{A}\matr{A}^T)^{-1})} = \sqrt{\rho((\matr{A}^2)^{-1})} = \sqrt{\max\left\{\frac{1}{\sigma_1^2}, \dots, \frac{1}{\sigma_r^2}\right\}} = \frac{1}{\sigma_r}
\]
Furthermore, we can compute the condition number of $\matr{A}$ as:
\[ K(\matr{A}) = \Vert \matr{A} \Vert_2 \cdot \Vert \matr{A}^{-1} \Vert_2 = \sigma_1 \cdot \frac{1}{\sigma_r} = \frac{\sigma_1}{\sigma_r} \]
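As a quick numeric check, take again the (arbitrarily chosen) symmetric matrix
$\matr{A} = \begin{pmatrix} 2 & 1 \\ 1 & 2 \end{pmatrix}$,
whose singular values are $\sigma_1 = 3$ and $\sigma_2 = 1$:
\[
    \Vert \matr{A} \Vert_2 = \sigma_1 = 3
    \qquad
    \Vert \matr{A}^{-1} \Vert_2 = \frac{1}{\sigma_2} = 1
    \qquad
    K(\matr{A}) = \frac{\sigma_1}{\sigma_2} = 3
\]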
\subsection{Application: Matrix approximation}
Given a matrix $\matr{A} \in \mathbb{R}^{m \times n}$ and its SVD $\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T$,
we can construct a rank-1 matrix (dyad) $\matr{A}_i \in \mathbb{R}^{m \times n}$ as: \marginnote{Dyad}
\[ \matr{A}_i = \vec{u}_i \vec{v}_i^T \]
where $\vec{u}_i \in \mathbb{R}^m$ is the $i$-th column of $\matr{U}$ and
$\vec{v}_i \in \mathbb{R}^n$ is the $i$-th column of $\matr{V}$.
Then, we can write $\matr{A}$ as a sum of dyads:
\[ \matr{A} = \sum_{i=1}^{r} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{r} \sigma_i \matr{A}_i \]

\marginnote{Rank-$k$ approximation}
By considering only the first $k < r$ singular values, we can obtain a rank-$k$ approximation of $\matr{A}$:
\[ \hat{\matr{A}}(k) = \sum_{i=1}^{k} \sigma_i \vec{u}_i \vec{v}_i^T = \sum_{i=1}^{k} \sigma_i \matr{A}_i \]

\begin{theorem}[Eckart-Young]
    Given $\matr{A} \in \mathbb{R}^{m \times n}$ of rank $r$.
    For any $k \leq r$ (this theorem is interesting for $k < r$), the rank-$k$ approximation satisfies:
    \[
        \hat{\matr{A}}(k) = \arg \min_{\matr{B} \in \mathbb{R}^{m \times n}, \text{rank}(\matr{B}) = k} \Vert \matr{A} - \matr{B} \Vert_2
    \]
\end{theorem}
In other words, among all rank-$k$ matrices, $\hat{\matr{A}}(k)$ is the closest one to $\matr{A}$ in the 2-norm.
Moreover, the error of the rank-$k$ approximation is:
\[
    \Vert \matr{A} - \hat{\matr{A}}(k) \Vert_2 =
    \left\Vert \sum_{i=1}^{r} \sigma_i \matr{A}_i - \sum_{j=1}^{k} \sigma_j \matr{A}_j \right\Vert_2 =
    \left\Vert \sum_{i=k+1}^{r} \sigma_i \matr{A}_i \right\Vert_2 =
    \sigma_{k+1}
\]
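As a minimal hand-picked example, for the diagonal matrix below the SVD is immediate and the rank-1 truncation drops exactly the smallest singular value:
\[
    \matr{A} = \begin{pmatrix} 3 & 0 \\ 0 & 1 \end{pmatrix}
    = 3 \begin{pmatrix} 1 \\ 0 \end{pmatrix}\begin{pmatrix} 1 & 0 \end{pmatrix}
    + 1 \begin{pmatrix} 0 \\ 1 \end{pmatrix}\begin{pmatrix} 0 & 1 \end{pmatrix}
    \qquad
    \hat{\matr{A}}(1) = \begin{pmatrix} 3 & 0 \\ 0 & 0 \end{pmatrix}
    \qquad
    \Vert \matr{A} - \hat{\matr{A}}(1) \Vert_2 = \sigma_2 = 1
\]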
\subsubsection{Image compression}
Each dyad requires $1 + m + n$ numbers to be stored (respectively for $\sigma_i$, $\vec{u}_i$ and $\vec{v}_i$).
A rank-$k$ approximation therefore requires storing $k(1 + m + n)$ numbers.
The compression factor is given by: \marginnote{Compression factor}
\[
    c_k = 1 - \frac{k(1 + m + n)}{mn}
\]
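For example, for a hypothetical $512 \times 512$ grayscale image approximated with $k = 20$ dyads:
\[
    c_{20} = 1 - \frac{20 \cdot (1 + 512 + 512)}{512 \cdot 512} = 1 - \frac{20500}{262144} \approx 0.92
\]
i.e. only about $8\%$ of the original storage is needed.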
\begin{figure}[h]
    \centering
    \includegraphics[width=0.60\textwidth]{img/_rank_k_approx.pdf}
    \caption{Approximation of an image}
\end{figure}
\subsection{Application: Linear least squares problem} \label{sec:lls}
A system $\matr{A}\vec{x} = \vec{b}$ with $\matr{A} \in \mathbb{R}^{m \times n} \text{, } m > n$
does not generally have a solution.
\marginnote{Linear least squares}
Therefore, instead of finding the exact solution, it is possible to search for an $\tilde{\vec{x}}$ such that:
\[ \matr{A}\tilde{\vec{x}} - \vec{b} \approx \nullvec \]
In other words, we aim to find an $\tilde{\vec{x}}$ that is close enough to solving the system.
This problem is usually formulated as:
\[
    \tilde{\vec{x}} = \arg\min_{\vec{x} \in \mathbb{R}^n} \Vert \matr{A}\vec{x} - \vec{b} \Vert_2^2
\]
It always admits a solution and, depending on $\text{rank}(\matr{A})$, there are two possible cases:
\begin{descriptionlist}
    \item[$\text{rank}(\matr{A}) = n$]
        The solution is unique for each $\vec{b} \in \mathbb{R}^m$.
        \marginnote{Normal equation}
        It is found by solving the normal equation:
        \[ \matr{A}^T\matr{A}\vec{x} = \matr{A}^T\vec{b} \]
        $\matr{A}^T\matr{A}$ is symmetric positive definite and the system can be solved using the Cholesky factorization
        (a small worked example is shown after this list).

    \item[$\text{rank}(\matr{A}) < n$] \marginnote{Least squares using SVD}
        The system admits infinitely many solutions.
        Of all the solutions $S$, we are interested in the one with minimum norm:
        \[ \vec{x}^* = \arg\min_{\vec{x} \in S} \Vert \vec{x} \Vert_2 \]
        This problem can be solved using SVD:
        \[ \vec{x}^* = \sum_{i=1}^{\text{rank}(\matr{A})} \frac{\vec{u}_i^T\vec{b}}{\sigma_i}\vec{v}_i \]
\end{descriptionlist}
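As a minimal worked example of the full-rank case (with arbitrarily chosen data), fitting a single constant to two measurements amounts to solving the normal equation:
\[
    \matr{A} = \begin{pmatrix} 1 \\ 1 \end{pmatrix}
    \quad
    \vec{b} = \begin{pmatrix} 1 \\ 2 \end{pmatrix}
    \quad\Rightarrow\quad
    \matr{A}^T\matr{A} = 2
    \quad
    \matr{A}^T\vec{b} = 3
    \quad\Rightarrow\quad
    \tilde{\vec{x}} = \frac{3}{2}
\]
i.e. the least squares solution is the mean of the two measurements.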
\subsection{Application: Polynomial interpolation}
\marginnote{Polynomial interpolation}
Given a set of $m$ data points $(x_i, y_i), i=1, \dots, m$,
we want to find a polynomial of degree $n$ ($m > n$) that approximates them.
In other words, we want to find a function:
\[ f(x) = c_0 + c_1 x + c_2 x^2 + \dots + c_n x^n \]
that minimizes the norm of the residual vector $\vec{r} = (r_1, \dots, r_m)$,
where $r_i = \vert y_i - f(x_i) \vert$.
We can formulate this in matrix form:
\[
    \vec{r} = \vec{y} - \matr{A}\vec{c} =
    \begin{pmatrix}
        y_1 \\
        \vdots \\
        y_m
    \end{pmatrix}
    -
    \begin{pmatrix}
        1 & x_1 & x_1^2 & \dots & x_1^n \\
        \vdots & \vdots & \vdots & \ddots & \vdots \\
        1 & x_m & x_m^2 & \dots & x_m^n
    \end{pmatrix}
    \begin{pmatrix}
        c_0 \\
        \vdots \\
        c_n
    \end{pmatrix}
\]
which can be solved as a linear least squares problem (Section \ref{sec:lls}):
\[ \min_{\vec{c} \in \mathbb{R}^{n+1}} \Vert \vec{y} - \matr{A}\vec{c} \Vert_2^2 \]
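As a small worked example (with arbitrarily chosen points), fitting a degree-1 polynomial to the points $(0, 0)$, $(1, 1)$, $(2, 1)$ via the normal equation gives:
\[
    \matr{A} = \begin{pmatrix} 1 & 0 \\ 1 & 1 \\ 1 & 2 \end{pmatrix}
    \quad
    \vec{y} = \begin{pmatrix} 0 \\ 1 \\ 1 \end{pmatrix}
    \quad\Rightarrow\quad
    \matr{A}^T\matr{A} = \begin{pmatrix} 3 & 3 \\ 3 & 5 \end{pmatrix}
    \quad
    \matr{A}^T\vec{y} = \begin{pmatrix} 2 \\ 3 \end{pmatrix}
    \quad\Rightarrow\quad
    \vec{c} = \begin{pmatrix} 1/6 \\ 1/2 \end{pmatrix}
\]
so the fitted line is $f(x) = \frac{1}{6} + \frac{1}{2}x$.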
\begin{figure}[h]
    \centering
    \includegraphics[width=0.40\textwidth]{img/linear_regression.png}
    \caption{Interpolation using a polynomial of degree 1}
\end{figure}
\section{Eigendecomposition vs SVD}
\begin{center}
    \begin{tabular}{m{16em} | m{16em}}
        \hline
        \multicolumn{1}{c|}{\textbf{Eigendecomposition}} & \multicolumn{1}{c}{\textbf{SVD}} \\
        \multicolumn{1}{c|}{$\matr{A} = \matr{P}\matr{D}\matr{P}^{-1}$} & \multicolumn{1}{c}{$\matr{A} = \matr{U}\matr{\Sigma}\matr{V}^T$} \\
        \hline
        Only defined for square matrices $\matr{A} \in \mathbb{R}^{n \times n}$ with eigenvectors that form a basis of $\mathbb{R}^n$
        & Always exists \\
        \hline
        $\matr{P}$ is not necessarily orthogonal & $\matr{U}$ and $\matr{V}$ are orthogonal \\
        \hline
        The elements on the diagonal of $\matr{D}$ may be in $\mathbb{C}$
        & The elements on the diagonal of $\matr{\Sigma}$ are all non-negative reals \\
        \hline
        \multicolumn{2}{c}{For symmetric positive semi-definite matrices, eigendecomposition and SVD coincide} \\
        \hline
    \end{tabular}
\end{center}