\chapter{Optimization}

\section{Definitions}

\subsection{Unconstrained optimization}
\begin{description}
\item[Unconstrained optimization] \marginnote{Unconstrained optimization}
Problem of the form:
\[ \min_{\z \in \mathbb{R}^d} l(\z) \]
where $l: \mathbb{R}^d \rightarrow \mathbb{R}$ is the cost function and $\z$ are the decision variables.
\end{description}

\begin{theorem}[First-order necessary condition of optimality] \marginnote{First-order necessary condition of optimality}
Given a point $\z^*$ and a cost function $l: \mathbb{R}^d \rightarrow \mathbb{R}$ such that $l \in C^1$ in $B(\z^*, \varepsilon)$ (i.e., the neighborhood of $\z^*$ with radius $\varepsilon$), it holds that:
\[
\z^* \text{ is a local minimum } \Rightarrow \nabla l(\z^*) = 0
\]
\end{theorem}

\begin{theorem}[Second-order necessary condition of optimality] \marginnote{Second-order necessary condition of optimality}
Given a point $\z^*$ and a cost function $l: \mathbb{R}^d \rightarrow \mathbb{R}$ such that $l \in C^2$ in $B(\z^*, \varepsilon)$, it holds that:
\[
\z^* \text{ is a local minimum } \Rightarrow \nabla^2 l(\z^*) \geq 0 \text{ (i.e., positive semidefinite)}
\]
\end{theorem}
\subsection{Convexity}

\begin{description}
\item[Convex set] \marginnote{Convex set}
A set $Z \subseteq \mathbb{R}^d$ is convex if it holds that:
\[
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]: (\alpha \z_A + (1-\alpha)\z_B) \in Z
\]

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{img/_convex_set.pdf}
\end{figure}

\item[Convex function] \marginnote{Convex function}
Given a convex set $Z \subseteq \mathbb{R}^d$, a function $l: Z \rightarrow \mathbb{R}$ is convex if it holds that:
\[
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]: l(\alpha \z_A + (1-\alpha) \z_B) \leq \alpha l(\z_A) + (1-\alpha) l(\z_B)
\]

\begin{figure}[H]
\centering
\includegraphics[width=0.25\linewidth]{img/_convex_function.pdf}
\end{figure}

\begin{remark}
Given a differentiable and convex function $l: Z \rightarrow \mathbb{R}$, it holds that the function lies above all of its tangents:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \geq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) \]

\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{img/_convex_tangent.pdf}
\end{figure}
\end{remark}

\item[Strongly convex function] \marginnote{Strongly convex function}
Given a convex set $Z \subseteq \mathbb{R}^d$, a function $l: Z \rightarrow \mathbb{R}$ is strongly convex with parameter $\mu > 0$ if it holds that:
\[
\begin{split}
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]&: l(\alpha \z_A + (1-\alpha) \z_B) \leq \\
&\alpha l(\z_A) + (1-\alpha) l(\z_B) - \frac{1}{2} \mu \alpha (1-\alpha) \Vert \z_A-\z_B \Vert^2
\end{split}
\]
Intuitively, it is strictly convex and grows at least as fast as a quadratic function.

\begin{remark}
Given a differentiable and $\mu$-strongly convex function $l: Z \rightarrow \mathbb{R}$, it holds that the function lies above all the paraboloids with curvature determined by $\mu$ and tangent to a point of the function:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \geq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) + \frac{\mu}{2} \Vert \z_B - \z_A \Vert^2 \]

\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{img/_strongly_convex.pdf}
\end{figure}

A geometric interpretation is that strong convexity imposes a quadratic lower bound on the function.
\end{remark}
\end{description}
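
\begin{remark}
For example, the quadratic function $l(\z) = \frac{1}{2} \z^T \matr{Q} \z + \vec{r}^T \z$ with symmetric $\matr{Q} \geq \mu \matr{I}$ is $\mu$-strongly convex, as its Hessian satisfies $\nabla^2 l(\z) = \matr{Q} \geq \mu \matr{I}$ everywhere.
\end{remark}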

\begin{lemma}[Convexity and gradient monotonicity] \marginnote{Convexity and gradient monotonicity}
Given a differentiable and convex function $l$, its gradient $\nabla l$ is a monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) \geq 0
\]
\end{lemma}

\begin{lemma}[Strict convexity and gradient monotonicity] \marginnote{Strict convexity and gradient monotonicity}
Given a differentiable and strictly convex function $l$, its gradient $\nabla l$ is a strictly monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B, \z_A \neq \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) > 0
\]
\end{lemma}

\begin{lemma}[Strong convexity and gradient monotonicity] \marginnote{Strong convexity and gradient monotonicity}
Given a differentiable and $\mu$-strongly convex function $l$, its gradient $\nabla l$ is a strongly monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) \geq \mu \Vert \z_A - \z_B \Vert^2
\]
\end{lemma}

\begin{description}
\item[Lipschitz continuity] \marginnote{Lipschitz continuity}
A function $l$ is Lipschitz continuous with parameter $L > 0$ if:
\[
\forall \z_A, \z_B: \Vert l(\z_A) - l(\z_B) \Vert \leq L \Vert \z_A - \z_B \Vert
\]

\begin{remark}
Given a differentiable function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, it holds that the function lies below all the paraboloids with curvature determined by $L$ and tangent to a point of the function:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \leq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) + \frac{L}{2} \Vert \z_B - \z_A \Vert^2 \]

\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{img/_lipschitz_gradient.pdf}
\end{figure}

A geometric interpretation is that Lipschitz continuity of the gradient imposes a quadratic upper bound on the function.
\end{remark}
\end{description}
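
\begin{remark}
For example, the gradient of the quadratic function $l(\z) = \frac{1}{2} \z^T \matr{Q} \z + \vec{r}^T \z$, i.e., $\nabla l(\z) = \matr{Q}\z + \vec{r}$, is Lipschitz continuous with $L = \Vert \matr{Q} \Vert$, since $\Vert \nabla l(\z_A) - \nabla l(\z_B) \Vert = \Vert \matr{Q}(\z_A - \z_B) \Vert \leq \Vert \matr{Q} \Vert \, \Vert \z_A - \z_B \Vert$.
\end{remark}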

\begin{lemma}[Convexity and Lipschitz continuity of gradient] \marginnote{Convexity and Lipschitz continuity of gradient}
Given a differentiable convex function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, its gradient is a co-coercive operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \Big( \nabla l(\z_A) - \nabla l(\z_B) \Big)^T (\z_A - \z_B) \geq \frac{1}{L} \Vert \nabla l(\z_A) - \nabla l(\z_B) \Vert^2
\]
\end{lemma}

\begin{lemma}[Strong convexity and Lipschitz continuity of gradient] \marginnote{Strong convexity and Lipschitz continuity of gradient} \phantomsection\label{th:strong_convex_lipschitz_gradient}
Given a differentiable $\mu$-strongly convex function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, its gradient is a strongly co-coercive operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \Big( \nabla l(\z_A) - \nabla l(\z_B) \Big)^T (\z_A - \z_B) \geq \underbrace{\frac{\mu L}{\mu+L}}_{\gamma_1} \Vert \z_A - \z_B \Vert^2 + \underbrace{\frac{1}{\mu+L}}_{\gamma_2} \Vert \nabla l(\z_A) - \nabla l(\z_B) \Vert^2
\]
\end{lemma}

\section{Iterative descent methods}

\begin{theorem}
Given a convex function $l$, it holds that a local minimum of $l$ is also global.

Moreover, in the unconstrained optimization case, the first-order necessary condition of optimality is also sufficient for a global minimum.
\end{theorem}

\begin{theorem}
Given a differentiable convex function $l$, it holds that $\z^*$ is a global minimum if and only if $\nabla l(\z^*) = 0$.
\end{theorem}

\begin{description}
\item[Iterative descent] \marginnote{Iterative descent}
Given a function $l$ and an initial guess $\z^{0}$, an iterative descent algorithm iteratively moves to new points $\z^{k}$ such that:
\[
\forall k \in \mathbb{N}: l(\z^{k+1}) < l(\z^{k})
\]
\end{description}

\subsection{Gradient method}

\begin{description}
\item[Gradient method] \marginnote{Gradient method}
Algorithm that, given the function $l$ to minimize and an initial guess $\z^0$, computes the update as:
\[ \z^{k+1} = \z^k - \alpha^k \nabla l(\z^k) \]
where $\alpha^k > 0$ is the step size and $- \nabla l(\z^k)$ is the step direction.

\begin{theorem}
For a sufficiently small $\alpha^k > 0$, the gradient method is an iterative descent algorithm:
\[ l(\z^{k+1}) < l(\z^{k}) \]

\begin{proof}
Consider the first-order Taylor approximation of $l(\z^{k+1})$ about $\z^k$:
\[
\begin{split}
l(\z^{k+1}) &= l(\z^k) + \nabla l(\z^k)^T (\z^{k+1} - \z^k) + o(\Vert \z^{k+1} - \z^k \Vert) \\
&= l(\z^k) - \alpha^k \Vert \nabla l(\z^k)\Vert^2 + o(\alpha^k)
\end{split}
\]
As long as $\nabla l(\z^k) \neq 0$, the negative term $-\alpha^k \Vert \nabla l(\z^k) \Vert^2$ dominates $o(\alpha^k)$ for a sufficiently small $\alpha^k$. Therefore, $l(\z^{k+1}) < l(\z^{k})$.
\end{proof}
\end{theorem}

\begin{remark}[Step size choice] \marginnote{Step size choice}
Possible choices for the step size are:
\begin{descriptionlist}
\item[Constant]
$\forall k \in \mathbb{N}: \alpha^k = \alpha > 0$.

\item[Diminishing]
$\alpha^k \overset{k \rightarrow \infty}{\longrightarrow} 0$. To avoid decreasing the step too much, a typical choice is an $\alpha^k$ such that:
\[
\sum_{k=0}^{\infty} \alpha^k = \infty
\qquad
\sum_{k=0}^{\infty} (\alpha^k)^2 < \infty
\]

\item[Line search]
Algorithmic methods such as the Armijo rule.
\end{descriptionlist}
\end{remark}
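
A minimal sketch of the gradient method with a constant step size, on a toy quadratic cost (NumPy, the cost, the step size, and all numerical values are hypothetical illustrations):
\begin{verbatim}
# Gradient method with a constant step size on l(z) = 0.5*z^T Q z + r^T z.
import numpy as np

Q = np.array([[3.0, 0.0], [0.0, 1.0]])  # positive definite
r = np.array([1.0, -2.0])

def grad_l(z):
    # Gradient of l(z) = 0.5 * z^T Q z + r^T z
    return Q @ z + r

alpha = 0.1          # constant step size (small enough for descent)
z = np.zeros(2)      # initial guess z^0
for k in range(1000):
    z = z - alpha * grad_l(z)

print(z, -np.linalg.solve(Q, r))  # iterate vs. closed-form minimizer
\end{verbatim}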

\item[Generalized gradient method] \marginnote{Generalized gradient method}
Gradient method where the update rule is generalized as:
\[ \z^{k+1} = \z^k - \alpha^k \matr{D}^k \nabla l(\z^k) \]
where $\matr{D}^k \in \mathbb{R}^{d \times d}$ is uniformly positive definite (i.e., $\delta_1 \matr{I} \leq \matr{D}^k \leq \delta_2 \matr{I}$ for some $\delta_2 \geq \delta_1 > 0$).

Possible choices for $\matr{D}^k$ are listed below (a one-step example follows the list):
\begin{itemize}
\item Steepest descent: $\matr{D}^k = \matr{I}$.
\item Newton's method: $\matr{D}^k = (\nabla^2 l(\z^k))^{-1}$.
\item Quasi-Newton method: $\matr{D}^k = (H(\z^k))^{-1}$, where $H(\z^k) \approx \nabla^2 l(\z^k)$.
\end{itemize}
\end{description}
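
As an illustrative sketch (hypothetical values), on a quadratic cost the Hessian is $\matr{Q}$ everywhere, so Newton's method with $\alpha = 1$ reaches the minimizer in a single step:
\begin{verbatim}
# Generalized gradient update with D^k = (Hessian)^{-1} (Newton's method).
import numpy as np

Q = np.array([[3.0, 0.5], [0.5, 1.0]])
r = np.array([1.0, -2.0])

z = np.array([5.0, -7.0])                 # arbitrary initial guess
grad = Q @ z + r                          # gradient of the quadratic cost
z = z - 1.0 * np.linalg.solve(Q, grad)    # D^k = Q^{-1}, alpha = 1

print(z, -np.linalg.solve(Q, r))          # identical up to rounding
\end{verbatim}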

\begin{description}
\item[Gradient method as discrete-time integrator with feedback] \marginnote{Gradient method as discrete-time integrator with feedback}
The gradient method can be interpreted as a discrete-time integrator with a feedback loop. This means that it is composed of:
\begin{descriptionlist}
\item[Integrator] A linear system that defines the update: $\z^{k+1} = \z^k - \alpha \vec{u}^k$.
\item[Plant] A non-linear (bounded) function whose output is re-injected into the integrator. In this case, it is the gradient: $\vec{u}^k = \nabla l(\z^k)$.
\end{descriptionlist}

\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_gradient_method_integrator.pdf}
\end{figure}
\end{description}

\begin{theorem}[Gradient method convergence] \marginnote{Gradient method convergence}
Consider a function $l$ such that:
\begin{itemize}
\item $\nabla l$ is $L$-Lipschitz continuous,
\item The step size is constant or diminishing.
\end{itemize}
Let $\{ \z^k \}_{k \in \mathbb{N}}$ be the (bounded) sequence generated by the gradient method. It holds that every limit point $\bar{\z}$ of the sequence $\{ \z^k \}_{k \in \mathbb{N}}$ is a stationary point (i.e., $\nabla l(\bar{\z}) = 0$).

In addition, if $l$ is $\mu$-strongly convex and the step size is constant, then the convergence rate of the sequence $\{ \z^k \}_{k \in \mathbb{N}}$ is exponential (also called geometric or linear):
\[
\Vert \z^k - \z^* \Vert \leq M \rho^k
\]
where $\rho \in (0, 1)$ and $M > 0$ depends on $\mu$, $L$, and $\Vert \z^0 - \z^* \Vert$.

\begin{proof}
We need to prove the two parts of the theorem:
\begin{enumerate}
\item
We want to prove that any limit point of the sequence generated by the gradient method is a stationary point.

In other words, by considering the gradient method as an integrator with feedback, we want to analyze the equilibria of the system. Assume that the system converges to some equilibrium $\z_E$. At an equilibrium, the feedback loop must stop updating the system (i.e., $\vec{u}^k = 0$ for every $k$ after some threshold) so that:
\[
\z_E = \z_E - \alpha \nabla l(\z_E)
\]
Therefore, an equilibrium point is necessarily a stationary point of $l$, as it must be that $\nabla l(\z_E) = 0$.

\item
We want to prove that if $l$ is $\mu$-strongly convex and the step size is constant, the sequence converges exponentially.

\begin{remark}
As $l$ is convex, its equilibrium is also the global minimum $\z^*$.
\end{remark}

Consider the following change of coordinates (i.e., a translation):
\[
\begin{gathered}
\z^k \mapsto \tilde{\z}^k \\
\text{with } \tilde{\z}^k = \z^k - \z_E = \z^k - \z^*
\end{gathered}
\]
The system in the new coordinates becomes:
\[
\begin{aligned}
&\tilde{\z}^{k+1} = \tilde{\z}^k - \alpha \vec{u}^k \\
&\begin{aligned}
\vec{u}^k &= \nabla l(\z^k) \\
&= \nabla l(\tilde{\z}^k + \z^*) \\
&= \nabla l(\tilde{\z}^k + \z^*) - \nabla l(\z^*) & & & \text{\small $\nabla l(\z^*)=0$, but useful for \Cref{th:strong_convex_lipschitz_gradient}}
\end{aligned}
\end{aligned}
\]

\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_gradient_method_integrator_new_coords.pdf}
\end{figure}

\begin{remark}
As $l$ is strongly convex and its gradient Lipschitz continuous, by \Cref{th:strong_convex_lipschitz_gradient} it holds that:
\[
-(\vec{u}^k)^T \tilde{\z}^k \leq - \gamma_1 \Vert \tilde{\z}^k \Vert^2 - \gamma_2 \Vert \vec{u}^k \Vert^2
\]
\end{remark}

Consider a Lyapunov function $V: \mathbb{R}^d \rightarrow \mathbb{R}_{\geq 0}$ defined as:
\[
V(\tilde{\z}) = \Vert \tilde{\z} \Vert^2
\]
It holds that:
\[
\begin{aligned}
V(\tilde{\z}^{k+1}) - V(\tilde{\z}^k) &= \Vert \tilde{\z}^{k+1} \Vert^2 - \Vert \tilde{\z}^k \Vert^2 \\
&= \cancel{\Vert \tilde{\z}^k \Vert^2} - 2\alpha(\vec{u}^k)^T\tilde{\z}^k + \alpha^2 \Vert \vec{u}^k \Vert^2 - \cancel{\Vert \tilde{\z}^k \Vert^2} &&& \text{\Cref{th:strong_convex_lipschitz_gradient}} \\
&\leq -2\alpha\gamma_1 \Vert\tilde{\z}^k\Vert^2 + \alpha(\alpha-2\gamma_2) \Vert\vec{u}^k\Vert^2
\end{aligned}
\]

By choosing $\alpha \leq 2\gamma_2$, we have that:
\[
\begin{split}
V(\tilde{\z}^{k+1}) - V(\tilde{\z}^k) &\leq -2\alpha\gamma_1 \Vert \tilde{\z}^k \Vert^2 \\
\iff \Vert \tilde{\z}^{k+1} \Vert^2 - \Vert \tilde{\z}^k \Vert^2 &\leq -2\alpha\gamma_1 \Vert \tilde{\z}^k \Vert^2 \\
\iff \Vert \tilde{\z}^{k+1} \Vert^2 &\leq (1-2\alpha\gamma_1) \Vert \tilde{\z}^k \Vert^2 \\
\end{split}
\]
Finally, by recursively applying this inequality, it holds that:
\[
\begin{split}
\Vert \tilde{\z}^{k+1} \Vert^2 &\leq (1-2\alpha\gamma_1) \Vert \tilde{\z}^k \Vert^2 \\
&\leq \dots \\
&\leq (1-2\alpha\gamma_1)^{k+1} \Vert \tilde{\z}^0 \Vert^2 \\
\end{split}
\]
Therefore, the sequence $\{ \tilde{\z}^k \}_{k \in \mathbb{N}}$ goes exponentially fast to zero and we have shown that:
\[
\begin{split}
\Vert \z^{k+1} - \z^* \Vert^2 &\leq (1-2\alpha\gamma_1)^{k+1} \Vert \z^0 - \z^* \Vert^2 \\
&= \rho^{k+1} M
\end{split}
\]
\end{enumerate}
\end{proof}
\end{theorem}

\begin{remark}[Gradient method for a quadratic function] \marginnote{Gradient method for a quadratic function}
Given the problem of minimizing a quadratic function (with $\matr{Q}$ positive definite):
\[
\min_{\z} \frac{1}{2}\z^T \matr{Q} \z + \vec{r}^T \z
\qquad
\nabla l(\z) = \matr{Q} \z + \vec{r}
\]
The gradient method can be reduced to an affine linear system:
\[
\begin{split}
\z^{k+1} &= \z^k - \alpha (\matr{Q} \z^k + \vec{r}) \\
&= (\matr{I} - \alpha \matr{Q}) \z^k - \alpha \vec{r}
\end{split}
\]
For a sufficiently small $\alpha$, the matrix $(\matr{I} - \alpha \matr{Q})$ is Schur stable (i.e., all its eigenvalues $\rho$ satisfy $|\rho| < 1$), so the geometric series $\sum_{i=0}^{\infty} (\matr{I} - \alpha \matr{Q})^i = (\alpha \matr{Q})^{-1}$ converges. Therefore, the solution can be computed in closed form as:
\[
\begin{split}
\z^k &= (\matr{I} - \alpha \matr{Q})^k \z^0 - \alpha \sum_{\tau=0}^{k-1} (\matr{I} - \alpha \matr{Q})^\tau \vec{r} \\
&\overset{k \rightarrow \infty}{\longrightarrow} - \alpha \left( \sum_{\tau=0}^{\infty} (\matr{I} - \alpha \matr{Q})^\tau \right) \vec{r} = -\matr{Q}^{-1} \vec{r}
\end{split}
\]
\end{remark}
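
A quick numerical check of the closed form (NumPy, all values are hypothetical illustrations):
\begin{verbatim}
# For positive definite Q and small alpha, the affine iteration converges
# to -Q^{-1} r.
import numpy as np

Q = np.array([[2.0, 0.0], [0.0, 4.0]])
r = np.array([1.0, 1.0])
alpha = 0.1                        # small enough: I - alpha*Q is Schur stable

z = np.array([10.0, -10.0])
for k in range(2000):
    z = (np.eye(2) - alpha * Q) @ z - alpha * r

print(z, -np.linalg.solve(Q, r))   # both approx. [-0.5, -0.25]
\end{verbatim}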

\begin{remark}[Gradient flow] \marginnote{Gradient flow}
By considering the continuous-time counterpart of the gradient method's discrete-time integrator with feedback, the result is the gradient flow:
\[
\dot{\z}(t) = -\nabla l(\z(t))
\]
which has a solution if the vector field is Lipschitz continuous.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_gradient_flow.pdf}
\end{figure}
\end{remark}

\subsection{Accelerated gradient methods}
\begin{description}
|
|
\item[Heavy-ball method] \marginnote{Heavy-ball method}
|
|
Given $\eta^0$ and $\eta^{-1}$, the algorithm is defined as:
|
|
\[
|
|
\eta^{k+1} = \eta^k + \alpha_1 (\eta^k - \eta^{k-1}) - \alpha_2 \nabla l(\eta^k)
|
|
\]
|
|
with $\alpha_1, \alpha_2 > 0$.
|
|
|
|
\begin{remark}
|
|
With $\alpha_1 = 0$, the algorithm is reduced to the gradient method with step size $\alpha_2$.
|
|
\end{remark}
|
|
|
|
\begin{remark}
|
|
The algorithm admits a state-space representation as a discrete-time integrator with a feedback loop:
|
|
\begin{figure}[H]
|
|
\centering
|
|
\includegraphics[width=0.55\linewidth]{./img/_heavy_ball.pdf}
|
|
\end{figure}
|
|
|
|
Note that the matrix $\begin{bmatrix} 1+\alpha_1 & -\alpha_1 \\ 1 & 0 \end{bmatrix}$ is row stochastic.
|
|
\end{remark}
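
A minimal sketch of the heavy-ball iteration on a hypothetical ill-conditioned quadratic (NumPy; the momentum and step size are hand-tuned illustrations):
\begin{verbatim}
# Heavy-ball method: gradient step plus a momentum term.
import numpy as np

Q = np.array([[10.0, 0.0], [0.0, 1.0]])   # ill-conditioned quadratic
r = np.array([0.0, 0.0])

def grad_l(eta):
    return Q @ eta + r

alpha1, alpha2 = 0.5, 0.1                 # momentum and step size
eta_prev = np.array([1.0, 1.0])           # eta^{-1}
eta = np.array([1.0, 1.0])                # eta^0
for k in range(500):
    eta, eta_prev = eta + alpha1 * (eta - eta_prev) - alpha2 * grad_l(eta), eta

print(eta)                                # approx. the minimizer [0, 0]
\end{verbatim}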

\item[Generalized heavy-ball method] \marginnote{Generalized heavy-ball method}
Given $\zeta^0$ and $\zeta^{-1}$, the algorithm is defined as:
\[
\zeta^{k+1} = \zeta^k + \alpha_1 (\zeta^k - \zeta^{k-1}) - \alpha_2 \nabla l(\zeta^k + \alpha_3(\zeta^k - \zeta^{k-1}))
\]
with $\alpha_1, \alpha_2, \alpha_3 > 0$.
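
\begin{remark}
Choosing $\alpha_1 = \alpha_3$ yields an update of the same form as Nesterov's accelerated gradient method.
\end{remark}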

\begin{remark}
The algorithm admits a state-space representation as a discrete-time integrator with a feedback loop:
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_generalized_heavy_ball.pdf}
\end{figure}
\end{remark}
\end{description}

\section{Cost-coupled optimization}

\begin{description}
\item[Cost-coupled optimization] \marginnote{Cost-coupled optimization}
Problem of minimizing the sum of $N$ cost functions $l_i: \mathbb{R}^d \rightarrow \mathbb{R}$, each local and private to an agent:
\[
\min_{\z \in \mathbb{R}^{d}} \sum_{i=1}^{N} l_i(\z)
\]
\end{description}

\subsection{Optimization methods}

\begin{description}
\item[Batch gradient method] \marginnote{Batch gradient method}
Compute the direction for the gradient method by considering all the losses:
\[
\z^{k+1} = \z^k - \alpha \sum_{i=1}^{N} \nabla l_i(\z^k)
\]

\begin{remark}
Computing all $N$ gradients at every iteration can be expensive.
\end{remark}

\item[Incremental gradient method] \marginnote{Incremental gradient method}
At each iteration $k$, compute the direction by considering the loss of a single agent $i^k$ (a sketch follows this description):
\[
\z^{k+1} = \z^k - \alpha \nabla l_{i^k}(\z^k)
\]

\begin{remark}
Two possible rules to select the agent at each iteration are:
\begin{descriptionlist}
\item[Cyclic]
$i^k = 1, 2, \dots, N, 1, 2, \dots, N, \dots$, or cyclic in any order (essentially cyclic).
\item[Randomized]
Draw $i^k$ from a uniform distribution.
\end{descriptionlist}
\end{remark}

\begin{remark}
A single gradient is not necessarily a descent direction.
\end{remark}

\begin{theorem}
If the step size is diminishing, the incremental gradient method converges.
\end{theorem}
\end{description}
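
A minimal sketch of the incremental gradient method with cyclic selection and a diminishing step size (NumPy; the per-agent quadratic losses and all values are hypothetical):
\begin{verbatim}
# Incremental gradient: one agent's gradient per iteration, cyclic order.
import numpy as np

targets = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([2.0, 2.0])]

def grad_li(z, i):
    # Gradient of l_i(z) = 0.5 * ||z - targets[i]||^2
    return z - targets[i]

N = len(targets)
z = np.zeros(2)
for k in range(5000):
    alpha = 1.0 / (k + 1)      # diminishing: sum = inf, sum of squares < inf
    i = k % N                  # cyclic selection rule
    z = z - alpha * grad_li(z, i)

print(z)                       # approx. the average target [1, 1]
\end{verbatim}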

\begin{description}
\item[Stochastic gradient descent (SGD)] \marginnote{Stochastic gradient descent (SGD)}
Instance of the incremental gradient method where the selection rule follows a possibly unknown distribution.

The problem can be formulated as:
\[ \min_{\z \in \mathbb{R}^d} \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \]
where $\mathcal{W}$ is a random variable with a possibly unknown distribution.

It is assumed that, given any realization $\bar{w}$ of $\mathcal{W}$ (e.g., the index of an agent or a single data point), it is possible to obtain the gradient $\nabla l(\bar{\z}, \bar{w})$ at any query point $\bar{\z}$. The optimization step at each iteration is then:
\[ \z^{k+1} = \z^k - \alpha \nabla l(\z^k, w^k) \]

\begin{remark}
Monte Carlo approximation can be used to represent the expected value with a finite sequence of realizations:
\[ \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \approx \frac{1}{K} \sum_{k=1}^{K} l(\z, w^k) \]
\end{remark}
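
A minimal SGD sketch (NumPy; the noise model, the loss, and all values are hypothetical illustrations):
\begin{verbatim}
# SGD: the expectation over W is replaced by sampled realizations w^k.
import numpy as np

rng = np.random.default_rng(0)
z_true = np.array([1.0, -1.0])

def grad_l(z, w):
    # Gradient of l(z, w) = 0.5 * ||z - w||^2 for one realization w
    return z - w

z = np.zeros(2)
alpha = 0.01                             # constant step size
for k in range(10000):
    w = z_true + rng.normal(0.0, 0.1, 2) # realization w^k of W
    z = z - alpha * grad_l(z, w)

print(z)   # close to z_true, up to a residual (see the theorem below)
\end{verbatim}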

\begin{theorem}[SGD convergence with constant step size] \marginnote{SGD convergence with constant step size}
Given a function $l$ such that:
\begin{itemize}
\item $l$ is $\mu$-strongly convex with $L$-Lipschitz continuous gradient,
\item $\nabla l(\z, \mathcal{W})$ is an unbiased estimate of $\nabla_\z \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})]$,
\item $\Vert \nabla l(\z, \mathcal{W}) \Vert \leq M_\nabla$ almost surely (i.e., with probability $1$) for some $M_\nabla > 0$.
\end{itemize}
With a constant step size $\alpha \leq \frac{1}{2\mu}$, it holds that at any time step $k$:
\[
\Vert \z^k - \z^* \Vert \leq
\underbrace{(1-2\mu\alpha)^k \left( \Vert \z^0 - \z^* \Vert - \frac{\alpha M_\nabla^2}{2\mu} \right)}_{\text{Error term}} +
\underbrace{\frac{\alpha M_\nabla^2}{2\mu}}_{\text{Residual term}} \]
where the error diminishes over time and the residual term is constant.
\end{theorem}

\begin{theorem}[SGD convergence with diminishing step size] \marginnote{SGD convergence with diminishing step size}
With a diminishing step size, both the error and the residual converge to $0$.
\end{theorem}

\item[Mini-batch SGD] \marginnote{Mini-batch SGD}
SGD where the update at each time step $k$ is based on a set $\mathcal{I}^k \subset \{ 1, \dots, N \}$ of realizations of $\mathcal{W}$:
\[ \z^{k+1} = \z^k - \alpha \sum_{i \in \mathcal{I}^k} \nabla l(\z^k, w^i) \]

\item[Adaptive momentum (ADAM)] \marginnote{Adaptive momentum (ADAM)}
Method based on estimates of the first and second moments of the gradient:
\[
\begin{split}
\vec{m}^{k+1} &= \beta_1 \vec{m}^k + (1-\beta_1) \nabla l(\z^k, w^k) \\
\vec{v}^{k+1} &= \beta_2 \vec{v}^k + (1-\beta_2) \left( \nabla l(\z^k, w^k) \right)^2
\end{split}
\]
where $\beta_1, \beta_2 \in (0, 1)$ are hyperparameters.

The descent direction is defined as:
\[
\begin{gathered}
\hat{\vec{m}} = \frac{1}{1 - \beta_1^{k+1}} \vec{m}^{k+1}
\quad
\hat{\vec{v}} = \frac{1}{1 - \beta_2^{k+1}} \vec{v}^{k+1} \\
\vec{d}^k = - \frac{\hat{\vec{m}}}{\sqrt{\hat{\vec{v}}} + \varepsilon} \\
\end{gathered}
\]
where $\varepsilon > 0$ is a small constant that avoids divisions by zero. The update is performed as:
\[ \z^{k+1} = \z^{k} + \alpha \vec{d}^k \]
\end{description}
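
A minimal ADAM sketch with the commonly used hyperparameters $\beta_1 = 0.9$, $\beta_2 = 0.999$ (NumPy; the stochastic gradient stub and all values are hypothetical):
\begin{verbatim}
# ADAM: bias-corrected first/second moment estimates of the gradient.
import numpy as np

def grad_l(z, w):
    return z - w                           # placeholder stochastic gradient

beta1, beta2, alpha, eps = 0.9, 0.999, 0.01, 1e-8
z = np.zeros(2)
m = np.zeros(2)                            # first moment estimate
v = np.zeros(2)                            # second moment estimate

rng = np.random.default_rng(0)
for k in range(5000):
    g = grad_l(z, rng.normal(1.0, 0.1, 2)) # gradient at realization w^k
    m = beta1 * m + (1 - beta1) * g
    v = beta2 * v + (1 - beta2) * g**2
    m_hat = m / (1 - beta1**(k + 1))       # bias correction
    v_hat = v / (1 - beta2**(k + 1))
    z = z - alpha * m_hat / (np.sqrt(v_hat) + eps)

print(z)                                   # approx. [1, 1]
\end{verbatim}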

\subsection{Learning paradigms}

\begin{description}
\item[Federated learning] \marginnote{Federated learning}
Problem where $N$ agents with their local and private data $\mathcal{D}^{i}$ want to learn a common set of parameters $\z^*$ based on the same loss function (evaluated on different data points):
\[
\min_\z \sum_{i=1}^{N} l(\z; \mathcal{D}^i)
\]
A centralized parameter server (master) is responsible for aggregating the estimates of the agents (e.g., select a subset of agents and average their estimates).
% \[
% \z^{t+1} = \z^k - \alpha \sum_{i \in I_k} \nabla l(\z; \mathcal{D}^i, p^i)
% \]

\item[Distributed learning] \marginnote{Distributed learning}
Federated learning where there is no centralized entity and agents communicate with their neighbors only.
\end{description}

\begin{figure}[H]
\centering
\begin{subfigure}{0.45\linewidth}
\centering
\includegraphics[width=0.55\linewidth]{./img/federated_learning.png}
\caption{Federated learning}
\end{subfigure}
\begin{subfigure}{0.45\linewidth}
\centering
\includegraphics[width=0.7\linewidth]{./img/distributed_learning.png}
\caption{Distributed learning}
\end{subfigure}
\end{figure}

\section{Distributed cost-coupled/consensus optimization}

\begin{description}
\item[Distributed cost-coupled optimization] \marginnote{Distributed cost-coupled optimization}
Optimization problem with $N$ agents that communicate according to a graph $G$ and aim to learn a common set of parameters $\z$ such that:
\[
\min_{\z \in Z} \sum_{i=1}^{N} l_i(\z)
\]
where:
\begin{itemize}
\item Each agent $i$ knows its loss $l_i$ (based on its available data) and the parameter space $Z$,
\item At each time step $k$, each agent $i$ estimates a set of parameters $\z_i^k$.
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_distributed_cost_coupled.pdf}
\end{figure}
\end{description}

\begin{remark}
Using the sum of all agents' gradients as the direction is not possible, as not every agent can communicate with every other agent.
\end{remark}
\subsection{Distributed gradient algorithm}
|
|
|
|
\begin{description}
|
|
\item[Distributed gradient algorithm] \marginnote{Distributed gradient algorithm}
|
|
Method that estimates a (more precise) set of parameters as a weighted sum those of its neighbors' (self-loop included):
|
|
\[
|
|
\vec{v}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k
|
|
\]
|
|
Then, the update step is performed using $\vec{v}_i^{k+1}$ and the agent's own local loss $l_i$:
|
|
\[
|
|
\begin{split}
|
|
\z_i^{k+1} &= \vec{v}_i^{k+1} - \alpha^k \nabla l_i(\vec{v}_i^{k+1}) \\
|
|
&= \left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right) - \alpha^k \nabla l_i\left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right)
|
|
\end{split}
|
|
\]
|
|
\end{description}
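
A minimal sketch on a ring of $N$ agents with doubly stochastic weights (NumPy; the graph, weights, and local quadratic losses are hypothetical illustrations):
\begin{verbatim}
# Distributed gradient algorithm: consensus step + local gradient step.
import numpy as np

N, d = 4, 2
targets = np.array([[1.0, 0.0], [0.0, 1.0], [2.0, 2.0], [1.0, 3.0]])

# Doubly stochastic weights for a ring (self-loop 1/2, each neighbor 1/4)
A = np.zeros((N, N))
for i in range(N):
    A[i, i] = 0.5
    A[i, (i - 1) % N] = 0.25
    A[i, (i + 1) % N] = 0.25

Z = np.zeros((N, d))              # row i holds agent i's estimate z_i^k
for k in range(20000):
    alpha = 1.0 / (k + 1)         # diminishing step size
    V = A @ Z                     # consensus: v_i = sum_j a_ij z_j
    Z = V - alpha * (V - targets) # gradient of l_i(v) = 0.5*||v - target_i||^2

# Rows approach mean(targets) = [1, 1.5] (slowly, due to the diminishing step)
print(Z)
\end{verbatim}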

\begin{theorem}[Distributed gradient algorithm convergence] \marginnote{Distributed gradient algorithm convergence}
Assume that:
\begin{itemize}
\item The matrix $\matr{A}$ associated with the undirected and connected communication graph $G$ is doubly stochastic with $a_{ij} > 0$ for each edge,
\item The step size is diminishing,
\item Each $l_i$ is convex, has gradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
\end{itemize}
Then, the sequence of local solutions $\{ \z_i^k \}_{k \in \mathbb{N}}$ of each agent $i$ produced using the distributed gradient algorithm converges to a common optimal solution $\z^*$:
\[ \lim_{k \rightarrow \infty} \Vert \z_i^k - \z^* \Vert = 0 \]
\end{theorem}

\begin{description}
\item[Distributed projected subgradient algorithm] \marginnote{Distributed projected subgradient algorithm}
Distributed gradient algorithm extended to the case where the $l_i$ are non-smooth convex functions and $\z$ is constrained to a closed convex set $Z \subseteq \mathbb{R}^d$. The distributed step is the following:
\[
\begin{split}
\vec{v}_i^{k+1} &= \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k \\
\z_i^{k+1} &= P_Z \big( \vec{v}_i^{k+1} - \alpha^k \tilde{\nabla} l_i(\vec{v}_i^{k+1}) \big)
\end{split}
\]
where $P_Z(\cdot)$ is the Euclidean projection onto $Z$ and $\tilde{\nabla} l_i$ is a subgradient of $l_i$.
\end{description}

\begin{theorem}[Distributed projected subgradient algorithm convergence] \marginnote{Distributed projected subgradient algorithm convergence}
Assume that:
\begin{itemize}
\item The adjacency matrix $\matr{A}$ associated with $G$ is doubly stochastic with $a_{ij} > 0$ for each edge,
\item The step size is diminishing,
\item Each $l_i$ is convex, has subgradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
\end{itemize}
Then, each agent converges to an optimal solution $\z^*$.
\end{theorem}

\subsection{Gradient tracking algorithm}

\begin{theorem}
The distributed gradient algorithm does not converge with a constant step size.

\begin{proof}[Proof idea]
% Assume that the starting guess in standard gradient descent is a local minimum:
% \[ \z^{0} = \z^* \]
% We have that:
% \[
% \begin{split}
% \z^* &= \z^* - \alpha \nabla l(\z^*) \\
% \z^* &= \z^* \\
% \end{split}
% \]
% $\z^*$ is an equilibrium.

We want to check whether the optimum $\z^*$ with a constant step size $\alpha$ is an equilibrium:
\[
\begin{aligned}
\z^* &= \sum_{j=1}^N a_{ij} \z^* - \alpha \nabla l_i \left( \sum_{j=1}^N a_{ij} \z^* \right) \\
&= \z^* - \alpha \nabla l_i (\z^*) &&& \parbox{0.20\linewidth}{\footnotesize $\matr{A}$ doubly stochastic and $\z^*$ constant} \\
\end{aligned}
\]
In general, $\nabla l_i(\z^*) \neq 0$ ($\z^*$ is the optimum for the whole problem, but $l_i$ depends on the subset of data available to the agent). Therefore, $\z^*$ is not an equilibrium.
\end{proof}
\end{theorem}

\begin{description}
\item[Dynamic average consensus] \marginnote{Dynamic average consensus}
Consensus algorithm where each agent measures a signal $r_i^k$ and wants to estimate the average signal of all agents:
\[ \bar{r}^k = \frac{1}{N} \sum_{i=1}^{N} r_i^k \]
The average signal estimated by an agent is represented by a state $s_i^k$ and we want that $\lim_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert = 0$. This can be achieved using a perturbed consensus algorithm:
\[
s_i^{k+1} =
\underbrace{\sum_{j \in \mathcal{N}_i} a_{ij} s_j^k}_{\text{Consensus}} +
\underbrace{\vphantom{\sum_{j \in \mathcal{N}_i}}(r_i^{k+1} - r_i^k)}_{\text{Innovation}}
\]
where:
\begin{itemize}
\item The consensus term converges to the average of the states.
\item The local innovation allows converging to the common signal.
\end{itemize}

\begin{theorem}[Dynamic average consensus convergence]
If the first-order differences are bounded (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \leq C_1$), then the tracking error is bounded by some $C_2 > 0$:
\[ \lim_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert \leq C_2 \]

Moreover, the error goes to zero if the signal asymptotically becomes constant (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \rightarrow 0$).
\end{theorem}
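
A minimal sketch of dynamic average consensus on a ring (NumPy; the weights, the signals, and the initialization $s_i^0 = r_i^0$ are hypothetical illustrations):
\begin{verbatim}
# Dynamic average consensus: consensus term + local innovation term.
import numpy as np

N = 4
A = np.zeros((N, N))               # doubly stochastic ring weights
for i in range(N):
    A[i, i] = 0.5
    A[i, (i - 1) % N] = 0.25
    A[i, (i + 1) % N] = 0.25

def signals(k):
    # Local signals that become constant over time (error then vanishes)
    return np.array([i + np.exp(-0.01 * k) for i in range(N)])

s = signals(0)                     # initialization s_i^0 = r_i^0
for k in range(2000):
    s = A @ s + (signals(k + 1) - signals(k))  # consensus + innovation

print(s, signals(2000).mean())     # each s_i approx. the average signal
\end{verbatim}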

\item[Gradient tracking algorithm] \marginnote{Gradient tracking algorithm}
Method that chooses the local descent direction so as to asymptotically track the gradient of the whole cost:
\[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N \nabla l_h(\z_h^k) \]

By using dynamic average consensus, we consider the local gradient as the signal:
\[ \vec{r}_i^k = \nabla l_i(\z_i^k) \]
Then, the estimate of the average signal (i.e., gradient) is given by:
\[
\vec{s}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \vec{s}_j^k + \left( \nabla l_i(\z_i^{k+1}) - \nabla l_i(\z_i^k) \right)
\]
The update step is then performed as (a sketch follows this description):
\[ \z_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k - \alpha \vec{s}_i^k \]

% Each agent accesses some $\nabla l_i(\z)$. It can be seen as a signal $r_i^k = \nabla l_i(\z_i^k)$ only available at agent $i$.

% We want that the direction:
% \[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N r_h^k \]

% Ideally, we want:
% \[
% \z_i^{k+1} = \sum_{j=1}^N a_{ij} \z_j^k - (N\alpha) \frac{1}{N} \sum_{h=1}^N \nabla l_h(\z_h^k)
% \]
\end{description}
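
A minimal sketch of gradient tracking on a ring with a constant step size (NumPy; the graph weights, local losses, and tracker initialization $\vec{s}_i^0 = \nabla l_i(\z_i^0)$ are hypothetical illustrations):
\begin{verbatim}
# Gradient tracking: unlike the distributed gradient algorithm, it converges
# with a constant step size.
import numpy as np

N, d = 4, 2
targets = np.array([[1.0, 0.0], [0.0, 1.0], [2.0, 2.0], [1.0, 3.0]])

A = np.zeros((N, N))               # doubly stochastic ring weights
for i in range(N):
    A[i, i] = 0.5
    A[i, (i - 1) % N] = 0.25
    A[i, (i + 1) % N] = 0.25

def grads(Z):
    return Z - targets             # row i: gradient of l_i at z_i

alpha = 0.1                        # constant step size
Z = np.zeros((N, d))
S = grads(Z)                       # tracker init: s_i^0 = grad l_i(z_i^0)
for k in range(500):
    Z_new = A @ Z - alpha * S
    S = A @ S + grads(Z_new) - grads(Z)  # dynamic average consensus on gradients
    Z = Z_new

print(Z)                           # all rows approx. the optimum [1, 1.5]
\end{verbatim}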