Add DAS gradient method

2025-03-20 19:27:08 +01:00
parent 50cdf6ba54
commit 440868cec5
12 changed files with 125693 additions and 0 deletions


@@ -39,6 +39,7 @@
\def\stf{{\texttt{stf}}}
\def\lap{{\matr{L}}}
\def\x{{\vec{x}}}
\def\z{{\vec{z}}}
\begin{document}
@@ -47,5 +48,6 @@
\include{./sections/_graphs.tex}
\include{./sections/_averaging_systems.tex}
\include{./sections/_leader_follower.tex}
\include{./sections/_optimization.tex}
\end{document}

File diff suppressed because one or more lines are too long (10 files)


@@ -0,0 +1,459 @@
\chapter{Optimization}
\section{Definitions}
\subsection{Unconstrained optimization}
\begin{description}
\item[Unconstrained optimization] \marginnote{Unconstrained optimization}
Problem of the form:
\[ \min_{\z \in \mathbb{R}^d} l(\z) \]
where $l: \mathbb{R}^d \rightarrow \mathbb{R}$ is the cost function and $\z$ the decision variables.
\end{description}
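For example, linear least squares is a problem of this form:
\[
\min_{\z \in \mathbb{R}^d} \frac{1}{2} \Vert \matr{A}\z - \vec{b} \Vert^2
\qquad
\nabla l(\z) = \matr{A}^T (\matr{A}\z - \vec{b})
\]
with $\matr{A} \in \mathbb{R}^{n \times d}$ and $\vec{b} \in \mathbb{R}^n$.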
\begin{theorem}[First-order necessary condition of optimality] \marginnote{First-order necessary condition of optimality}
Given a point $\z^*$ and a cost function $l: \mathbb{R}^d \rightarrow \mathbb{R}$ such that $l \in C^1$ in $B(\z^*, \varepsilon)$ (i.e., the neighborhood of $\z^*$ of radius $\varepsilon$), it holds that:
\[
\z^* \text{ is local minimum } \Rightarrow \nabla l(\z^*) = 0
\]
\end{theorem}
\begin{theorem}[Second-order necessary condition of optimality] \marginnote{Second-order necessary condition of optimality}
Given a point $\z^*$ and a cost function $l: \mathbb{R}^d \rightarrow \mathbb{R}$ such that $l \in C^2$ in $B(\z^*, \varepsilon)$, it holds that:
\[
\z^* \text{ is local minimum } \Rightarrow \nabla^2 l(\z^*) \geq 0 \text{ (i.e., positive semidefinite)}
\]
\end{theorem}
\subsection{Convexity}
\begin{description}
\item[Convex set] \marginnote{Convex set}
A set $Z \subseteq \mathbb{R}^d$ is convex if it holds that:
\[
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]: (\alpha \z_A + (1-\alpha)\z_B) \in Z
\]
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{img/_convex_set.pdf}
\end{figure}
\item[Convex function] \marginnote{Convex function}
Given a convex set $Z \subseteq \mathbb{R}^d$, a function $l: Z \rightarrow \mathbb{R}$ is convex if it holds that:
\[
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]: l(\alpha \z_A + (1-\alpha) \z_B) \leq \alpha l(\z_A) + (1-\alpha) l(\z_B)
\]
\begin{figure}[H]
\centering
\includegraphics[width=0.25\linewidth]{img/_convex_function.pdf}
\end{figure}
\begin{remark}
Given a differentiable and convex function $l: Z \rightarrow \mathbb{R}$, it holds that the function lies above all of its tangent hyperplanes:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \geq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) \]
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{img/_convex_tangent.pdf}
\end{figure}
\end{remark}
\item[Strongly convex function] \marginnote{Strongly convex function}
Given a convex set $Z \subseteq \mathbb{R}^d$, a function $l: Z \rightarrow \mathbb{R}$ is strongly convex with parameter $\mu > 0$ if it holds that:
\[
\begin{split}
\forall \z_A, \z_B \in Z, \forall \alpha \in [0, 1]: l(\alpha \z_A + (1-\alpha) \z_B) \leq\ & \alpha l(\z_A) + (1-\alpha) l(\z_B) \\
& - \frac{\mu}{2} \alpha (1-\alpha) \Vert \z_A-\z_B \Vert^2
\end{split}
\]
Intuitively, it is strictly convex and grows at least as fast as a quadratic function.
\begin{remark}
Given a differentiable and $\mu$-strongly convex function $l: Z \rightarrow \mathbb{R}$, it holds that the function lies above the paraboloid of curvature $\mu$ tangent to it at any point:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \geq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) + \frac{\mu}{2} \Vert \z_B - \z_A \Vert^2 \]
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{img/_strongly_convex.pdf}
\end{figure}
A geometric interpretation is that strong convexity imposes a quadratic lower bound on the function.
\end{remark}
\end{description}
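\begin{remark}
For example, the quadratic $l(\z) = \frac{1}{2} \z^T \matr{Q} \z$ with $\matr{Q}$ symmetric positive definite is $\mu$-strongly convex with $\mu = \lambda_{\min}(\matr{Q})$, since $\nabla^2 l(\z) = \matr{Q} \geq \lambda_{\min}(\matr{Q}) \matr{I}$.
\end{remark}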
\begin{lemma}[Convexity and gradient monotonicity] \marginnote{Convexity and gradient monotonicity}
Given a differentiable and convex function $l$, its gradient $\nabla l$ is a monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) \geq 0
\]
\end{lemma}
\begin{lemma}[Strict convexity and gradient monotonicity] \marginnote{Strict convexity and gradient monotonicity}
Given a differentiable and strictly convex function $l$, its gradient $\nabla l$ is a strictly monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) > 0
\]
\end{lemma}
\begin{lemma}[Strong convexity and gradient monotonicity] \marginnote{Strong convexity and gradient monotonicity}
Given a differentiable and $\mu$-strongly convex function $l$, its gradient $\nabla l$ is a strongly monotone operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \big( \nabla l(\z_A) - \nabla l(\z_B) \big)^T (\z_A - \z_B) \geq \mu \Vert \z_A - \z_B \Vert^2
\]
\end{lemma}
\begin{description}
\item[Lipschitz continuity] \marginnote{Lipschitz continuity}
Given a function $l$, it is Lipschitz continuous with parameter $L > 0$ if:
\[
\forall \z_A, \z_B: \Vert l(\z_A) - l(\z_B) \Vert \leq L \Vert \z_A - \z_B \Vert
\]
\begin{remark}
Given a differentiable function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, it holds that the function lies below the paraboloid of curvature $L$ tangent to it at any point:
\[ \forall \z_A, \z_B \in Z: l(\z_B) \leq l(\z_A) + \nabla l(\z_A)^T (\z_B - \z_A) + \frac{L}{2} \Vert \z_B - \z_A \Vert^2 \]
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{img/_lipschitz_gradient.pdf}
\end{figure}
A geometric interpretation is that Lipschitz continuity of the gradient imposes a quadratic upper bound on the function.
\end{remark}
\end{description}
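\begin{remark}
For the same quadratic $l(\z) = \frac{1}{2} \z^T \matr{Q} \z$, the gradient $\nabla l(\z) = \matr{Q} \z$ is Lipschitz continuous with $L = \lambda_{\max}(\matr{Q})$, since $\Vert \matr{Q} \z_A - \matr{Q} \z_B \Vert \leq \lambda_{\max}(\matr{Q}) \Vert \z_A - \z_B \Vert$.
\end{remark}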
\begin{lemma}[Convexity and Lipschitz continuity of gradient] \marginnote{Convexity and Lipschitz continuity of gradient}
Given a differentiable convex function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, its gradient is a co-coercive operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \Big( \nabla l(\z_A) - \nabla l(\z_B) \Big)^T (\z_A - \z_B) \geq \frac{1}{L} \Vert \nabla l(\z_A) - \nabla l(\z_B) \Vert^2
\]
\end{lemma}
\begin{lemma}[Strong convexity and Lipschitz continuity of gradient] \marginnote{Strong convexity and Lipschitz continuity of gradient} \phantomsection\label{th:strong_convex_lipschitz_gradient}
Given a differentiable $\mu$-strongly convex function $l$ with $L$-Lipschitz continuous gradient $\nabla l$, its gradient is a strongly co-coercive operator, which means that it satisfies:
\[
\forall \z_A, \z_B: \Big( \nabla l(\z_A) - \nabla l(\z_B) \Big)^T (\z_A - \z_B) \geq \underbrace{\frac{\mu L}{\mu+L}}_{\gamma_1} \Vert \z_A - \z_B \Vert^2 + \underbrace{\frac{1}{\mu+L}}_{\gamma_2} \Vert \nabla l(\z_A) - \nabla l(\z_B) \Vert^2
\]
\end{lemma}
\section{Iterative descent methods}
\begin{theorem}
Given a convex function $l$, it holds that a local minimum of $l$ is also global.
Moreover, in the unconstrained optimization case, the first-order necessary condition of optimality is sufficient for a global minimum.
\end{theorem}
\begin{theorem}
Given a convex function $l$, it holds that $\z^*$ is a global minimum if and only if $\nabla l(\z^*) = 0$.
\end{theorem}
\begin{description}
\item[Iterative descent] \marginnote{Iterative descent}
Given a function $l$ and an initial guess $\z^{0}$, an iterative descent algorithm iteratively moves to new points $\z^{k}$ such that:
\[
\forall k \in \mathbb{N}: l(\z^{k+1}) < l(\z^{k})
\]
\end{description}
\subsection{Gradient method}
\begin{description}
\item[Gradient method] \marginnote{Gradient method}
Algorithm that, given the function $l$ to minimize and the initial guess $\z^0$, computes the update as:
\[ \z^{k+1} = \z^k - \alpha^k \nabla l(\z^k) \]
where $\alpha^k > 0$ is the step size and $- \nabla l(\z^k)$ is the step direction (a numerical sketch is given after this list).
\begin{theorem}
For a sufficiently small $\alpha^k > 0$, the gradient method is an iterative descent algorithm:
\[ l(\z^{k+1}) < l(\z^{k}) \]
\begin{proof}
Consider the first-order Taylor approximation of $l(\z^{k+1})$ about $\z^k$:
\[
\begin{split}
l(\z^{k+1}) &= l(\z^k) + \nabla l(\z^k)^T (\z^{k+1} - \z^k) + o(\Vert \z^{k+1} - \z^k \Vert) \\
&= l(\z^k) - \alpha^k \Vert \nabla l(\z^k)\Vert^2 + o(\alpha^k)
\end{split}
\]
Therefore, if $\nabla l(\z^k) \neq 0$, the negative first-order term dominates the remainder for sufficiently small $\alpha^k > 0$, and $l(\z^{k+1}) < l(\z^{k})$.
\end{proof}
\end{theorem}
\begin{remark}[Step size choice] \marginnote{Step size choice}
Possible choices for the step size are:
\begin{descriptionlist}
\item[Constant]
$\forall k \in \mathbb{N}: \alpha^k = \alpha > 0$.
\item[Diminishing]
$\alpha^k \overset{k \rightarrow \infty}{\longrightarrow} 0$. To avoid decreasing the step too much, a typical choice (e.g., $\alpha^k = \frac{1}{k+1}$) is an $\alpha^k$ such that:
\[
\sum_{k=0}^{\infty} \alpha^k = \infty
\qquad
\sum_{k=0}^{\infty} (\alpha^k)^2 < \infty
\]
\item[Line search]
Algorithmic methods such as the Armijo rule.
\end{descriptionlist}
\end{remark}
\item[Generalized gradient method] \marginnote{Generalized gradient method}
Gradient method where the update rule is generalized as:
\[ \z^{k+1} = \z^k - \alpha^k \matr{D}^k \nabla l(\z^k) \]
where $\matr{D}^k \in \mathbb{R}^{d \times d}$ is uniformly positive definite (i.e., $\delta_1 \matr{I} \leq \matr{D}^k \leq \delta_2 \matr{I}$ for some $\delta_2 \geq \delta_1 > 0$).
Possible choices for $\matr{D}^k$ are:
\begin{itemize}
\item Steepest descent: $\matr{D}^k = \matr{I}$.
\item Newton's method: $\matr{D}^k = (\nabla^2 l(\z^k))^{-1}$.
\item Quasi-Newton method: $\matr{D}^k = (H(\z^k))^{-1}$, where $H(\z^k) \approx \nabla^2 l(\z^k)$.
\end{itemize}
\end{description}
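As a minimal numerical sketch of the gradient method (in Python with NumPy; the cost function, step sizes, and iteration count below are illustrative choices, not taken from these notes):
\begin{verbatim}
import numpy as np

def gradient_method(grad, z0, alpha=0.1, iters=500, diminishing=False):
    # Iterate z^{k+1} = z^k - alpha^k * grad(z^k).
    z = np.asarray(z0, dtype=float)
    for k in range(iters):
        # Diminishing rule alpha^k = alpha / (k+1): its sum diverges
        # while the sum of its squares converges.
        step = alpha / (k + 1) if diminishing else alpha
        z = z - step * grad(z)
    return z

# Illustrative smooth convex cost: l(z) = 0.5*||z||^2 + log(1 + exp(z[0]))
grad = lambda z: z + np.array([1.0 / (1.0 + np.exp(-z[0])), 0.0])
print(gradient_method(grad, [5.0, -3.0]))  # approaches the unique minimizer
\end{verbatim}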
\begin{description}
\item[Gradient method as discrete-time integrator with feedback] \marginnote{Gradient method as discrete-time integrator with feedback}
The gradient method can be interpreted as a discrete-time integrator with a feedback loop. This means that it is composed of:
\begin{descriptionlist}
\item[Integrator] A linear system that defines the update: $\z^{k+1} = \z^k - \alpha \vec{u}^k$.
\item[Plant] A non-linear (bounded) function whose output is re-injected into the integrator. In this case, it is the gradient: $\vec{u}^k = \nabla l(\z^k)$.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_gradient_method_integrator.pdf}
\end{figure}
\end{description}
\begin{theorem}[Gradient method convergence] \marginnote{Gradient method convergence}
Consider a function $l$ such that:
\begin{itemize}
\item $\nabla l$ is $L$-Lipschitz continuous,
\item The step size is constant or diminishing.
\end{itemize}
Let $\{ \z^k \}_{k \in \mathbb{N}}$ be the (bounded) sequence generated by the gradient method. It holds that every limit point $\bar{\z}$ of the sequence $\{ \z^k \}_{k \in \mathbb{N}}$ is a stationary point (i.e., $\nabla l(\bar{\z}) = 0$).
In addition, if $l$ is $\mu$-strongly convex and the step size is constant, then the convergence rate of the sequence $\{ \z^k \}_{k \in \mathbb{N}}$ is exponential (also said geometric or linear):
\[
\Vert \z^k - \z^* \Vert \leq M \rho^k
\]
where $\rho \in (0, 1)$ and $M > 0$ depends on $\mu$, $L$, and $\Vert \z^0 - \z^* \Vert$.
\begin{proof}
We need to prove the two parts of the theorem:
\begin{enumerate}
\item
We want to prove that any limit point of the sequence generated by the gradient method is a stationary point.
In other words, by considering the gradient method as an integrator with feedback, we want to analyze the equilibria of the system. Assume that the system converges to some equilibrium $\z_E$. At an equilibrium, the state is no longer updated, so the feedback input must vanish (i.e., $\vec{u}^k = 0$), which gives:
\[
\z_E = \z_E - \alpha \nabla l(\z_E)
\]
Therefore, an equilibrium point is necessarily a stationary point of $l$ as it must be that $\nabla l(\z_E) = 0$.
\item
We want to prove that if $l$ is $\mu$-strongly convex and the step size is constant, the sequence converges exponentially.
\begin{remark}
As $l$ is strongly convex, the equilibrium is also the unique global minimum $\z^*$.
\end{remark}
Consider the following change in coordinates (i.e., a translation):
\[
\begin{gathered}
\z^k \mapsto \tilde{\z}^k \\
\text{with } \tilde{\z}^k = \z^k - \z_E = \z^k - \z^*
\end{gathered}
\]
The system in the new coordinates becomes:
\[
\begin{aligned}
&\tilde{\z}^{k+1} = \tilde{\z}^k - \alpha \vec{u}^k \\
&\begin{aligned}
\vec{u}^k &= \nabla l(\z^k) \\
&= \nabla l(\tilde{\z}^k + \z^*) \\
&= \nabla l(\tilde{\z}^k + \z^*) - \nabla l(\z^*) & & & \text{\small $\nabla l(\z^*)=0$, but useful for \Cref{th:strong_convex_lipschitz_gradient}}
\end{aligned}
\end{aligned}
\]
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_gradient_method_integrator_new_coords.pdf}
\end{figure}
\begin{remark}
As $l$ is strongly convex and its gradient Lipschitz continuous, by \Cref{th:strong_convex_lipschitz_gradient} it holds that:
\[
-(\vec{u}^k)^T \tilde{\z}^k \leq - \gamma_1 \Vert \tilde{\z}^k \Vert^2 - \gamma_2 \Vert \vec{u}^k \Vert^2
\]
\end{remark}
Consider a Lyapunov function $V: \mathbb{R}^d \rightarrow \mathbb{R}_{\geq 0}$ defined as:
\[
V(\tilde{\z}) = \Vert \tilde{\z} \Vert^2
\]
It holds that:
\[
\begin{aligned}
V(\tilde{\z}^{k+1}) - V(\tilde{\z}^k) &= \Vert \tilde{\z}^{k+1} \Vert^2 - \Vert \tilde{\z}^k \Vert^2 \\
&= \cancel{\Vert \tilde{\z}^k \Vert^2} - 2\alpha(\vec{u}^k)^T\tilde{\z}^k + \alpha^2 \Vert \vec{u}^k \Vert^2 - \cancel{\Vert \tilde{\z}^k \Vert^2} \\
&\leq -2\alpha\gamma_1 \Vert\tilde{\z}^k\Vert^2 + \alpha(\alpha-2\gamma_2) \Vert\vec{u}^k\Vert^2 &&& \text{\Cref{th:strong_convex_lipschitz_gradient}}
\end{aligned}
\]
By choosing $\alpha \leq 2\gamma_2$, the last term is non-positive and we have that:
\[
\begin{split}
V(\tilde{\z}^{k+1}) - V(\tilde{\z}^k) &\leq -2\alpha\gamma_1 \Vert \tilde{\z}^k \Vert^2 \\
\iff \Vert \tilde{\z}^{k+1} \Vert^2 - \Vert \tilde{\z}^k \Vert^2 &\leq -2\alpha\gamma_1 \Vert \tilde{\z}^k \Vert^2 \\
\iff \Vert \tilde{\z}^{k+1} \Vert^2 &\leq (1-2\alpha\gamma_1) \Vert \tilde{\z}^k \Vert^2 \\
\end{split}
\]
Note that $\alpha \leq 2\gamma_2$ also guarantees $2\alpha\gamma_1 \leq \frac{4\mu L}{(\mu+L)^2} \leq 1$, so $(1-2\alpha\gamma_1) \in [0, 1)$. Finally, iterating this bound, it holds that:
\[
\begin{split}
\Vert \tilde{\z}^{k+1} \Vert^2 &\leq (1-2\alpha\gamma_1) \Vert \tilde{\z}^k \Vert^2 \\
&\leq \dots \\
&\leq (1-2\alpha\gamma_1)^{k+1} \Vert \tilde{\z}^0 \Vert^2 \\
\end{split}
\]
Therefore, the sequence $\{ \tilde{\z}^k \}_{k \in \mathbb{N}}$ goes to zero exponentially fast and, taking square roots, we have shown that:
\[
\Vert \z^{k} - \z^* \Vert \leq \Big( \sqrt{1-2\alpha\gamma_1} \Big)^{k} \underbrace{\Vert \z^0 - \z^* \Vert}_{M} = M \rho^k
\]
with $\rho = \sqrt{1-2\alpha\gamma_1}$.
\end{enumerate}
\end{proof}
\end{theorem}
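As a worked instance of the rate (with $\mu$ and $L$ chosen here purely for illustration): for $\mu = 1$ and $L = 3$, \Cref{th:strong_convex_lipschitz_gradient} gives $\gamma_1 = \frac{\mu L}{\mu + L} = \frac{3}{4}$ and $\gamma_2 = \frac{1}{\mu + L} = \frac{1}{4}$, so any step size $\alpha \leq 2\gamma_2 = \frac{1}{2}$ is admissible. Choosing $\alpha = \frac{1}{2}$ yields:
\[
\Vert \z^k - \z^* \Vert^2 \leq \left( 1 - 2 \cdot \tfrac{1}{2} \cdot \tfrac{3}{4} \right)^k \Vert \z^0 - \z^* \Vert^2 = \left( \tfrac{1}{4} \right)^k \Vert \z^0 - \z^* \Vert^2
\]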
\begin{remark}[Gradient method for a quadratic function] \marginnote{Gradient method for a quadratic function}
Given the problem of minimizing a quadratic function, with $\matr{Q}$ symmetric positive definite:
\[
\min_{\z} \frac{1}{2}\z^T \matr{Q} \z + \vec{r}^T \z
\qquad
\nabla l(\z) = \matr{Q} \z + \vec{r}
\]
The gradient method can be reduced to an affine linear system:
\[
\begin{split}
\z^{k+1} &= \z^k - \alpha (\matr{Q} \z^k + \vec{r}) \\
&= (\matr{I} - \alpha \matr{Q}) \z^k - \alpha \vec{r}
\end{split}
\]
For a sufficiently small $\alpha$ (namely, $\alpha < \frac{2}{\lambda_{\max}(\matr{Q})}$), the matrix $(\matr{I} - \alpha \matr{Q})$ is Schur (i.e., all of its eigenvalues $\rho_i$ satisfy $|\rho_i| < 1$, so that the matrix geometric series converges: $\sum_{i=0}^{\infty} (\matr{I} - \alpha \matr{Q})^i = (\alpha \matr{Q})^{-1}$). Therefore, the solution can be computed in closed form as:
\[
\begin{split}
\z^k &= (\matr{I} - \alpha \matr{Q})^k \z^0 - \alpha \sum_{\tau=0}^{k-1} (\matr{I} - \alpha \matr{Q})^\tau \vec{r} \\
&\overset{k \rightarrow \infty}{\longrightarrow} - \alpha \left( \sum_{\tau=0}^{\infty} (\matr{I} - \alpha \matr{Q})^\tau \right) \vec{r} = -\alpha (\alpha \matr{Q})^{-1} \vec{r} = -\matr{Q}^{-1} \vec{r}
\end{split}
\]
\end{remark}
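The closed-form limit can be checked numerically (a sketch in Python with NumPy; the specific $\matr{Q}$, $\vec{r}$, and $\alpha$ below are illustrative choices):
\begin{verbatim}
import numpy as np

Q = np.array([[3.0, 1.0],
              [1.0, 2.0]])           # symmetric positive definite
r = np.array([1.0, -1.0])
alpha = 0.2                          # < 2 / lambda_max(Q), so I - alpha*Q is Schur

A = np.eye(2) - alpha * Q
print(np.abs(np.linalg.eigvals(A)))  # all eigenvalue moduli < 1

z = np.array([10.0, -10.0])
for _ in range(300):
    z = A @ z - alpha * r            # the affine linear system above
print(z, -np.linalg.solve(Q, r))     # iterate matches -Q^{-1} r
\end{verbatim}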
\begin{remark}[Gradient flow] \marginnote{Gradient flow}
By replacing the discrete-time integrator in the feedback interpretation of the gradient method with a continuous-time integrator, the result is the gradient flow:
\[
\dot{\z}(t) = -\nabla l(\z(t))
\]
which admits a (unique) solution if the vector field $-\nabla l$ is Lipschitz continuous.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_gradient_flow.pdf}
\end{figure}
\end{remark}
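\begin{remark}
Conversely, a forward-Euler discretization of the gradient flow with step $\Delta > 0$, i.e., $\z^{k+1} = \z^k - \Delta \nabla l(\z^k)$, recovers the gradient method with constant step size $\alpha = \Delta$.
\end{remark}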
\subsection{Accelerated gradient methods}
\begin{description}
\item[Heavy-ball method] \marginnote{Heavy-ball method}
Given $\eta^0$ and $\eta^{-1}$, the algorithm is defined as:
\[
\eta^{k+1} = \eta^k + \alpha_1 (\eta^k - \eta^{k-1}) - \alpha_2 \nabla l(\eta^k)
\]
with $\alpha_1, \alpha_2 > 0$ (a numerical sketch is given after this list).
\begin{remark}
With $\alpha_1 = 0$, the algorithm is reduced to the gradient method with step size $\alpha_2$.
\end{remark}
\begin{remark}
The algorithm admits a state-space representation as a discrete-time integrator with a feedback loop:
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_heavy_ball.pdf}
\end{figure}
Note that each row of the matrix $\begin{bmatrix} 1+\alpha_1 & -\alpha_1 \\ 1 & 0 \end{bmatrix}$ sums to one, as in a row-stochastic matrix (although the entry $-\alpha_1$ is negative).
\end{remark}
\item[Generalized heavy-ball method] \marginnote{Generalized heavy-ball method}
Given $\zeta^0$ and $\zeta^{-1}$, the algorithm is defined as:
\[
\zeta^{k+1} = \zeta^k + \alpha_1 (\zeta^k - \zeta^{k-1}) - \alpha_2 \nabla l(\zeta^k + \alpha_3(\zeta^k - \zeta^{k-1}))
\]
with $\alpha_1, \alpha_2, \alpha_3 > 0$.
\begin{remark}
The algorithm admits a state-space representation as a discrete-time integrator with a feedback loop:
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/_generalized_heavy_ball.pdf}
\end{figure}
\end{remark}
\end{description}
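A minimal sketch of the heavy-ball iteration (in Python with NumPy; the quadratic cost and the values of $\alpha_1$, $\alpha_2$ are illustrative, not prescribed by these notes):
\begin{verbatim}
import numpy as np

def heavy_ball(grad, z0, a1=0.5, a2=0.1, iters=500):
    # eta^{k+1} = eta^k + a1*(eta^k - eta^{k-1}) - a2*grad(eta^k)
    prev = curr = np.asarray(z0, dtype=float)   # take eta^{-1} = eta^0
    for _ in range(iters):
        nxt = curr + a1 * (curr - prev) - a2 * grad(curr)
        prev, curr = curr, nxt
    return curr

Q = np.diag([10.0, 1.0])             # ill-conditioned quadratic l(z) = 0.5 z^T Q z
grad = lambda z: Q @ z
print(heavy_ball(grad, [1.0, 1.0]))  # approaches the minimizer at the origin
\end{verbatim}
The momentum term $\alpha_1 (\eta^k - \eta^{k-1})$ is what allows faster progress than the plain gradient method along ill-conditioned directions.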
\section{Parallel optimization}
\begin{description}
\item[Cost-coupled optimization] \marginnote{Cost-coupled optimization}
Problem of minimizing the sum of $N$ cost functions $l_i: \mathbb{R}^d \rightarrow \mathbb{R}$, each local and private to an agent:
\[
\min_{\z \in \mathbb{R}^{d}} \sum_{i=1}^{N} l_i(\z)
\]
\item[Batch gradient method] \marginnote{Batch gradient method}
Compute the update direction by summing the gradients of all the local losses:
\[
\z^{k+1} = \z^k - \alpha \sum_{i=1}^{N} \nabla l_i(\z^k)
\]
\begin{remark}
Computing the full sum of gradients at every iteration can be expensive when $N$ is large.
\end{remark}
\item[Incremental gradient method] \marginnote{Incremental gradient method}
At each iteration $k$, compute the direction by considering the loss of a single agent $i^k$ (a numerical sketch is given after this list):
\[
\z^{k+1} = \z^k - \alpha \nabla l_{i^k}(\z^k)
\]
\begin{remark}
Two possible rules to select the agent at each iteration are:
\begin{descriptionlist}
\item[Cyclic]
$i^k = 1, 2, \dots, N, 1, 2, \dots, N, \dots$
\item[Randomized]
Draw $i^k$ uniformly at random from $\{1, \dots, N\}$.
\end{descriptionlist}
\end{remark}
% \begin{remark}
% The step size should decrease to reach convergence.
% \end{remark}
\end{description}
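A minimal sketch contrasting the batch and incremental updates (in Python with NumPy; the quadratic local costs, step size, and selection rule are illustrative choices):
\begin{verbatim}
import numpy as np

# Each agent i privately holds l_i(z) = 0.5 * ||z - c_i||^2, so the
# minimizer of sum_i l_i(z) is the mean of the targets c_i.
rng = np.random.default_rng(0)
C = rng.normal(size=(5, 2))          # N = 5 agents, d = 2
grad_i = lambda i, z: z - C[i]

alpha = 0.05
z = np.zeros(2)
for k in range(2000):
    # Batch step (uses all agents, more expensive):
    #   z = z - alpha * sum(grad_i(i, z) for i in range(5))
    i = k % 5                        # cyclic selection rule i^k
    z = z - alpha * grad_i(i, z)     # incremental gradient step
print(z, C.mean(axis=0))             # close to the global minimizer
\end{verbatim}
With a constant step size, the incremental iterates only settle in a neighborhood of the minimizer; a diminishing step size is typically required for exact convergence.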