Add DAS distributed optimization

2025-04-03 13:13:43 +02:00
parent d470716486
commit d5496f3ef7
5 changed files with 264 additions and 8 deletions

Binary image file added (103 KiB); preview not shown.

Binary image file added (76 KiB); preview not shown.


@@ -269,7 +269,7 @@
\dot{\x}_l(t) &= \vec{v}_0 \quad & \x_l(0) &= \x_l^{(0)} \\
\end{aligned}
\]
where $\vec{u}_f(t)$ is a distributed control action (can be seen as a correction) that processes the containment error $\vec{e}(t)$. It is composed of a proportional controller (i.e., value proportional to the error) and an integral controller (i.e., value proportional to the integral of the error):
\[
\begin{split}
\vec{u}_f(t) &= \matr{K}_P \vec{e}(t) + \matr{K}_I \int_{0}^{t} \vec{e}(\tau) \,d\tau


@@ -418,7 +418,7 @@
\section{Cost-coupled optimization}
\begin{description}
@@ -427,9 +427,14 @@
\[
\min_{\z \in \mathbb{R}^{d}} \sum_{i=1}^{N} l_i(\z)
\]
\end{description}
\subsection{Optimization methods}
\begin{description}
\item[Batch gradient method] \marginnote{Batch gradient method}
Compute the direction for the gradient method by considering all the losses (a sketch of this update and of the incremental one follows this list):
\[
\z^{k+1} = \z^k - \alpha \sum_{i=1}^{N} \nabla l_i(\z^k)
\]
@@ -448,12 +453,263 @@
Two possible rules to select the agent at each iteration are:
\begin{descriptionlist}
\item[Cyclic]
$i^k = 1, 2, \dots, N, 1, 2, \dots, N, \dots$, or cyclic in any order (essentially cyclic).
\item[Randomized]
Draw $i^k$ from a uniform distribution.
\end{descriptionlist}
\end{remark}
\begin{remark}
A single gradient is not necessarily a descent direction.
\end{remark}
\begin{theorem}
If the step size is diminishing, the incremental gradient method converges.
\end{theorem}
\end{description}
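A minimal NumPy sketch of the batch and incremental updates above on a toy least-squares problem (the data, the local losses, and the step sizes are arbitrary illustrative choices):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 5, 3                         # number of local losses and dimension
A = [rng.standard_normal((4, d)) for _ in range(N)]
b = [rng.standard_normal(4) for _ in range(N)]

def grad_i(z, i):
    # Gradient of the i-th local loss l_i(z) = 0.5 * ||A_i z - b_i||^2
    return A[i].T @ (A[i] @ z - b[i])

# Batch gradient method: use the sum of all local gradients at each iteration.
z = np.zeros(d)
alpha = 0.01
for k in range(500):
    z = z - alpha * sum(grad_i(z, i) for i in range(N))

# Incremental gradient method: one local gradient per iteration,
# cyclic selection and diminishing step size.
z_inc = np.zeros(d)
for k in range(2500):
    i_k = k % N                     # a randomized rule would draw i_k uniformly
    z_inc = z_inc - (0.1 / (k + 1)) * grad_i(z_inc, i_k)
\end{verbatim}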
\begin{description}
\item[Stochastic gradient descent (SGD)] \marginnote{Stochastic gradient descent (SGD)}
Instance of the incremental gradient method where the selection rule follows an unknown distribution.
The problem can be formulated as:
\[ \min_{\z \in \mathbb{R}^d} \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \]
where $\mathcal{W}$ is a random variable with possibly an unknown distribution.
It is assumed that, given any realization $\bar{w}$ of $\mathcal{W}$ (e.g., the index of an agent or a single data point), it is possible to obtain the gradient $\nabla l(\bar{\z}, \bar{w})$ at any query point $\bar{\z}$. The optimization step at each iteration is then:
\[ \z^{k+1} = \z^k - \alpha \nabla l(\z^k, w^k) \]
\begin{remark}
Monte Carlo approximation can be used to represent the expected value with a finite sequence of realizations:
\[ \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \approx \frac{1}{K} \sum_{k=1}^{K} l(\z, w^k) \]
\end{remark}
\begin{theorem}[SGD convergence with constant step size] \marginnote{SGD convergence with constant step size}
Given a function $l$ such that:
\begin{itemize}
\item $l$ is $\mu$-strongly convex with $L$-Lipschitz continuous gradient (i.e., the gradient cannot change arbitrarily fast),
\item $\nabla l(\z, \mathcal{W})$ is an unbiased estimate of $\nabla_\z \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})]$,
\item $\Vert \nabla l(\z, \mathcal{W}) \Vert \leq M_\nabla$ almost surely (i.e., with probability $1$) for some $M_\nabla > 0$.
\end{itemize}
With a constant step size $\alpha \leq \frac{1}{2\mu}$, it holds that at any time step $k$:
\[
\Vert \z^k - \z^* \Vert \leq
\underbrace{(1-2\mu\alpha)^k \left( \Vert \z^0 - \z^* \Vert - \frac{\alpha M_\nabla^2}{2\mu} \right)}_{\text{Error term}} +
\underbrace{\frac{\alpha M_\nabla^2}{2\mu}}_{\text{Residual term}} \]
where the error diminishes over time and the residual term is constant.
\end{theorem}
\begin{theorem}[SGD convergence with diminishing step size] \marginnote{SGD convergence with diminishing step size}
With a diminishing step size, both the error and the residual converge to $0$.
\end{theorem}
\item[Mini-batch SGD] \marginnote{Mini-batch SGD}
SGD where the update at each time step $k$ is based on a set $\mathcal{I}^k \subset \{ 1, \dots, N \}$ of realizations of $\mathcal{W}$:
\[ \z^{k+1} = \z^k - \alpha \sum_{i \in \mathcal{I}^k} \nabla l(\z^k, w^i) \]
\item[Adaptive moment estimation (Adam)] \marginnote{Adaptive moment estimation (Adam)}
Method based on estimates of the first and second moments of the gradient:
\[
\begin{split}
\vec{m}^{k+1} &= \beta_1 \vec{m}^k + (1-\beta_1) \nabla l(\z^k, w^k) \\
\vec{v}^{k+1} &= \beta_2 \vec{v}^k + (1-\beta_2) \left( \nabla l(\z^k, w^k) \right)^2
\end{split}
\]
where $\beta_1, \beta_2 \in (0, 1)$ are hyperparameters.
The descent direction is defined as:
\[
\begin{gathered}
\hat{\vec{m}} = \frac{1}{1 - \beta_1^{k+1}} \vec{m}^{k+1}
\quad
\hat{\vec{v}} = \frac{1}{1 - \beta_2^{k+1}} \vec{v}^{k+1} \\
\vec{d}^k = - \frac{\hat{\vec{m}}}{\sqrt{\hat{\vec{v}}} + \varepsilon} \\
\end{gathered}
\]
The update is performed as follows (minimal sketches of the SGD, mini-batch SGD, and Adam updates are given after this list):
\[ \z^{k+1} = \z^{k} + \alpha \vec{d}^k \]
\end{description}
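A minimal NumPy sketch of the SGD and mini-batch SGD updates above (the quadratic loss, the data playing the role of the realizations $w^i$, the step size, and the batch size are arbitrary illustrative choices):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 100, 3
X = rng.standard_normal((N, d))     # features of the realizations w^i = (X[i], y[i])
y = X @ np.ones(d) + 0.1 * rng.standard_normal(N)

def grad(z, idx):
    # Gradient of l(z, w) = 0.5 * (x^T z - y)^2 with w = (x, y), summed over idx
    r = X[idx] @ z - y[idx]
    return X[idx].T @ r

alpha = 0.01

# SGD: one realization drawn uniformly at random per iteration.
z_sgd = np.zeros(d)
for k in range(1000):
    i_k = rng.integers(N)
    z_sgd = z_sgd - alpha * grad(z_sgd, [i_k])

# Mini-batch SGD: a subset I^k of realizations per iteration.
z_mb = np.zeros(d)
for k in range(1000):
    batch = rng.choice(N, size=10, replace=False)
    z_mb = z_mb - alpha * grad(z_mb, batch)
\end{verbatim}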
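A minimal sketch of the Adam update above (the default hyperparameter values and the gradient oracle \verb|grad|, which returns a stochastic gradient given the current iterate and the iteration index, are illustrative assumptions):
\begin{verbatim}
import numpy as np

def adam(grad, z0, alpha=0.001, beta1=0.9, beta2=0.999, eps=1e-8, iters=1000):
    z = z0.copy()
    m = np.zeros_like(z)                  # first-moment estimate
    v = np.zeros_like(z)                  # second-moment estimate
    for k in range(iters):
        g = grad(z, k)                    # stochastic gradient at iteration k
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g**2
        m_hat = m / (1 - beta1**(k + 1))  # bias-corrected estimates
        v_hat = v / (1 - beta2**(k + 1))
        z = z + alpha * (-m_hat / (np.sqrt(v_hat) + eps))
    return z
\end{verbatim}
For instance, \verb|adam(lambda z, k: 2 * z, np.ones(3))| (the gradient of $\Vert \z \Vert^2$) drives the iterates towards the origin.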
\subsection{Learning paradigms}
\begin{description}
\item[Federated learning] \marginnote{Federated learning}
Problem where $N$ agents with their local and private data $\mathcal{D}^{i}$ want to learn a common set of parameters $\z^*$ based on the same loss function (evaluated on different data points):
\[
\min_\z \sum_{i=1}^{N} l(\z; \mathcal{D}^i)
\]
A centralized parameter server (master) is responsible for aggregating the agents' estimates (e.g., it picks a subset of agents and averages their estimates); a minimal sketch is given after the figure below.
% \[
% \z^{t+1} = \z^k - \alpha \sum_{i \in I_k} \nabla l(\z; \mathcal{D}^i, p^i)
% \]
\item[Distributed learning] \marginnote{Distributed learning}
Federated learning where there is no centralized entity and agents communicate with their neighbors only.
\end{description}
\begin{figure}[H]
\centering
\begin{subfigure}{0.45\linewidth}
\centering
\includegraphics[width=0.55\linewidth]{./img/federated_learning.png}
\caption{Federated learning}
\end{subfigure}
\begin{subfigure}{0.45\linewidth}
\centering
\includegraphics[width=0.7\linewidth]{./img/distributed_learning.png}
\caption{Distributed learning}
\end{subfigure}
\end{figure}
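A minimal sketch of the federated setting above (the agent sampling, the single local gradient step, and the plain averaging rule are illustrative assumptions rather than a prescribed protocol):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 10, 3
# Local private datasets D^i (here: least-squares data).
data = [(rng.standard_normal((20, d)), rng.standard_normal(20)) for _ in range(N)]

def local_grad(z, i):
    # Gradient of the common loss evaluated on agent i's private data D^i
    X, y = data[i]
    return X.T @ (X @ z - y)

z_server = np.zeros(d)
alpha = 0.005
for t in range(100):
    picked = rng.choice(N, size=4, replace=False)     # server picks some agents
    estimates = [z_server - alpha * local_grad(z_server, i) for i in picked]
    z_server = np.mean(estimates, axis=0)             # aggregate by averaging
\end{verbatim}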
\section{Distributed cost-coupled/consensus optimization}
\begin{description}
\item[Distributed cost-coupled optimization] \marginnote{Distributed cost-coupled optimization}
Optimization problem with $N$ agents that communicate according to a graph $G$ and aim to learn a common set of parameters $\z$ by solving:
\[
\min_{\z \in Z} \sum_{i=1}^{N} l_i(\z)
\]
where:
\begin{itemize}
\item Each agent $i$ knows its loss $l_i$ (based on its available data) and the parameter space $Z$,
\item At each time step $k$, each agent $i$ estimates a set of parameters $\z_i^k$.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_distributed_cost_coupled.pdf}
\end{figure}
\end{description}
\subsection{Optimization algorithms}
\begin{remark}
Using the sum of the gradients of all agents as descent direction is not possible, as each agent can only communicate with its neighbors.
\end{remark}
\begin{description}
\item[Distributed gradient algorithm] \marginnote{Distributed gradient algorithm}
Method where each agent first computes a (more precise) estimate of the parameters as a weighted sum of its neighbors' estimates (self-loop included); a minimal sketch of the algorithm follows this list:
\[
\vec{v}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k
\]
Then, the update step is performed using $\vec{v}_i^{k+1}$ and the agent's own local loss $l_i$:
\[
\begin{split}
\z_i^{k+1} &= \vec{v}_i^{k+1} - \alpha^k \nabla l_i(\vec{v}_i^{k+1}) \\
&= \left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right) - \alpha^k \nabla l_i\left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right)
\end{split}
\]
\begin{theorem}[Distributed gradient algorithm convergence] \marginnote{Distributed gradient algorithm convergence}
Assume that:
\begin{itemize}
\item The matrix $\matr{A}$ associated to the undirected and connected communication graph $G$ is doubly stochastic and such that $a_{ij} > 0$,
\item The step size is diminishing,
\item Each $l_i$ is convex, has gradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
\end{itemize}
Then, the sequence of local solutions $\{ \z_i^k \}_{k \in \mathbb{N}}$ of each agent $i$ produced using the distributed gradient algorithm converges to a common optimal solution $\z^*$:
\[ \lim_{k \rightarrow \infty} \Vert \z_i^k - \z^* \Vert = 0 \]
\end{theorem}
\item[Distributed projected subgradient algorithm] \marginnote{Distributed projected subgradient algorithm}
Distributed gradient algorithm extended to the case where $l_i$ are non-smooth convex functions and $\z$ is constrained to a closed convex set $Z \subseteq \mathbb{R}^d$. The distributed step is the following:
\[
\begin{split}
\vec{v}_i^{k+1} &= \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k \\
\z_i^{k+1} &= P_Z \big( \vec{v}_i^{k+1} - \alpha^k \tilde{\nabla} l_i(\vec{v}_i^{k+1}) \big)
\end{split}
\]
where $P_Z(\cdot)$ is the Euclidean projection onto $Z$ and $\tilde{\nabla} l_i$ is a subgradient of $l_i$.
\begin{theorem}[Distributed projected subgradient algorithm convergence] \marginnote{Distributed projected subgradient algorithm convergence}
Assume that:
\begin{itemize}
\item The adjacency matrix $\matr{A}$ associated to $G$ is doubly stochastic and $a_{ij} > 0$,
\item The step size is diminishing,
\item Each $l_i$ is convex, has subgradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
\end{itemize}
Then, each agent converges to an optimal solution $\z^*$.
\end{theorem}
\end{description}
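A minimal NumPy sketch of the distributed gradient algorithm (the communication graph, the doubly stochastic weights, the local losses, and the step-size sequence are illustrative assumptions; the projected variant only adds a projection after the local step, as noted in the comment):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 4, 2
# Doubly stochastic weights of an undirected ring graph (self-loops included).
A = np.array([[0.50, 0.25, 0.00, 0.25],
              [0.25, 0.50, 0.25, 0.00],
              [0.00, 0.25, 0.50, 0.25],
              [0.25, 0.00, 0.25, 0.50]])
targets = [rng.standard_normal(d) for _ in range(N)]

def grad_i(z, i):
    # Gradient of the local loss l_i(z) = 0.5 * ||z - targets[i]||^2
    return z - targets[i]

Z = np.zeros((N, d))                      # row i holds agent i's estimate z_i^k
for k in range(500):
    alpha_k = 1.0 / (k + 1)               # diminishing step size
    V = A @ Z                             # v_i^{k+1} = sum_j a_ij z_j^k
    Z = np.stack([V[i] - alpha_k * grad_i(V[i], i) for i in range(N)])
    # Projected variant: project each row of Z onto the constraint set
    # (e.g., np.clip for a box constraint).
# Each row of Z approaches the minimizer of sum_i l_i (the mean of the targets).
\end{verbatim}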
\begin{theorem}
The distributed gradient algorithm does not converge with a constant step size.
\begin{proof}[Proof idea]
% Assume that the starting guess in standard gradient descent is is a local minimum:
% \[ \z^{0} = \z^* \]
% We have that:
% \[
% \begin{split}
% \z^* &= \z^* - \alpha \nabla l(\z^*) \\
% \z^* &= \z^* \\
% \end{split}
% \]
% $\z^*$ is an equilibrium.
We check whether, with a constant step size $\alpha$, the optimum $\z^*$ is an equilibrium of the update:
\[
\begin{aligned}
\z^* &= \sum_{j=1}^N a_{ij} \z^* - \alpha \nabla l_i \left( \sum_{j=1}^N a_{ij} \z^* \right) \\
&= \z^* - \alpha \nabla l_i (\z^*) &&& \parbox{0.20\linewidth}{\footnotesize $\matr{A}$ doubly stochastic and $\z^*$ constant} \\
\end{aligned}
\]
In general, $\nabla l_i(\z^*) \neq 0$ ($\z^*$ is the optimum for the whole problem, but $l_i$ depends on the subset of data available to the agent). Therefore, $\z^*$ is not an equilibrium.
\end{proof}
\end{theorem}
\begin{description}
\item[Dynamic average consensus] \marginnote{Dynamic average consensus}
Consensus algorithm where each agent measures a signal $r_i^k$ and wants to estimate the average signal of all agents:
\[ \bar{r}^k = \frac{1}{N} \sum_{i=1}^{N} r_i^k \]
The average signal estimated by an agent is represented by a state $s_i^k$ and we want that $\lim_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert = 0$. This can be achieved using a perturbed consensus algorithm:
\[
s_i^{k+1} =
\underbrace{\sum_{j \in \mathcal{N}_i} a_{ij} s_j^k}_{\text{Consensus}} +
\underbrace{\vphantom{\sum_{j \in \mathcal{N}_i}}(r_i^{k+1} - r_i^k)}_{\text{Innovation}}
\]
where:
\begin{itemize}
\item The consensus term drives the states towards their average.
\item The local innovation term allows tracking the (time-varying) common signal.
\end{itemize}
\begin{theorem}[Dynamic average consensus convergence]
If the first-order differences are bounded (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \leq C_1$), then the tracking error is bounded by some $C_2 > 0$:
\[ \limsup_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert \leq C_2 \]
Moreover, the error vanishes if the signals eventually become constant (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \rightarrow 0$).
\end{theorem}
\item[Gradient tracking algorithm] \marginnote{Gradient tracking algorithm}
Method that chooses the local descent direction attempting to asymptotically track the average of all the local gradients:
\[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N \nabla l_h(\z_h^k) \]
Dynamic average consensus is used, taking the local gradient as the signal:
\[ \vec{r}_i^k = \nabla l_i(\z_i^k) \]
Then, the estimate of the average signal (i.e., gradient) is given by:
\[
\vec{s}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \vec{s}_j^k + \left( \nabla l_i(\z_i^{k+1}) - \nabla l_i(\z_i^k) \right)
\]
The update step is then performed as:
\[ \z_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k - \alpha \vec{s}_i^k \]
% Each agent accesses some $\nabla l_i(\z)$. It can be seen as a signal $r_i^k = \nabla l_i(\z_i^k)$ only available at agent $i$.
% We want that the direction:
% \[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N r_h^k \]
% Ideally, we want:
% \[
% \z_i^{k+1} = \sum_{j=1}^N a_{ij} \z_j^k - (N\alpha) \frac{1}{N} \sum_{j=1}^N \nabla l_k(z_k^k)
% \]
\end{description}
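A minimal sketch of dynamic average consensus (the weights, the local signals, and the initialization $s_i^0 = r_i^0$ are illustrative assumptions; the signals vary at first and then settle, so the tracking error vanishes):
\begin{verbatim}
import numpy as np

N = 4
# Doubly stochastic weights of an undirected ring graph (self-loops included).
A = np.array([[0.50, 0.25, 0.00, 0.25],
              [0.25, 0.50, 0.25, 0.00],
              [0.00, 0.25, 0.50, 0.25],
              [0.25, 0.00, 0.25, 0.50]])

def r(k):
    # Local signals r_i^k: time-varying at first, constant for k >= 50.
    return np.array([np.sin(min(k, 50) / 10.0) + i for i in range(N)])

s = r(0)                                  # initialization s_i^0 = r_i^0
for k in range(200):
    s = A @ s + (r(k + 1) - r(k))         # consensus + innovation
# Each s_i is now close to the average of the (settled) signals r_i.
\end{verbatim}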
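A minimal NumPy sketch of the gradient tracking algorithm (the graph, the weights, the local losses, the constant step size, and the initialization $\vec{s}_i^0 = \nabla l_i(\z_i^0)$ are illustrative assumptions):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 4, 2
# Doubly stochastic weights of an undirected ring graph (self-loops included).
A = np.array([[0.50, 0.25, 0.00, 0.25],
              [0.25, 0.50, 0.25, 0.00],
              [0.00, 0.25, 0.50, 0.25],
              [0.25, 0.00, 0.25, 0.50]])
targets = [rng.standard_normal(d) for _ in range(N)]

def grad_i(z, i):
    # Gradient of the local loss l_i(z) = 0.5 * ||z - targets[i]||^2
    return z - targets[i]

alpha = 0.1                               # constant step size
Z = np.zeros((N, d))                      # local estimates z_i^k
S = np.stack([grad_i(Z[i], i) for i in range(N)])   # trackers s_i^0
for k in range(300):
    Z_new = A @ Z - alpha * S             # z_i^{k+1} = sum_j a_ij z_j^k - alpha s_i^k
    S = A @ S + np.stack([grad_i(Z_new[i], i) - grad_i(Z[i], i) for i in range(N)])
    Z = Z_new
# Each row of Z approaches the minimizer of sum_i l_i (the mean of the targets),
# even with a constant step size.
\end{verbatim}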