diff --git a/src/year2/distributed-autonomous-systems/img/_distributed_cost_coupled.pdf b/src/year2/distributed-autonomous-systems/img/_distributed_cost_coupled.pdf
new file mode 100644
index 0000000..8addcb7
Binary files /dev/null and b/src/year2/distributed-autonomous-systems/img/_distributed_cost_coupled.pdf differ
diff --git a/src/year2/distributed-autonomous-systems/img/distributed_learning.png b/src/year2/distributed-autonomous-systems/img/distributed_learning.png
new file mode 100644
index 0000000..3a8c56b
Binary files /dev/null and b/src/year2/distributed-autonomous-systems/img/distributed_learning.png differ
diff --git a/src/year2/distributed-autonomous-systems/img/federated_learning.png b/src/year2/distributed-autonomous-systems/img/federated_learning.png
new file mode 100644
index 0000000..91f6119
Binary files /dev/null and b/src/year2/distributed-autonomous-systems/img/federated_learning.png differ
diff --git a/src/year2/distributed-autonomous-systems/sections/_leader_follower.tex b/src/year2/distributed-autonomous-systems/sections/_leader_follower.tex
index 8c0c59f..e1002c7 100644
--- a/src/year2/distributed-autonomous-systems/sections/_leader_follower.tex
+++ b/src/year2/distributed-autonomous-systems/sections/_leader_follower.tex
@@ -269,7 +269,7 @@
         \dot{\x}_l(t) &= \vec{v}_0 \quad & \x_l(0) &= \x_l^{(0)} \\
         \end{aligned}
     \]
-    where $\vec{u}_f(t)$ is a distributed control action (can be seen as a correction) that processes the containment error $\vec{e}(t)$. It is composed of a proportional controller (i.e., value proportional to the error) and an integral controller (i.e., value proportional to the integral to the error):
+    where $\vec{u}_f(t)$ is a distributed control action (can be seen as a correction) that processes the containment error $\vec{e}(t)$. It is composed of a proportional controller (i.e., value proportional to the error) and an integral controller (i.e., value proportional to the integral of the error):
     \[
         \begin{split}
             \vec{u}_f(t) &= \matr{K}_P \vec{e}(t) + \matr{K}_I \int_{0}^{t} \vec{e}(\tau) \,d\tau
diff --git a/src/year2/distributed-autonomous-systems/sections/_optimization.tex b/src/year2/distributed-autonomous-systems/sections/_optimization.tex
index 0e3bce8..021de20 100644
--- a/src/year2/distributed-autonomous-systems/sections/_optimization.tex
+++ b/src/year2/distributed-autonomous-systems/sections/_optimization.tex
@@ -418,7 +418,7 @@
 
-\section{Parallel optimization}
+\section{Cost-coupled optimization}
 
 
 \begin{description}
@@ -427,9 +427,14 @@
         \[ \min_{\z \in \mathbb{R}^{d}} \sum_{i=1}^{N} l_i(\z) \]
-
+\end{description}
+
+
+\subsection{Optimization methods}
+
+\begin{description}
     \item[Batch gradient method] \marginnote{Batch gradient method}
-    Compute the gradient method direction by considering all the losses:
+    Compute the direction for the gradient method by considering all the losses:
         \[ \z^{k+1} = \z^k - \alpha \sum_{i=1}^{N} \nabla l_i(\z^k) \]
@@ -448,12 +453,263 @@
         Two possible rules to select the agent at each iteration are:
         \begin{descriptionlist}
             \item[Cyclic]
-            $i^k = 1, 2, \dots, N, 1, 2, \dots, N, \dots$
+            $i^k = 1, 2, \dots, N, 1, 2, \dots, N, \dots$, or cyclic in any order (essentially cyclic).
             \item[Randomized]
             Draw $i^k$ from a uniform distribution.
         \end{descriptionlist}
     \end{remark}
-    % \begin{remark}
-    %     The step size should decrease to reach convergence.
-    % \end{remark}
+
+    \begin{remark}
+        A single gradient is not necessarily a descent direction.
+    \end{remark}
+
+    \begin{theorem}
+        If the step size is diminishing, the incremental gradient method converges.
+    \end{theorem}
+\end{description}
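+
+The following is a minimal Python sketch (an illustration, not part of the original notes) comparing the batch and incremental gradient updates above. It assumes quadratic local losses $l_i(\z) = \frac{1}{2} \Vert \z - \vec{b}_i \Vert^2$, so that $\nabla l_i(\z) = \z - \vec{b}_i$ and the common optimum is the mean of the $\vec{b}_i$; all helper names are illustrative:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, d = 5, 2
+b = rng.normal(size=(N, d))  # data defining the quadratic losses
+
+def grad_l(i, z):
+    # Gradient of l_i(z) = 0.5 * ||z - b_i||^2.
+    return z - b[i]
+
+def batch_gradient(z, alpha, iters):
+    # Each step uses the gradients of all N losses.
+    for _ in range(iters):
+        z = z - alpha * sum(grad_l(i, z) for i in range(N))
+    return z
+
+def incremental_gradient(z, iters, cyclic=True):
+    # Each step uses a single loss (cyclic or randomized selection)
+    # with a diminishing step size, as required for convergence.
+    for k in range(iters):
+        i = k % N if cyclic else rng.integers(N)
+        z = z - (1.0 / (k + 1)) * grad_l(i, z)
+    return z
+
+z0 = np.zeros(d)
+print(batch_gradient(z0, 0.05, 1000))  # ~ mean of the b_i
+print(incremental_gradient(z0, 5000))  # ~ mean of the b_i
+\end{verbatim}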
+
+
+\begin{description}
+    \item[Stochastic gradient descent (SGD)] \marginnote{Stochastic gradient descent (SGD)}
+    Instance of the incremental gradient method where the selection rule follows an unknown distribution.
+
+    The problem can be formulated as:
+    \[ \min_{\z \in \mathbb{R}^d} \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \]
+    where $\mathcal{W}$ is a random variable with a possibly unknown distribution.
+
+    It is assumed that, given any realization $\bar{w}$ of $\mathcal{W}$ (e.g., the index of an agent or a single data point), it is possible to obtain the gradient $\nabla l(\bar{\z}, \bar{w})$ at any query point $\bar{\z}$. The optimization step at each iteration is then:
+    \[ \z^{k+1} = \z^k - \alpha \nabla l(\z^k, w^k) \]
+
+    \begin{remark}
+        Monte Carlo approximation can be used to represent the expected value with a finite sequence of realizations:
+        \[ \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})] \approx \frac{1}{K} \sum_{k=1}^{K} l(\z, w^k) \]
+    \end{remark}
+
+    \begin{theorem}[SGD convergence with constant step size] \marginnote{SGD convergence with constant step size}
+        Given a function $l$ such that:
+        \begin{itemize}
+            \item $l$ is $\mu$-strongly convex with $L$-Lipschitz continuous gradient (i.e., the gradient varies at a bounded rate),
+            \item $\nabla l(\z, \mathcal{W})$ is an unbiased estimate of $\nabla_\z \mathbb{E}_\mathcal{W}[l(\z, \mathcal{W})]$,
+            \item $\Vert \nabla l(\z, \mathcal{W}) \Vert \leq M_\nabla$ almost surely (i.e., with probability $1$) for some $M_\nabla > 0$.
+        \end{itemize}
+        With a constant step size $\alpha \leq \frac{1}{2\mu}$, it holds that at any time step $k$:
+        \[
+            \Vert \z^k - \z^* \Vert \leq
+            \underbrace{(1-2\mu\alpha)^k \left( \Vert \z^0 - \z^* \Vert - \frac{\alpha M_\nabla^2}{2\mu} \right)}_{\text{Error term}}
+            + \underbrace{\frac{\alpha M_\nabla^2}{2\mu}}_{\text{Residual term}}
+        \]
+        where the error term diminishes over time and the residual term is constant.
+    \end{theorem}
+
+    \begin{theorem}[SGD convergence with diminishing step size] \marginnote{SGD convergence with diminishing step size}
+        With a diminishing step size, both the error and the residual terms converge to $0$.
+    \end{theorem}
+
+    \item[Mini-batch SGD] \marginnote{Mini-batch SGD}
+    SGD where the update at each time step $k$ is based on a set $\mathcal{I}^k \subset \{ 1, \dots, N \}$ of realizations of $\mathcal{W}$:
+    \[ \z^{k+1} = \z^k - \alpha \sum_{i \in \mathcal{I}^k} \nabla l(\z^k, w^i) \]
+
+    \item[Adaptive moment estimation (ADAM)] \marginnote{Adaptive moment estimation (ADAM)}
+    Method based on the first and second moments of the gradient:
+    \[
+        \begin{split}
+            \vec{m}^{k+1} &= \beta_1 \vec{m}^k + (1-\beta_1) \nabla l(\z^k, w^k) \\
+            \vec{v}^{k+1} &= \beta_2 \vec{v}^k + (1-\beta_2) \left( \nabla l(\z^k, w^k) \right)^2
+        \end{split}
+    \]
+    where $\beta_1, \beta_2 \in (0, 1)$ are hyperparameters.
+
+    The descent direction is defined using the bias-corrected moments:
+    \[
+        \begin{gathered}
+            \hat{\vec{m}} = \frac{1}{1 - \beta_1^{k+1}} \vec{m}^{k+1}
+            \quad
+            \hat{\vec{v}} = \frac{1}{1 - \beta_2^{k+1}} \vec{v}^{k+1} \\
+            \vec{d}^k = - \frac{\hat{\vec{m}}}{\sqrt{\hat{\vec{v}}} + \varepsilon} \\
+        \end{gathered}
+    \]
+    where $\varepsilon > 0$ is a small constant that avoids division by zero. The update is performed as:
+    \[ \z^{k+1} = \z^{k} + \alpha \vec{d}^k \]
+\end{description}
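+
+A minimal Python sketch (not from the notes) of the ADAM update above, applied to noisy gradients of $l(\z) = \frac{1}{2} \Vert \z \Vert^2$ as a stand-in for stochastic samples; function and parameter names are illustrative:
+\begin{verbatim}
+import numpy as np
+
+def adam(grad, z0, alpha=0.01, beta1=0.9, beta2=0.999,
+         eps=1e-8, iters=2000):
+    # grad(z, k) returns a (possibly stochastic) gradient sample.
+    z, m, v = z0.copy(), np.zeros_like(z0), np.zeros_like(z0)
+    for k in range(iters):
+        g = grad(z, k)
+        m = beta1 * m + (1 - beta1) * g      # first moment estimate
+        v = beta2 * v + (1 - beta2) * g**2   # second moment estimate
+        m_hat = m / (1 - beta1**(k + 1))     # bias corrections
+        v_hat = v / (1 - beta2**(k + 1))
+        z = z - alpha * m_hat / (np.sqrt(v_hat) + eps)  # step along d^k
+    return z
+
+rng = np.random.default_rng(0)
+noisy_grad = lambda z, k: z + 0.1 * rng.normal(size=z.shape)
+print(adam(noisy_grad, np.ones(3)))  # approaches the minimizer 0
+\end{verbatim}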
+
+
+
+\subsection{Learning paradigms}
+
+\begin{description}
+    \item[Federated learning] \marginnote{Federated learning}
+    Problem where $N$ agents with their local and private data $\mathcal{D}^{i}$ want to learn a common set of parameters $\z^*$ based on the same loss function (evaluated on different data points):
+    \[
+        \min_\z \sum_{i=1}^{N} l(\z; \mathcal{D}^i)
+    \]
+    A centralized parameter server (master) is responsible for aggregating the estimates of the agents (e.g., it selects some agents and averages their estimates; see the sketch below).
+    % \[
+    %     \z^{t+1} = \z^k - \alpha \sum_{i \in I_k} \nabla l(\z; \mathcal{D}^i, p^i)
+    % \]
+
+    \item[Distributed learning] \marginnote{Distributed learning}
+    Federated learning where there is no centralized entity and agents communicate with their neighbors only.
+\end{description}
+
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}{0.45\linewidth}
+        \centering
+        \includegraphics[width=0.55\linewidth]{./img/federated_learning.png}
+        \caption{Federated learning}
+    \end{subfigure}
+    \begin{subfigure}{0.45\linewidth}
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/distributed_learning.png}
+        \caption{Distributed learning}
+    \end{subfigure}
+\end{figure}
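+
+A FedAvg-style Python sketch of the federated setting (an assumption-laden illustration, not an algorithm stated in the notes): the server broadcasts the model, sampled agents refine it on their private data, and the server averages the returned estimates. The quadratic local losses and helper names are assumptions:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, d = 10, 3
+data = rng.normal(size=(N, d))  # D^i: one private point per agent
+
+def local_update(z, i, alpha=0.1, epochs=5):
+    # Agent i refines the broadcast model on its private data,
+    # here with l(z; D^i) = 0.5 * ||z - data_i||^2.
+    for _ in range(epochs):
+        z = z - alpha * (z - data[i])
+    return z
+
+z = np.zeros(d)
+for t in range(50):  # server rounds
+    picked = rng.choice(N, size=4, replace=False)  # sample some agents
+    estimates = [local_update(z.copy(), i) for i in picked]
+    z = np.mean(estimates, axis=0)  # server aggregates by averaging
+print(z)  # close to the minimizer of the sum, the mean of the data
+\end{verbatim}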
+
+
+
+\section{Distributed cost-coupled/consensus optimization}
+
+\begin{description}
+    \item[Distributed cost-coupled optimization] \marginnote{Distributed cost-coupled optimization}
+    Optimization problem with $N$ agents that communicate according to a graph $G$ and aim to learn a common set of parameters $\z$ such that:
+    \[
+        \min_{\z \in Z} \sum_{i=1}^{N} l_i(\z)
+    \]
+    where:
+    \begin{itemize}
+        \item Each agent $i$ knows its loss $l_i$ (based on its available data) and the parameter space $Z$,
+        \item At each time step $k$, each agent $i$ estimates a set of parameters $\z_i^k$.
+    \end{itemize}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.45\linewidth]{./img/_distributed_cost_coupled.pdf}
+    \end{figure}
+\end{description}
+
+
+\subsection{Optimization algorithms}
+
+\begin{remark}
+    Using the sum of the gradients of all agents as descent direction is not possible, as not every agent can communicate with every other one.
+\end{remark}
+
+\begin{description}
+    \item[Distributed gradient algorithm] \marginnote{Distributed gradient algorithm}
+    Method where each agent estimates a (more precise) set of parameters as a weighted sum of its neighbors' estimates (self-loop included):
+    \[
+        \vec{v}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k
+    \]
+    Then, the update step is performed using $\vec{v}_i^{k+1}$ and the agent's own local loss $l_i$ (a numerical sketch follows the results below):
+    \[
+        \begin{split}
+            \z_i^{k+1} &= \vec{v}_i^{k+1} - \alpha^k \nabla l_i(\vec{v}_i^{k+1}) \\
+            &= \left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right) - \alpha^k \nabla l_i\left(\sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k\right)
+        \end{split}
+    \]
+
+    \begin{theorem}[Distributed gradient algorithm convergence] \marginnote{Distributed gradient algorithm convergence}
+        Assume that:
+        \begin{itemize}
+            \item The matrix $\matr{A}$ associated with the undirected and connected communication graph $G$ is doubly stochastic and such that $a_{ij} > 0$,
+            \item The step size is diminishing,
+            \item Each $l_i$ is convex, has gradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
+        \end{itemize}
+        Then, the sequence of local solutions $\{ \z_i^k \}_{k \in \mathbb{N}}$ of each agent $i$ produced using the distributed gradient algorithm converges to a common optimal solution $\z^*$:
+        \[ \lim_{k \rightarrow \infty} \Vert \z_i^k - \z^* \Vert = 0 \]
+    \end{theorem}
+
+    \item[Distributed projected subgradient algorithm] \marginnote{Distributed projected subgradient algorithm}
+    Distributed gradient algorithm extended to the case where the $l_i$ are non-smooth convex functions and $\z$ is constrained to a closed convex set $Z \subseteq \mathbb{R}^d$. The distributed step is the following:
+    \[
+        \begin{split}
+            \vec{v}_i^{k+1} &= \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k \\
+            \z_i^{k+1} &= P_Z \big( \vec{v}_i^{k+1} - \alpha^k \tilde{\nabla} l_i(\vec{v}_i^{k+1}) \big)
+        \end{split}
+    \]
+    where $P_Z(\cdot)$ is the Euclidean projection onto $Z$ and $\tilde{\nabla} l_i$ is a subgradient of $l_i$.
+
+    \begin{theorem}[Distributed projected subgradient algorithm convergence] \marginnote{Distributed projected subgradient algorithm convergence}
+        Assume that:
+        \begin{itemize}
+            \item The adjacency matrix $\matr{A}$ associated with $G$ is doubly stochastic and $a_{ij} > 0$,
+            \item The step size is diminishing,
+            \item Each $l_i$ is convex, has subgradients bounded by a scalar $C_i > 0$, and there exists at least one optimal solution.
+        \end{itemize}
+        Then, each agent converges to an optimal solution $\z^*$.
+    \end{theorem}
+\end{description}
+
+
+\begin{theorem}
+    The distributed gradient algorithm does not converge with a constant step size.
+
+    \begin{proof}[Proof idea]
+        % Assume that the starting guess in standard gradient descent is a local minimum:
+        % \[ \z^{0} = \z^* \]
+        % We have that:
+        % \[
+        %     \begin{split}
+        %         \z^* &= \z^* - \alpha \nabla l(\z^*) \\
+        %         \z^* &= \z^* \\
+        %     \end{split}
+        % \]
+        % $\z^*$ is an equilibrium.
+
+        We check whether the optimum $\z^*$ is an equilibrium of the update with a constant step size $\alpha$:
+        \[
+            \begin{aligned}
+                \z^* &= \sum_{j=1}^N a_{ij} \z^* - \alpha \nabla l_i \left( \sum_{j=1}^N a_{ij} \z^* \right) \\
+                &= \z^* - \alpha \nabla l_i (\z^*) &&& \parbox{0.20\linewidth}{\footnotesize $\matr{A}$ doubly stochastic and $\z^*$ constant} \\
+            \end{aligned}
+        \]
+        In general, $\nabla l_i(\z^*) \neq 0$ ($\z^*$ is the optimum for the whole problem, but $l_i$ depends on the subset of data available to the agent). Therefore, $\z^*$ is not an equilibrium.
+    \end{proof}
+\end{theorem}
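+
+A minimal Python sketch of the distributed gradient algorithm (the quadratic losses and ring-graph weights are assumptions made for illustration). With the diminishing step size all estimates agree on the common optimum; replacing it with a constant one leaves a residual error, consistent with the non-convergence result above:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, d = 5, 2
+b = rng.normal(size=(N, d))  # each l_i(z) = 0.5 * ||z - b_i||^2
+
+# Doubly stochastic weights for an undirected ring graph.
+A = np.zeros((N, N))
+for i in range(N):
+    A[i, i] = 0.5
+    A[i, (i - 1) % N] = A[i, (i + 1) % N] = 0.25
+
+Z = np.zeros((N, d))  # row i holds agent i's estimate z_i
+for k in range(5000):
+    V = A @ Z                  # consensus: mix the neighbors' estimates
+    alpha_k = 1.0 / (k + 1)    # diminishing step size
+    Z = V - alpha_k * (V - b)  # local gradient step evaluated at v_i
+print(Z)  # every row approaches the common optimum, the mean of the b_i
+\end{verbatim}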
+
+
+\begin{description}
+    \item[Dynamic average consensus] \marginnote{Dynamic average consensus}
+    Consensus algorithm where each agent measures a signal $r_i^k$ and wants to estimate the average signal of all agents:
+    \[ \bar{r}^k = \frac{1}{N} \sum_{i=1}^{N} r_i^k \]
+    The average signal estimated by an agent is represented by a state $s_i^k$, and we want that $\lim_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert = 0$. This can be achieved using a perturbed consensus algorithm:
+    \[
+        s_i^{k+1} =
+        \underbrace{\sum_{j \in \mathcal{N}_i} a_{ij} s_j^k}_{\text{Consensus}}
+        + \underbrace{\vphantom{\sum_{j \in \mathcal{N}_i}}(r_i^{k+1} - r_i^k)}_{\text{Innovation}}
+    \]
+    where:
+    \begin{itemize}
+        \item The consensus term drives the states toward their average,
+        \item The local innovation allows tracking the time-varying signal.
+    \end{itemize}
+
+    \begin{theorem}[Dynamic average consensus convergence]
+        If the first-order differences are bounded (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \leq C_1$), then the tracking error is bounded by some $C_2 > 0$:
+        \[ \lim_{k \rightarrow \infty} \Vert s_i^k - \bar{r}^k \Vert \leq C_2 \]
+
+        Moreover, the error vanishes if the signal eventually stops changing (i.e., $\Vert r_i^{k+1} - r_i^{k} \Vert \rightarrow 0$).
+    \end{theorem}
+
+    \item[Gradient tracking algorithm] \marginnote{Gradient tracking algorithm}
+    Method that chooses the local descent direction so as to asymptotically track the gradient of the whole cost:
+    \[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N \nabla l_h(\z_h^k) \]
+
+    Using dynamic average consensus, we consider as signal the local gradient:
+    \[ \vec{r}_i^k = \nabla l_i(\z_i^k) \]
+    Then, the estimate of the average signal (i.e., of the average gradient) is given by:
+    \[
+        \vec{s}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \vec{s}_j^k + \left( \nabla l_i(\z_i^{k+1}) - \nabla l_i(\z_i^k) \right)
+    \]
+    The update step is then performed as (a numerical sketch follows this list):
+    \[ \z_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k - \alpha \vec{s}_i^k \]
+
+    % Each agent accesses some $\nabla l_i(\z)$. It can be seen as a signal $r_i^k = \nabla l_i(\z_i^k)$ only available at agent $i$.
+
+    % We want that the direction:
+    % \[ d_i^k \underset{k \rightarrow \infty}{\longrightarrow} - \frac{1}{N} \sum_{h=1}^N r_h^k \]
+
+    % Ideally, we want:
+    % \[
+    %     \z_i^{k+1} = \sum_{j=1}^N a_{ij} \z_j^k - (N\alpha) \frac{1}{N} \sum_{j=1}^N \nabla l_k(z_k^k)
+    % \]
 \end{description}
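+
+A minimal Python sketch of gradient tracking, under the same illustrative quadratic losses and ring weights as the previous sketch. The tracker $\vec{s}_i$ is initialized at the local gradients, and the constant step size $\alpha$ from the update above is used:
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+N, d = 5, 2
+b = rng.normal(size=(N, d))  # each l_i(z) = 0.5 * ||z - b_i||^2
+grad = lambda Z: Z - b       # row i: gradient of l_i at z_i
+
+# Doubly stochastic weights for an undirected ring graph.
+A = np.zeros((N, N))
+for i in range(N):
+    A[i, i] = 0.5
+    A[i, (i - 1) % N] = A[i, (i + 1) % N] = 0.25
+
+alpha = 0.1
+Z = np.zeros((N, d))
+S = grad(Z)  # tracker initialized at the local gradients
+for k in range(500):
+    Z_new = A @ Z - alpha * S          # consensus + tracked-gradient step
+    S = A @ S + grad(Z_new) - grad(Z)  # dynamic average consensus update
+    Z = Z_new
+print(Z)  # rows agree on the mean of the b_i despite the constant step
+\end{verbatim}
\ No newline at end of file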