diff --git a/src/year2/distributed-autonomous-systems/sections/_optimization.tex b/src/year2/distributed-autonomous-systems/sections/_optimization.tex index 503cc54..3cda825 100644 --- a/src/year2/distributed-autonomous-systems/sections/_optimization.tex +++ b/src/year2/distributed-autonomous-systems/sections/_optimization.tex @@ -718,7 +718,7 @@ \[ \vec{r}_i^k = \nabla l_i(\z_i^k) \] Then, the estimate of the average signal (i.e., gradient) is given by: \[ - \vec{s}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \vec{s}_j^k + \left( \nabla l_i(\z_i^{k+1}) - \nabla l_i(\z_i^k) \right) + \vec{s}_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \vec{s}_j^k + \left( \nabla l_i(\z_i^{k+1}) - \nabla l_i(\z_i^k) \right) \qquad \s_i^0 = \nabla l_i(\z_i^0) \] The update step is then performed as: \[ \z_i^{k+1} = \sum_{j \in \mathcal{N}_i} a_{ij} \z_j^k - \alpha \vec{s}_i^k \] @@ -748,9 +748,161 @@ \,\,\land\,\, \rho \Vert \z_i^{k+1} - \z^* \Vert \leq \rho^k \Vert \z_i^0 - \z^* \Vert \] + + { + \indenttbox + \begin{remark} + It can be shown that gradient tracking also works with non-convex optimization and, under the correct assumptions, converges to a stationary point. + \end{remark} + } + + \begin{proof} + Consider the gradient tracking algorithm written in matrix form: + \[ + \begin{aligned} + \z^{k+1} &= \A \z^k - \alpha \s^k \\ + \s^{k+1} &= \A \s^k + (\nabla \vec{l}(\z^{k+1}) - \nabla \vec{l}(\z^k)) + \end{aligned} + \] + where $\nabla \vec{l}(\z^k) = \begin{bmatrix} \nabla l_1(\z^k_1) & \dots & \nabla l_N(\z^k_N) \end{bmatrix}^T$. + + % \begin{remark} + % In the vector case, the Kronecker product should be applied on $\A$. 
+ % \end{remark} + + \begin{description} + \item[Equilibrium] + We want to find the equilibrium points $(\z_\text{eq}, \s_\text{eq})$ that satisfy: + \[ + \begin{aligned} + \s_\text{eq} &= \A \s_\text{eq} + \nabla \vec{l}(\z_\text{eq}) - \nabla \vec{l}(\z_\text{eq}) &\iff& (\matr{I} - \A) \s_\text{eq} = 0 \\ + \z_\text{eq} &= \A\z_\text{eq} - \alpha \s_\text{eq} &\iff& (\matr{I} - \A) \z_\text{eq} = -\alpha \s_\text{eq} \\ + \end{aligned} + \] + It must be that: + \begin{itemize} + \item $\s_\text{eq} \in \text{ker}(\matr{I} - \A) = \{ \vec{1}\beta_1 \mid \beta_1 \in \R \}$ (as $\A$ is doubly stochastic). + \item $(\matr{I} - \A) \z_\text{eq} = - \alpha \vec{1} \beta_1$. As $\vec{1} (-\alpha \beta_1) \in \text{ker}(\matr{I} - \A)$, it must be that $\beta_1 = 0$ (as the image of $\matr{I} - \A$ intersects its kernel only in $0$: pre-multiplying by $\vec{1}^T$ and using $\vec{1}^T (\matr{I} - \A) = 0$ gives $0 = -\alpha N \beta_1$). + \end{itemize} + Therefore, we end up with: + \[ + \begin{split} + \s_\text{eq} &= \vec{1}\beta_1 = 0 \\ + \z_\text{eq} &= \A\z_\text{eq} - \alpha 0 = \vec{1} \beta_2 \quad \text{ i.e., eigenvector of $\A$} \\ + \end{split} + \] + + In addition, by pre-multiplying the equation of $\s$ by $\vec{1}^T$, we obtain: + \[ + \begin{split} + \vec{1}^T \s^{k+1} &= \vec{1}^T \A \s^k + \vec{1}^T \nabla \vec{l}(\z^{k+1}) - \vec{1}^T \nabla \vec{l}(\z^{k}) \\ + &= \vec{1}^T \s^k + \vec{1}^T \nabla \vec{l}(\z^{k+1}) - \vec{1}^T \nabla \vec{l}(\z^{k}) + \end{split} + \] + This shows the following invariance condition: + \[ + \begin{aligned} + \vec{1}^T \s^{k+1} - \vec{1}^T \nabla \vec{l}(\z^{k+1}) + &= \vec{1}^T \s^k - \vec{1}^T \nabla \vec{l}(\z^{k}) \\ + &= \vec{1}^T \s_\text{eq} - \vec{1}^T \nabla \vec{l}(\z_\text{eq}) \\ + &= \vec{1}^T \s^0 - \vec{1}^T \nabla \vec{l}(\z^{0}) \\ + \end{aligned} + \] + Thus, as $\s_\text{eq} = 0$ and $\s^0 = \nabla \vec{l}(\z^0)$ by initialization, we have that: + \[ + \begin{split} + \vec{1}^T \s_\text{eq} - \vec{1}^T \nabla \vec{l}(\z_\text{eq}) + &= \vec{1}^T \s^0 - \vec{1}^T \nabla \vec{l}(\z^{0}) \\ + \iff 0 - \vec{1}^T \nabla \vec{l}(\vec{1}\beta_2) &= 0 \\ + \end{split} + \] + Then, it must be that 
$\z_\text{eq} = \vec{1}\beta_2$ is an optimum with $\beta_2 = \z^*$, as $\vec{1}^T \nabla \vec{l}(\vec{1}\beta_2) = \sum_{i=1}^N \nabla l_i(\beta_2) = 0$ is the first-order optimality condition of $\sum_{i=1}^N l_i$. + + \item[Stability] + % Change in coordinates to avoid having $\z^{k+1}$ in $\s^{k}$. The (non-linear) transformation is: + % \[ + % \begin{bmatrix} + % \z^k \\ \s^k + % \end{bmatrix} + % \mapsto + % \begin{bmatrix} + % \z^k \\ \vec{\xi}^k + % \end{bmatrix} + % = + % \begin{bmatrix} + % \z^k \\ \alpha (\nabla \vec{l}(\z^k) - \s^k) + % \end{bmatrix} + % \] + + % \[ + % \begin{split} + % \z^{k+1} + % &= \A\z^k - \alpha ( \frac{1}{\alpha} \vec{\xi}^k + \nabla \vec{l}(\z^k) ) \\ + % \vec{\xi}^k + % &= \alpha \nabla \vec{l}(\z^{k+1}) - \alpha (\A \s^k + \nabla \vec{l}(\z^{k+1}) - \nabla \vec{l} (\z^k)) \\ + % &= - \alpha \A (-\frac{1}{\alpha} \xi^k + \nabla \vec{l}(\z^k)) + \alpha \nabla \vec{l}(\z^k) \\ + % &= \A \vec{\xi}^k - \alpha(\A - \vec{I}) \nabla \vec{l}(\z^k) + % \end{split} + % \] + + % In matrix form: + % \[ + % \begin{bmatrix} + % \z^{k+1} \\ \vec{\xi}^{k+1} = \begin{bmatrix} + % \A & \matr{I} \\ 0 & \A + % \end{bmatrix} + % \begin{bmatrix} + % \z^k \\ \vec{\xi}^k + % \end{bmatrix} + % - alpha \begin{bmatrix} + % \matr{I} \\ \A \matr{I} + % \end{bmatrix} + % \nabla \vec{l}(\z^k) + % \end{bmatrix} + % \] + % The initialization is: + % \[ + % \begin{split} + % \z^0 \in \R^N \\ + % \vec{\xi}^{0} = \alpha (\nabla \vec{l}(\z^0) - \s^0) = 0 + % \end{split} + % \] + % The equilibrium has been shifted to: + % \[ + % \begin{split} + % \z_\text{eq} = \vec{1} \z^* \\ + % \vec{\xi}_\text{eq} = \alpha \nabla l(\vec{1} \z^*) = \alpha \begin{bmatrix} + % \nabla l_1(\z^*) \\ \vdots \\ \nabla l_N(\z^*) + % \end{bmatrix} + % \end{split} + % \] + + + % \[ + % \begin{gathered} + % \begin{bmatrix} + % \z^{k+1} \\ \vec{\xi}^{k+1} = \begin{bmatrix} + % \A & \matr{I} \\ 0 & \A + % \end{bmatrix} + % \begin{bmatrix} + % \z^k \\ \vec{\xi}^k + % \end{bmatrix} + % \begin{bmatrix} + % \matr{I} \\ \A \matr{I} + % \end{bmatrix} + % \u^k + % \end{bmatrix} \\ + % \vec{y}^k = \begin{bmatrix} + % \matr{I} & 0 + % 
\end{bmatrix} + % \begin{bmatrix} + % \z^k \\ \vec{\xi}^{k} + % \end{bmatrix} \\ + % -- \\ + % \u^k = \nabla \vec{l}(\vec{y}^k) + % \end{gathered} + % \] + \end{description} + \end{proof} \end{theorem} \end{description} - -\begin{remark} - It can be shown that gradient tracking also works with non-convex optimization and, under the correct assumptions, converges to a stationary point. -\end{remark} \ No newline at end of file