diff --git a/src/year2/distributed-autonomous-systems/sections/_optimization.tex b/src/year2/distributed-autonomous-systems/sections/_optimization.tex
index b331e6b..cebeee6 100644
--- a/src/year2/distributed-autonomous-systems/sections/_optimization.tex
+++ b/src/year2/distributed-autonomous-systems/sections/_optimization.tex
@@ -430,7 +430,43 @@
 \end{description}
 
 
-\subsection{Optimization methods}
+\subsection{Learning paradigms}
+
+\begin{description}
+    \item[Federated learning] \marginnote{Federated learning}
+    Problem in which $N$ agents, each with its own local and private data $\mathcal{D}^{i}$, want to learn a common set of parameters $\z^*$ by minimizing the same loss function evaluated on their respective data:
+    \[
+        \min_\z \sum_{i=1}^{N} l(\z; \mathcal{D}^i)
+    \]
+    A centralized parameter server (master) is responsible for aggregating the agents' estimates (e.g., by picking a subset of agents and averaging their local estimates).
+    % \[
+    %     \z^{k+1} = \z^k - \alpha \sum_{i \in I_k} \nabla l(\z; \mathcal{D}^i, p^i)
+    % \]
+
+    \item[Distributed learning] \marginnote{Distributed learning}
+    Federated learning setting where there is no centralized entity and agents communicate only with their neighbors.
+\end{description}
+
+\begin{figure}[H]
+    \centering
+    \begin{subfigure}{0.4\linewidth}
+        \centering
+        \includegraphics[width=0.55\linewidth]{./img/federated_learning.png}
+        \caption{Federated learning}
+    \end{subfigure}
+    \begin{subfigure}{0.4\linewidth}
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/distributed_learning.png}
+        \caption{Distributed learning}
+    \end{subfigure}
+\end{figure}
+
+
+
+\section{Federated learning}
+
+
+\subsection{Batch gradient method}
 
 \begin{description}
     \item[Batch gradient method] \marginnote{Batch gradient method}
@@ -442,7 +478,12 @@
 \begin{remark}
     Computation in this way can be expensive.
 \end{remark}
+\end{description}
+
+\subsection{Incremental gradient method}
+
+\begin{description}
 
     \item[Incremental gradient method] \marginnote{Incremental gradient method}
     At each iteration $k$, compute the direction by considering the loss of a single agent $i^k$:
     \[
@@ -469,6 +510,8 @@
 
 \end{description}
 
+\subsection{Stochastic gradient descent}
+
 \begin{description}
     \item[Stochastic gradient descent (SGD)] \marginnote{Stochastic gradient descent (SGD)}
     Instance of incremental gradient method where the selection rule follows an unknown distribution.
@@ -507,7 +550,12 @@
     \item[Mini-batch SGD] \marginnote{Mini-batch SGD}
     SGD where the update at each time step $k$ is based on a set $\mathcal{I}^k \subset \{ 1, \dots, N \}$ of realizations of $\mathcal{W}$:
     \[ \z^{k+1} = \z^k - \alpha \sum_{i \in \mathcal{I}^k} \nabla l(\z^k, w^i) \]
+\end{description}
+
+\subsection{Adaptive momentum}
+
+\begin{description}
 
     \item[Adaptive momentum (ADAM)] \marginnote{Adaptive momentum (ADAM)}
     Method based on the first and second momentum of the gradient:
     \[
@@ -533,39 +581,6 @@
 
 
 
-\subsection{Learning paradigms}
-
-\begin{description}
-    \item[Federated learning] \marginnote{Federated learning}
-    Problem where $N$ agents with their local and private data $\mathcal{D}^{i}$ want to learn a common set of parameters $\z^*$ based on the same loss function (evaluated on different data points):
-    \[
-        \min_\z \sum_{i=1}^{N} l(\z; \mathcal{D}^i)
-    \]
-    A centralized parameter server (master) is responsible for aggregating the estimates of the agents (e.g., pick some nodes and average them).
-    % \[
-    %     \z^{t+1} = \z^k - \alpha \sum_{i \in I_k} \nabla l(\z; \mathcal{D}^i, p^i)
-    % \]
-
-    \item[Distributed learning] \marginnote{Distributed learning}
-    Federated learning where there is no centralized entity and agents communicate with their neighbors only.
-\end{description}
-
-\begin{figure}[H]
-    \centering
-    \begin{subfigure}{0.45\linewidth}
-        \centering
-        \includegraphics[width=0.55\linewidth]{./img/federated_learning.png}
-        \caption{Federated learning}
-    \end{subfigure}
-    \begin{subfigure}{0.45\linewidth}
-        \centering
-        \includegraphics[width=0.7\linewidth]{./img/distributed_learning.png}
-        \caption{Distributed learning}
-    \end{subfigure}
-\end{figure}
-
-
-
 \section{Distributed cost-coupled/consensus optimization}
 
 \begin{description}
@@ -642,9 +657,6 @@
 
     Then, each agent converges to an optimal solution $\z^*$.
 \end{theorem}
-
-\subsection{Gradient tracking algorithm}
-
 \begin{theorem}
     The distributed gradient algorithm does not converge with a constant step size.
 
@@ -672,6 +684,8 @@
 \end{theorem}
 
 
+\subsection{Gradient tracking algorithm}
+
 \begin{description}
     \item[Dynamic average consensus] \marginnote{Dynamic average consensus}
     Consensus algorithm where each agent measures a signal $r_i^k$ and wants to estimate the average signal of all agents:
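For concreteness, a minimal sketch of the parameter-server paradigm introduced in the relocated "Learning paradigms" material: at each iteration the master picks a subset of agents, each selected agent evaluates the gradient of its local loss at the current parameters, and the master aggregates the results and takes a step, as in the mini-batch update \z^{k+1} = \z^k - \alpha \sum_{i \in \mathcal{I}^k} \nabla l(\z^k, w^i). The quadratic local losses, the synthetic private datasets, and the names (local_grad, ALPHA, BATCH) are illustrative assumptions, not taken from the notes.

import numpy as np

rng = np.random.default_rng(0)

N, d = 10, 3          # number of agents, parameter dimension
ALPHA = 0.05          # constant step size (assumed)
BATCH = 4             # number of agents sampled by the master per iteration (assumed)

# Synthetic private datasets D^i: agent i holds (A_i, b_i) and uses a least-squares loss (illustrative).
DATA = [(rng.normal(size=(20, d)), rng.normal(size=20)) for _ in range(N)]

def local_grad(z, i):
    """Gradient of agent i's local loss l(z; D^i), here a least-squares loss."""
    A, b = DATA[i]
    return A.T @ (A @ z - b) / len(b)

z = np.zeros(d)       # common parameter estimate kept by the parameter server
for k in range(500):
    picked = rng.choice(N, size=BATCH, replace=False)     # master picks a subset of agents
    direction = sum(local_grad(z, i) for i in picked)     # aggregate the returned local gradients
    z = z - ALPHA * direction                             # mini-batch step on the aggregated direction

Summing over all N agents at every iteration recovers the batch gradient method, while BATCH = 1 recovers the incremental/stochastic gradient update.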
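For the fully distributed setting touched at the end of the diff (gradient tracking built on dynamic average consensus, with neighbor-only communication), a sketch in the same spirit follows. This is the standard textbook gradient-tracking iteration with a constant step size; the exact formulation and notation in the notes may differ, and the ring-graph weight matrix W, the step size, and the quadratic losses are again illustrative assumptions.

import numpy as np

rng = np.random.default_rng(1)

N, d = 5, 3
ALPHA = 0.05      # constant step size (viable here, unlike for the plain distributed gradient algorithm)
DATA = [(rng.normal(size=(20, d)), rng.normal(size=20)) for _ in range(N)]

def local_grad(z, i):
    """Gradient of agent i's local least-squares loss (illustrative)."""
    A, b = DATA[i]
    return A.T @ (A @ z - b) / len(b)

# Doubly stochastic weights over a ring graph (illustrative choice of network).
W = np.zeros((N, N))
for i in range(N):
    W[i, i] = 0.5
    W[i, (i - 1) % N] = 0.25
    W[i, (i + 1) % N] = 0.25

x = np.zeros((N, d))                                   # x[i] is agent i's local estimate
g = np.array([local_grad(x[i], i) for i in range(N)])  # current local gradients
s = g.copy()                                           # trackers, initialized at the local gradients

for k in range(500):
    x = W @ x - ALPHA * s                              # consensus on estimates + step along tracked direction
    g_new = np.array([local_grad(x[i], i) for i in range(N)])
    s = W @ s + g_new - g                              # dynamic average consensus run on the local gradients
    g = g_new

Each agent only uses its own data plus the estimates and trackers of its two ring neighbors (the nonzero entries of its row of W), matching the neighbor-only communication of the distributed learning paradigm.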