Add Q-learning
@ -13,5 +13,6 @@
\input{./sections/_computer_vision.tex}
\input{./sections/_generative_models.tex}
\input{./sections/_sequence_modeling.tex}
\input{./sections/_reinforcement_learning.tex}

\end{document}
188 src/year1/deep-learning/sections/_reinforcement_learning.tex Normal file
@ -0,0 +1,188 @@
\chapter{Reinforcement learning}


\begin{description}
\item[Reinforcement learning (RL)] \marginnote{Reinforcement learning (RL)}
Learning a behavior (policy) by taking actions in an environment that changes state and responds with rewards.

\item[Policy] \marginnote{Policy}
Probability distribution $\pi(a_t | s_t)$ that, given the current state $s_t$, indicates
the likelihood of each action $a_t$.

\item[Future cumulative reward] \marginnote{Future cumulative reward}
Starting from a time step $t$, the future cumulative reward $R_t$ is the sum of all subsequent local rewards $r_i$:
\[ R_t = \sum_{i \geq t} r_i \]

\begin{description}
\item[Future discounted cumulative reward] \marginnote{Future discounted cumulative reward}
Takes into account the fact that future rewards are less certain than closer ones:
\[ R_t = \sum_{i \geq t} \gamma^{(i - t)} r_i \]
where $0 < \gamma \leq 1$ is the discount factor and the weight $\gamma^{(i - t)}$ decreases exponentially over time.
\end{description}
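
\begin{remark}
A small worked example with illustrative values: assuming a discount factor $\gamma = 0.9$ and local rewards $r_t = 1$, $r_{t+1} = 0$, $r_{t+2} = 2$ (zero afterwards), the future discounted cumulative reward is:
\[ R_t = \gamma^{(0)} \cdot 1 + \gamma^{(1)} \cdot 0 + \gamma^{(2)} \cdot 2 = 1 + 0 + 0.81 \cdot 2 = 2.62 \]
\end{remark}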

\item[Markov decision process] \marginnote{Markov decision process}
The environment can be modeled as a Markov decision process, where the next state and reward only depend on the current state and action (and not on the full history).
This is defined by the tuple $(\mathcal{S}, \mathcal{A}, \mathcal{R}, \mathcal{P}, \gamma)$
where:
\begin{itemize}
\item $\mathcal{S}$ is the set of possible states.
\item $\mathcal{A}$ is the set of possible actions.
\item $\mathcal{R}$ is the reward distribution given state and action.
\item $\mathcal{P}$ is the transition probability given state and action.
\item $\gamma$ is the discount factor.
\end{itemize}

\item[RL problem] \marginnote{RL problem}
Problem involving an agent that interacts with an environment.
At each time step $t$, the following happens:
\begin{enumerate}
\item From the current state $s_t$, the agent selects an action $a_t$ according to a policy $\pi(a_t | s_t)$.
\item The environment answers with a local reward $r_t \sim \mathcal{R}(r_t | s_t, a_t)$.
\item The environment samples the next state $s_{t+1} \sim \mathcal{P}(s_{t+1} | s_t, a_t)$.
\item The agent updates its policy accordingly.
\end{enumerate}

\begin{remark}
A policy defines a trajectory:
\[ (s_0, a_0) \mapsto (r_1, s_1, a_1) \mapsto \dots \]
\end{remark}
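
This interaction loop can be sketched in a few lines of Python. The sketch below is only illustrative: \texttt{env} is assumed to expose Gym-like \texttt{reset()}/\texttt{step()} methods, and \texttt{policy} and \texttt{update} are placeholder functions for the agent.
\begin{verbatim}
# Minimal sketch of one episode of agent-environment interaction.
def run_episode(env, policy, update):
    state = env.reset()
    done = False
    while not done:
        action = policy(state)                            # a_t ~ pi(a_t | s_t)
        next_state, reward, done = env.step(action)       # r_t and s_{t+1}
        update(state, action, reward, next_state, done)   # agent improves its policy
        state = next_state
\end{verbatim}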

\item[Episode] \marginnote{Episode}
Sequence of interactions between agent and environment from an initial state to a final state.

\begin{remark}
It is roughly the equivalent of an epoch.
\end{remark}

\item[Optimal policy] \marginnote{Optimal policy}
Policy $\pi^*$ that maximizes the expected future reward over all possible trajectories:
\[ \pi^* = \arg\max_\pi \mathbb{E}\left[ \sum_{t \geq 0} \gamma^{(t)} r_t \right] \]

\item[Model-based approach] \marginnote{Model-based}
Method that needs to learn the transition probability $\mathcal{P}(s_{t+1} | s_t, a_t)$.

\item[Model-free approach] \marginnote{Model-free}
Method that only learns to make actions based on past experience.

There are mainly two techniques:
\begin{descriptionlist}
\item[Value-based] \marginnote{Value-based}
Learn a value function $V(s)$ that evaluates each state $s$.
The policy is implicit: the best action is the one that leads to the state with the best evaluation.

\item[Policy-based] \marginnote{Policy-based}
Directly improve the probability distribution defined by the policy.
\end{descriptionlist}
\end{description}


\section{$Q$-learning}

$Q$-learning is a value-based approach that learns a function $Q$ that acts as a proxy for the value function $V$.

\begin{description}
\item[$Q$-value] \marginnote{$Q$-value}
Measures the goodness of an action $a$ in the state $s$ by considering its future reward:
\[ Q(s, a) = \mathbb{E}_{\substack{s_0 = s\\a_0 = a}} \left[ \sum_{t \geq 0} \gamma^{(t)} r_t \right] \]

\item[Value function] \marginnote{Value function}
Measures the goodness of a state $s$ by considering its future reward:
\[ V(s) = \mathbb{E}_{s_0 = s} \left[ \sum_{t \geq 0} \gamma^{(t)} r_t \right] \]

Given $Q$, $V$ can be computed as:
\[ V(s) = \sum_{a} \pi(a | s) Q(s, a) \]

\begin{remark}
Given $V$, $Q$ can be computed as:
\[ Q(s_t, a_t) = \mathbb{E}[r_t | s_t, a_t] + \gamma \sum_{s_{t+1}} \mathcal{P}(s_{t+1} | s_t, a_t) V(s_{t+1}) \]
but this requires a model-based approach as $\mathcal{P}$ is needed.
\end{remark}
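
With a finite (tabular) number of states and actions, the relation between $V$, $Q$ and $\pi$ is just a weighted sum. A minimal NumPy sketch, where the $Q$-table and the policy are stored as arrays and the numbers are illustrative placeholders:
\begin{verbatim}
import numpy as np

# q[s, a] holds Q(s, a); pi[s, a] holds pi(a | s)  (3 states, 2 actions).
q  = np.array([[1.0, 0.5], [0.2, 0.8], [0.0, 0.3]])
pi = np.array([[0.9, 0.1], [0.5, 0.5], [0.2, 0.8]])

# V(s) = sum_a pi(a | s) * Q(s, a), computed for every state at once.
v = (pi * q).sum(axis=1)   # approximately [0.95, 0.5, 0.24]
\end{verbatim}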

\item[Optimal $Q$-value] \marginnote{Optimal $Q$-value}
The optimal $Q$-value $Q^*$ is the one that maximizes the expected cumulative reward
achievable starting from state $s$ with action $a$:
\[ Q^*(s, a) = \max_\pi \mathbb{E}_{\substack{s_0 = s\\a_0 = a}} \left[ \sum_{t \geq 0} \gamma^{(t)} r_t \right] \]

\item[Optimal policy] \marginnote{Optimal policy}
The optimal policy $\pi^*$ is the one that always takes the best action according to the optimal $Q$-value:
\[ \pi^*(s) = \arg\max_a Q^*(s, a) \]
\end{description}



\subsection{Training}

\begin{description}
\item[Bellman equation] \marginnote{Bellman equation}
Expresses the optimal $Q$-value in terms of subproblems:
\[ Q^*(s_t, a_t) = \mathbb{E}_{s_{t+1}} \left[ r_t + \gamma \max_{a_{t+1}} Q^*(s_{t+1}, a_{t+1}) \right] \]
where $\max_{a_{t+1}} Q^*(s_{t+1}, a_{t+1}) = V^*(s_{t+1})$ is the optimal future cumulative reward achievable from $s_{t+1}$ (i.e. the one obtained by taking the best action $a_{t+1}$).

$Q^*$ can then be iteratively approximated as follows:
\[ Q^{(i+1)}(s_t, a_t) = Q^{(i)}(s_t, a_t) + \alpha\left( r_t + \gamma \max_{a_{t+1}} Q^{(i)}(s_{t+1}, a_{t+1}) - Q^{(i)}(s_t, a_t) \right) \]
where:
\begin{itemize}
\item $\alpha$ is the learning rate.
\item The update step aims to impose $Q^{(i)}(s_t, a_t) = r_t + \gamma \max_{a_{t+1}} Q^{(i)}(s_{t+1}, a_{t+1})$
(i.e. respect the Bellman equation).
\end{itemize}
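
The update rule translates directly into a single tabular update. A minimal sketch, assuming the $Q$-table is stored as a NumPy array indexed by state and action (function and parameter names are illustrative):
\begin{verbatim}
# One Q-learning update from a single transition (s, a, r, s_next, done).
def q_update(q, s, a, r, s_next, done, alpha=0.1, gamma=0.99):
    # Bootstrapped target r_t + gamma * max_a' Q(s_{t+1}, a');
    # no bootstrapping when the episode has ended.
    target = r if done else r + gamma * q[s_next].max()
    # Move Q(s_t, a_t) towards the target by a step of size alpha.
    q[s, a] += alpha * (target - q[s, a])
\end{verbatim}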

\item[$Q$-learning transition] \marginnote{$Q$-learning transition}
Tuple of the form:
\[ (s_t, a_t, r_t, T, s_{t+1}) \]
where:
\begin{itemize}
\item $s_t$ is the current state.
\item $a_t$ is the action performed at the current step.
\item $r_t$ is the reward at the current step.
\item $T$ is a boolean indicating if the episode has ended.
\item $s_{t+1}$ is the next state after doing the action.
\end{itemize}

For training, transitions are collected in a buffer by exploring the environment.
Collected transitions can then be replayed in any order (experience replay), which has the advantages of:
\begin{itemize}
\item Avoiding the use of correlated consecutive samples.
\item Avoiding biases caused by exploiting an unbalanced set of transitions.
\end{itemize}
A sketch of such a replay buffer is shown after the remark below.

\begin{remark}
$Q$-learning is an off-policy method: training does not rely on the policy used to collect the transitions and only needs local transitions.
\end{remark}
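
A minimal sketch of a replay buffer in Python (class and method names are illustrative; the capacity is an arbitrary example value):
\begin{verbatim}
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity=10000):
        # Old transitions are dropped once the buffer is full.
        self.buffer = deque(maxlen=capacity)

    def push(self, s, a, r, done, s_next):
        # Store one transition (s_t, a_t, r_t, T, s_{t+1}).
        self.buffer.append((s, a, r, done, s_next))

    def sample(self, batch_size):
        # Uniformly sample a mini-batch of past transitions, in any order.
        return random.sample(self.buffer, batch_size)
\end{verbatim}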

\item[$\varepsilon$-greedy strategy] \marginnote{$\varepsilon$-greedy strategy}
Introduce an exploration rate $\varepsilon$, initially set to 1 and progressively reduced during training.
$\varepsilon$ is the probability of choosing a random action (exploration) instead of choosing the best-known action (exploitation).
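
A minimal sketch of this action selection rule (names are illustrative; \texttt{q} is assumed to be a tabular $Q$ indexed as \texttt{q[s][a]}):
\begin{verbatim}
import random

def epsilon_greedy(q, s, actions, epsilon):
    if random.random() < epsilon:
        return random.choice(actions)               # exploration
    return max(actions, key=lambda a: q[s][a])      # exploitation: argmax_a Q(s, a)
\end{verbatim}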

\item[Algorithm] \marginnote{$Q$-learning training}
Given:
\begin{itemize}
\item A $Q$-table (to store the $Q$-values $Q(s, a)$),
\item A replay buffer $D$,
\item The initial state $s_0$,
\item The learning rate $\alpha$ and the discount factor $\gamma$,
\item The exploration rate $\varepsilon$,
\end{itemize}
an episode of $Q$-learning training does the following:
\begin{itemize}
\item While the episode is not terminated:
\begin{enumerate}
\item Choose the next action as:
\[ a_t = \begin{cases}
\texttt{random}(\mathcal{A}) & \text{with probability $\varepsilon$} \\
\arg\max_a Q(s_t, a) & \text{with probability $1 - \varepsilon$} \\
\end{cases} \]
\item Perform $a_t$ and observe the reward $r_t$ and the next state $s_{t+1}$.
\item Store $(s_t, a_t, r_t, T, s_{t+1})$ in $D$.
\item Sample a random mini-batch $B$ from $D$. For each transition in $B$:
\begin{enumerate}
\item Estimate the cumulative future reward:
\[ R = \begin{cases}
r_t & \text{if the transition is terminal ($T$ is true)} \\
r_t + \gamma \max_{a_{t+1}} Q(s_{t+1}, a_{t+1}) & \text{otherwise} \\
\end{cases} \]
\item Update the $Q$-table as:
\[ Q(s_t, a_t) = Q(s_t, a_t) + \alpha(R - Q(s_t, a_t)) \]
\end{enumerate}
\item Decrease $\varepsilon$.
\end{enumerate}
\end{itemize}
A Python sketch of this training loop is given after this description.
\end{description}
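
Putting the pieces together, the following is a minimal end-to-end sketch of the training loop above for a small discrete problem. The environment object, its Gym-like \texttt{reset()}/\texttt{step()} interface and all hyperparameter values are illustrative assumptions, not part of the original algorithm:
\begin{verbatim}
import random
import numpy as np
from collections import deque

def train(env, n_states, n_actions, episodes=500, alpha=0.1, gamma=0.99,
          eps=1.0, eps_decay=0.995, eps_min=0.05,
          buffer_size=10000, batch_size=32):
    q = np.zeros((n_states, n_actions))   # Q-table
    buffer = deque(maxlen=buffer_size)    # replay buffer D

    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            # Epsilon-greedy action selection.
            if random.random() < eps:
                a = random.randrange(n_actions)
            else:
                a = int(q[s].argmax())

            s_next, r, done = env.step(a)           # observe r_t and s_{t+1}
            buffer.append((s, a, r, done, s_next))  # store the transition in D

            # Replay a random mini-batch of past transitions.
            if len(buffer) >= batch_size:
                for bs, ba, br, bdone, bs_next in random.sample(buffer, batch_size):
                    target = br if bdone else br + gamma * q[bs_next].max()
                    q[bs, ba] += alpha * (target - q[bs, ba])

            s = s_next
            eps = max(eps_min, eps * eps_decay)     # decrease the exploration rate

    return q
\end{verbatim}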