Add CN2 reinforcement learning

This commit is contained in:
2024-05-26 18:38:29 +02:00
parent 767bd94ee6
commit d292a243e3
39 changed files with 899 additions and 1 deletions

View File

@ -16,6 +16,7 @@
\input{./sections/_object_recognition.tex}
\input{./sections/_nn_recognition.tex}
\input{./sections/_dopamine.tex}
\printbibliography[heading=bibintoc]

36 binary image files added (previews not shown).

View File

@ -81,4 +81,157 @@
pages = {e2014196118},
year = {2021},
doi = {10.1073/pnas.2014196118},
}
@article{monkey_dopamine,
title = {Dopamine neurons of the monkey midbrain: contingencies of responses to active touch during self-initiated arm movements},
author = {Romo, R. and Schultz, W.},
journal = {Journal of Neurophysiology},
volume = {63},
number = {3},
pages = {592--606},
year = {1990},
doi = {10.1152/jn.1990.63.3.592},
}
@article{dopamine_transfer,
title = {A Neural Substrate of Prediction and Reward},
author = {Wolfram Schultz and Peter Dayan and P. Read Montague},
journal = {Science},
volume = {275},
number = {5306},
pages = {1593--1599},
year = {1997},
doi = {10.1126/science.275.5306.1593},
}
@article{dopamine_transfer2,
title = {Behavioral Theories and the Neurophysiology of Reward},
author = {Schultz, Wolfram},
journal = {Annual Review of Psychology},
year = {2006},
volume = {57},
pages = {87--115},
publisher = {Annual Reviews},
doi = {10.1146/annurev.psych.56.091103.070229},
}
@article{dopamine_bidirectional,
title = {Adaptive Coding of Reward Value by Dopamine Neurons},
author = {Philippe N. Tobler and Christopher D. Fiorillo and Wolfram Schultz},
journal = {Science},
volume = {307},
number = {5715},
pages = {1642--1645},
year = {2005},
doi = {10.1126/science.1105370},
}
@article{dopamine_probability,
title = {Discrete Coding of Reward Probability and Uncertainty by Dopamine Neurons},
author = {Christopher D. Fiorillo and Philippe N. Tobler and Wolfram Schultz},
journal = {Science},
volume = {299},
number = {5614},
pages = {1898--1902},
year = {2003},
doi = {10.1126/science.1077349},
}
@article{dopamine_temporal,
title = {Dopamine neurons report an error in the temporal prediction of reward during learning},
author = {Hollerman, Jeffrey R. and Schultz, Wolfram},
journal = {Nature Neuroscience},
year = {1998},
day = {01},
volume = {1},
number = {4},
pages = {304--309},
doi = {10.1038/1124},
}
@article{saccade,
title = {Dopamine neurons can represent context-dependent prediction error},
author = {Nakahara, Hiroyuki and Itoh, Hideaki and Kawagoe, Reiko and Takikawa, Yoriko and Hikosaka, Okihide},
journal = {Neuron},
publisher = {Elsevier BV},
volume = {41},
number = {2},
pages = {269--280},
year = {2004},
doi = {10.1016/S0896-6273(03)00869-9},
}
@article{indirect_learning,
title = {Midbrain dopamine neurons compute inferred and cached value prediction errors in a common framework},
author = {Sadacca, Brian F and Jones, Joshua L and Schoenbaum, Geoffrey},
volume = {5},
year = {2016},
pages = {e13665},
journal = {eLife},
issn = {2050-084X},
publisher = {eLife Sciences Publications, Ltd},
doi = {10.7554/eLife.13665},
}
@article{dopamine_hidden,
title = {Dopamine reward prediction errors reflect hidden-state inference across time},
author = {Starkweather, Clara Kwon and Babayan, Benedicte M. and Uchida, Naoshige and Gershman, Samuel J.},
journal = {Nature Neuroscience},
year = {2017},
volume = {20},
number = {4},
pages = {581--589},
issn = {1546-1726},
doi = {10.1038/nn.4520},
}
@article{dopamine_general,
title = {Optogenetic blockade of dopamine transients prevents learning induced by changes in reward features},
author = {Chang, Chun Yun and Gardner, Matthew and Di Tillio, Maria Gonzalez and Schoenbaum, Geoffrey},
journal = {Current Biology},
publisher = {Elsevier BV},
volume = {27},
number = {22},
pages = {3480--3486.e3},
year = {2017},
doi = {10.1016/j.cub.2017.09.049},
}
@article{sr_learner,
title = {The successor representation in human reinforcement learning},
author = {Momennejad, I. and Russek, E. M. and Cheong, J. H. and Botvinick, M. M. and Daw, N. D. and Gershman, S. J.},
journal = {Nature Human Behaviour},
year = {2017},
volume = {1},
number = {9},
pages = {680--692},
issn = {2397-3374},
doi = {10.1038/s41562-017-0180-8},
}
@article{distributional_rl_brain,
title = {A distributional code for value in dopamine-based reinforcement learning},
author = {Dabney, Will and Kurth-Nelson, Zeb and Uchida, Naoshige and Starkweather, Clara Kwon and Hassabis, Demis and Munos, R{\'e}mi and Botvinick, Matthew},
journal = {Nature},
publisher = {Springer Science and Business Media LLC},
volume = {577},
number = {7792},
pages = {671--675},
year = {2020},
doi = {10.1038/s41586-019-1924-6},
}

View File

@ -0,0 +1,744 @@
\chapter{Dopamine in reinforcement learning}
\section{Decision making}
\begin{description}
\item[Decision-making] \marginnote{Decision-making}
Voluntary process that leads to the selection of an action based on sensory information.
Decisions are inherently non-deterministic as:
\begin{itemize}
\item Agents make inconsistent choices.
\item Agents make choices unaware of the full consequences (uncertainty).
\item Internal and external signals are noisy.
\end{itemize}
\begin{remark}
From an evolutionary point of view, stochasticity in decisions increases the chances of survival.
\end{remark}
\begin{remark}
Studying decision-making within neuroscience is, in practice, studying cognition:
it involves investigating the neural processes underlying a variety of mental functions.
\end{remark}
\item[Perceptual decision-making] \marginnote{Perceptual decision-making}
An agent selects between actions $A$ and $B$ based on weak or noisy external signals (e.g. ``do you see $A$ or $B$?'').
In this case, uncertainty comes from the external stimulus.
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/perceptual_dm.png}
\end{figure}
\item[Value-based decision-making] \marginnote{Value-based decision-making}
An agent selects between actions $A$ and $B$ based on its subjective preferences (e.g. ``do you prefer $A$ or $B$?'').
In this case, uncertainty comes from the value associated with the action.
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/value_dm.png}
\end{figure}
\item[Decision-making processes]
Decision-making involves the following processes:
\begin{descriptionlist}
\item[Representation] \marginnote{Representation}
States, actions, and internal and external factors are identified.
\item[Valuation] \marginnote{Valuation}
A value is assigned to the possible alternatives.
\item[Choice] \marginnote{Choice}
Values are compared and a proper action is selected.
\item[Outcome evaluation] \marginnote{Outcome evaluation}
After performing the action, the desirability of the outcome is measured (reward prediction error).
\item[Learning] \marginnote{Learning}
Feedback signals are used to update the processes and improve the quality of future decisions.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/dm_processes.png}
\end{figure}
\item[Valuation circuitry] \marginnote{Valuation circuitry}
Involves neurons sensitive to reward value.
They are spread across the brain, in both cortical and subcortical regions.
\begin{figure}[H]
\centering
\includegraphics[width=0.38\linewidth]{./img/valuation_circuitry.png}
\end{figure}
\item[Decision-making theories] \marginnote{Decision-making theories}
\begin{descriptionlist}
\item[Economic learning] \marginnote{Economic learning}
Decision-making involving the selection of an action with the maximum utility.
\item[Reinforcement learning] \marginnote{Reinforcement learning}
Decision-making involving the probabilistic selection of an action.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/dm_theories.png}
\end{figure}
\end{description}
\section{Reinforcement learning}
\begin{description}
\item[Reinforcement learning (RL)] \marginnote{Reinforcement learning (RL)}
Learn a mapping between states and actions aiming to maximize the expected cumulative future reward.
\begin{description}
\item[Markov conditional independence] At any time step, all future states and rewards only depend on the current state and action.
\end{description}
\item[Bellman equation] \marginnote{Bellman equation}
Given an action $a_t$ performed in the state $s_t$ following a policy $\pi$,
the expected future reward is given by the following equation:
\[ Q_\pi(s_t, a_t) = r_t + \gamma \sum_{s_{t+1}} \prob{s_{t+1} \mid s_t, a_t} Q_\pi(s_{t+1}, \pi(s_{t+1})) \]
where $\gamma$ is a discount factor.
\end{description}
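To make the equation concrete, the following Python sketch (a minimal illustration on a hypothetical two-state, two-action MDP; all numbers are made up) evaluates a fixed policy by iterating the Bellman equation with a known transition model, i.e. it computes the right-hand side directly:
\begin{verbatim}
import numpy as np

# Hypothetical toy MDP (illustration only):
# P[s, a, s'] = transition probability, R[s, a] = immediate reward.
P = np.array([[[0.9, 0.1], [0.2, 0.8]],
              [[0.5, 0.5], [0.1, 0.9]]])
R = np.array([[1.0, 0.0],
              [0.0, 2.0]])
policy = np.array([0, 1])   # deterministic policy pi(s)
gamma = 0.9                 # discount factor

# Iterate the Bellman equation until the Q-values stop changing.
Q = np.zeros((2, 2))
for _ in range(1000):
    Q_new = np.empty_like(Q)
    for s in range(2):
        for a in range(2):
            # Q(s, a) = r + gamma * sum_s' P(s'|s,a) * Q(s', pi(s'))
            Q_new[s, a] = R[s, a] + gamma * sum(
                P[s, a, s2] * Q[s2, policy[s2]] for s2 in range(2))
    if np.max(np.abs(Q_new - Q)) < 1e-8:
        break
    Q = Q_new
print(Q)   # expected cumulative discounted reward for each (s, a)
\end{verbatim}
A model-based learner must first estimate the transition distribution and the rewards before it can run this computation; a model-free learner skips it entirely and estimates $Q_\pi$ directly from samples, as discussed in the next subsection.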
\subsection{RL classes}
\begin{description}
\item[Model-based] \marginnote{Model-based}
Aims to learn the right-hand side of the Bellman equation.
This requires knowing the state transition distribution $\mathcal{P}$ which is costly.
\item[Model-free] \marginnote{Model-free}
Aims to directly learn the left-hand side of the Bellman equation by estimating $Q_\pi$ from experience.
Agents average over the states, actions and rewards they experience to update a table of long-run reward predictions that
approximates the expectation on the right-hand side of the Bellman equation.
\begin{description}
\item[Temporal difference learning] \marginnote{Temporal difference learning}
The reward prediction error at time $t$ is obtained by comparing the expected reward at time $t$ and at the next time step $t+1$:
\[ \delta_t = r_t + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \]
\end{description}
\end{description}
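The Python sketch below instantiates this update in a SARSA-style loop over a cached Q-table. It is only a minimal sketch: the environment interface (\texttt{reset}, \texttt{step} and the number of actions) is an assumed placeholder, not something defined in these notes.
\begin{verbatim}
import random
from collections import defaultdict

def td_learning(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    # Assumed env interface: reset() -> state,
    # step(action) -> (next_state, reward, done), actions are 0..n_actions-1.
    Q = defaultdict(float)          # cached long-run reward predictions

    def choose(state):              # epsilon-greedy action selection
        if random.random() < epsilon:
            return random.randrange(env.n_actions)
        return max(range(env.n_actions), key=lambda a: Q[(state, a)])

    for _ in range(episodes):
        s = env.reset()
        a = choose(s)
        done = False
        while not done:
            s_next, r, done = env.step(a)
            a_next = choose(s_next)
            # delta_t = r_t + gamma * Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t)
            delta = r + (0.0 if done else gamma * Q[(s_next, a_next)]) \
                    - Q[(s, a)]
            Q[(s, a)] += alpha * delta   # nudge the cached value by alpha*delta
            s, a = s_next, a_next
    return Q
\end{verbatim}
Note that the transition model never appears: only experienced states, actions and rewards are used.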
\begin{example}[Rat in maze]
A rat has to navigate a maze with two crossroads and two different outcomes.
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/rat_maze1.png}
\end{figure}
Two strategies can be developed:
\begin{descriptionlist}
\item[Model-based]
By learning the model of the environment, the rat can decide its path by using a search tree.
The path can be changed depending on its motivational state (e.g. hungry or thirsty) showing a goal-directed behavior.
\item[Model-free]
The value of each state-action pair is stored and action selection consists of choosing the highest cached value at the current state.
Values do not consider the identity of the outcome and are therefore decoupled from the motivational state of the animal.
Nevertheless, if the motivational state is stored as part of the environmental state, the animal would be able to account for it.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/rat_maze2.png}
\end{figure}
\end{example}
\section{Dopaminergic system}
There is strong evidence that the dopaminergic system is heavily involved in reinforcement learning,
both for natural rewards and for addictive drugs.
\begin{description}
\item[Dopamine pathways] \marginnote{Dopamine pathways}
Dopamine projections include:
\begin{descriptionlist}
\item[Nigrostriatal system] Mostly associated with motor functions (action policy).
\item[Meso-cortico-limbic system] Mostly associated with motivation (value function).
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/dopamine_pathways.png}
\end{figure}
\item[Actor/critic architecture] \marginnote{Actor/critic architecture}
Model with two components:
\begin{descriptionlist}
\item[Critic]
Takes as input a state and is responsible for learning and storing state values.
It also receives the reward from the environment and
computes, through a temporal difference module,
the prediction error $\delta_t$ that is used to update its own state values and train the actor.
\item[Actor]
Takes as input a state and maps it to an action policy $\pi(a, s)$ that is used to determine the action to perform.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/actor_critic.png}
\caption{
\parbox[t]{0.6\linewidth}{
Actor/critic architecture (A) and a possible mapping of the architecture onto neural substrates (B)
}
}
\end{figure}
\end{description}
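The following Python sketch shows the same scheme in tabular form (assuming the same hypothetical environment interface as in the earlier sketch, plus the number of states). The critic computes the TD error $\delta_t$, which updates its state values and trains the actor's softmax action preferences; this is only an illustration of the diagram above, not a claim about the neural implementation.
\begin{verbatim}
import numpy as np

def actor_critic(env, episodes=500, alpha_v=0.1, alpha_pi=0.05, gamma=0.9):
    # Assumed env interface: reset() -> state, step(a) -> (next_state, reward,
    # done), plus n_states and n_actions attributes (states are 0..n_states-1).
    V = np.zeros(env.n_states)                   # critic: state values
    H = np.zeros((env.n_states, env.n_actions))  # actor: action preferences

    def policy(s):                               # softmax over preferences
        p = np.exp(H[s] - H[s].max())
        return p / p.sum()

    for _ in range(episodes):
        s = env.reset()
        done = False
        while not done:
            p = policy(s)
            a = np.random.choice(env.n_actions, p=p)
            s_next, r, done = env.step(a)
            # Critic: TD error (the putative dopaminergic teaching signal).
            delta = r + (0.0 if done else gamma * V[s_next]) - V[s]
            V[s] += alpha_v * delta              # update the state value
            # Actor: shift the preference of the taken action by delta.
            H[s] -= alpha_pi * delta * p
            H[s, a] += alpha_pi * delta
            s = s_next
    return V, H
\end{verbatim}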
\begin{description}
\item[Dopamine properties]
\phantom{}
\begin{description}
\item[Phasic response] \marginnote{Dopamine phasic response}
Depending on the stimulus, dopamine neurons can show excitatory or inhibitory responses.
This can be interpreted as a reward prediction error.
\begin{remark}
About 75\% of dopamine neurons are activated when there is a rewarding stimulus
and about 14\% of dopamine neurons are activated in response to an aversive stimulus.
\end{remark}
\begin{casestudy}[Dopamine as reward prediction error \cite{monkey_dopamine}]
A monkey has to reach into a box and touch its content without being able to see it.
It has been observed that dopamine neurons respond differently depending on the content of the box.
This is consistent with dopamine acting as a prediction error signal.
\begin{figure}[H]
\centering
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{../module1/img/dopamine_monkey1.png}
\end{subfigure}
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{../module1/img/dopamine_monkey2.png}
\end{subfigure}
\end{figure}
\end{casestudy}
\item[Bidirectional prediction] \marginnote{Dopamine bidirectional prediction}
Dopamine captures both an improvement (positive prediction error) and a worsening (negative prediction error) of the reward.
\begin{casestudy}[Dopamine bidirectional prediction error \cite{dopamine_bidirectional}]
It has been observed that the dopaminergic response differs depending on the amount of reward.
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{../module1/img/dopamine_expected2.png}
\caption{
\parbox[t]{0.65\linewidth}{
Dopamine response of a monkey trained on a medium amount of reward
}
}
\end{figure}
\end{casestudy}
\item[Transfer] \marginnote{Dopamine transfer}
Dopaminergic activity shifts from responding to the reward to responding to the conditioned stimulus that predicts it.
\begin{casestudy}[Dopamine transfer \cite{dopamine_transfer, dopamine_transfer2}]
It has been seen that the dopaminergic response transfers from the moment of receiving the reward to the stimuli associated with it (CS).
This is in line with the temporal difference model.
\begin{figure}[H]
\centering
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.8\linewidth]{../module1/img/dopamine_transfer_cs.png}
\end{subfigure}
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/dopamine_transfer.png}
\end{subfigure}
\end{figure}
\end{casestudy}
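This transfer is exactly what a temporal-difference model predicts. The following Python sketch (a minimal simulation with made-up trial timings, not the analysis from the cited studies) uses a tapped-delay-line representation of the time elapsed since the CS: before learning, the prediction error peaks at the time of the reward; after learning, it peaks at the CS.
\begin{verbatim}
import numpy as np

T, cs_time, reward_time = 20, 5, 15     # time steps in one trial (made up)
gamma, alpha, n_trials = 1.0, 0.1, 300

w = np.zeros(T)   # one value weight per time step after CS onset

def run_trial(w, learn=True):
    delta_trace = np.zeros(T)
    for t in range(T - 1):
        i, i_next = t - cs_time, t + 1 - cs_time      # indices into w
        V_t = w[i] if i >= 0 else 0.0
        V_next = w[i_next] if i_next >= 0 else 0.0
        r = 1.0 if t + 1 == reward_time else 0.0      # reward delivery
        delta = r + gamma * V_next - V_t              # TD error at time t
        delta_trace[t] = delta
        if learn and i >= 0:
            w[i] += alpha * delta
    return delta_trace

before = run_trial(w, learn=False)      # untrained trial
for _ in range(n_trials):
    run_trial(w)                        # training trials update w in place
after = run_trial(w, learn=False)       # trained trial
print("peak TD error before learning at t =", int(before.argmax()))  # ~reward
print("peak TD error after  learning at t =", int(after.argmax()))   # ~CS
\end{verbatim}
The error at the CS cannot be learned away because nothing before the CS predicts it, mirroring the persistent dopaminergic response to the conditioned stimulus.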
\item[Probability encoding] \marginnote{Probability encoding}
The dopaminergic response varies with the reward probability.
\begin{casestudy}[Dopamine probability encoding \cite{dopamine_probability}]
It has been shown that dopamine responds differently based on the probability of receiving a reward.
For high uncertainty (50\% probability of reward), a sustained (tonic) response has been observed that starts at the CS and ramps up until the time of the reward.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{../module1/img/dopamine_probability.png}
\end{figure}
\end{casestudy}
\item[Temporal prediction] \marginnote{Dopamine temporal prediction}
Apart from encoding the unexpectedness of an event occurring, dopamine also accounts for the time the reward is expected to be delivered,
and responds accordingly if the delivery happens earlier or later.
\begin{casestudy}[Dopamine temporal prediction \cite{dopamine_temporal}]
It has been shown that dopamine responds differently based on the time the reward is delivered.
If the delivery happens earlier, the dopaminergic response increases.
On the other hand, if the delivery happens later, dopamine neurons first go through a depressed phase at the time the reward was originally expected.
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{../module1/img/dopamine_timing.png}
\end{figure}
\end{casestudy}
\end{description}
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/dopamine_flow.png}
\caption{Dopamine flow in the dopaminergic system}
\end{figure}
\begin{remark}[PFC neurons]
Differently from dopamine neurons, PFC neurons during learning fire in response to the reward and progressively start to fire also at the CS.
In addition, the strength of the response does not depend on the expectedness of the reward
(i.e. it only acts as a prediction for the reward and not for the error).
\begin{figure}[H]
\centering
\begin{subfigure}{0.34\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/pfc_learning.png}
\caption{PFC response during learning}
\end{subfigure}
\begin{subfigure}{0.62\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/pfc_vs_dopamine.png}
\caption{PFC vs dopamine (DA)}
\end{subfigure}
\end{figure}
\end{remark}
\section{Reward prediction error (RPE) theory of dopamine}
\subsection{Dopamine is not fully model-free}
There is strong evidence that midbrain dopamine is used to report RPE as in model-free RL.
RPE theory of dopamine states that:
\begin{itemize}
\item Dopamine reflects the value of the observable state, which can be seen as a quantitative summary of future reward.
\item State values are directly learned through experience.
\item Dopamine only signals surprising events that bring a reward.
\item Dopamine does not make inferences on the model of the environment.
\end{itemize}
However, individuals also learn a model of the world (e.g. a cognitive map), and this knowledge can affect the neuronal prediction error.
There is evidence that this acquisition involves cortical signals (e.g. PFC) rather than dopamine;
nevertheless, dopamine still seems to integrate predictive information derived from such models.
\begin{casestudy}[Monkey saccade \cite{saccade}]
Monkeys are required to solve a memory-guided saccade task where, after fixation,
a light is flashed in one of four directions, indicating the saccade to be made after the fixation point goes off.
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/saccade1.png}
\caption{Structure of the task}
\end{figure}
Experiments are done in sub-blocks of four trials, each corresponding to a direction.
Moreover, the probability of reward increases with the number of non-rewarded trials (post-reward trial number, \texttt{PNR}).
If $\texttt{PNR} = 1$, the probability of reward is the lowest ($0.0625$), while if $\texttt{PNR} = 7$, the reward probability is $1.0$.
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/saccade2.png}
\caption{Structure of a block}
\end{figure}
It is expected that the animal's reward prediction increases after each non-rewarded trial.
In other words, since the reward becomes more likely after each non-rewarded trial, the positive prediction error at reward delivery should decrease and
the negative prediction error at reward omission should become stronger (i.e. more negative).
Results show that, as the number of non-rewarded trials grows, dopamine neurons are less active when the reward is delivered
and more depressed when the reward is omitted.
\begin{figure}[H]
\centering
\includegraphics[width=0.25\linewidth]{./img/saccade3.png}
\end{figure}
The results are in contrast with an exclusively model-free view of dopamine as, if this were the case,
learning would rely only on the past non-rewarded trials, lowering the value estimate and therefore causing the positive prediction error to increase and the negative prediction error to become weaker.
Therefore, dopamine might process prediction error in both model-free and model-based approaches.
\end{casestudy}
\begin{casestudy}[Dopamine indirect learning \cite{indirect_learning}]
Rats are exposed to:
\begin{descriptionlist}
\item[Pre-conditioning] Sound stimuli $A \mapsto B$ and $C \mapsto D$ are paired together.
\item[Conditioning] The stimulus $B$ is paired with a reward.
\end{descriptionlist}
Results show that rats respond to both $B$ and $A$ in a correlated manner.
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/dopamine_indirect1.png}
\end{figure}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/dopamine_indirect2.png}
\end{figure}
The results show that dopamine might also reflect values learned indirectly.
This is in contrast with the temporal difference learning of model-free RL in which only directly experienced states are learned.
\end{casestudy}
\begin{casestudy}[Dopamine RPE reflects inference over hidden states \cite{dopamine_hidden}]
Rats are trained to associate odors with rewards.
Two types of tasks are considered:
\begin{descriptionlist}
\item[Task 1]
Odors are always associated with a reward.
The reward for odor $A$ is delivered after a delay sampled from a Gaussian distribution, the rewards for odors $B$ and $C$ arrive after fixed delays, and odor $D$ is an unrewarded control.
\item[Task 2]
As above, but odors are associated with a reward $90\%$ of the time.
\end{descriptionlist}
The period in which no reward is expected is called the inter-trial interval (ITI), while the period in which the animal expects a reward (i.e. after the odor onset) is the inter-stimulus interval (ISI).
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/dopamine_hidden1.png}
\caption{Tasks representation}
\end{figure}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/dopamine_hidden2.png}
\caption{
\parbox[t]{0.7\linewidth}{
Licking behavior in the two tasks.
It can be seen that, for each odor, licking depends on the time of arrival of the reward.
On task 2, licking is more uncertain.
}
}
\end{figure}
Considering odor $A$, results show that:
\begin{itemize}
\item For task 1, the dopaminergic response gets smaller over time within the ISI.
\item For task 2, the dopaminergic response grows over time, hinting at the fact that some form of inference about the state is being made.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/dopamine_hidden3.png}
\end{figure}
An explanation is that the animal has to solve the problem of determining whether it is in an ITI or ISI period:
\begin{itemize}
\item For task 1, the rat can easily determine in which period it is.
\item For task 2, as the reward is not always delivered, it is not always clear whether the animal is in the ISI or the ITI.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/dopamine_hidden4.png}
\end{figure}
The dopaminergic activity for the two tasks can be explained as follows:
\begin{itemize}
\item For task 1, after the stimulus, the probability of receiving a reward increases over time.
Therefore, the RPE is increasingly suppressed.
\item For task 2, as the reward fails to arrive, the belief state progressively shifts towards the ITI state.
Therefore, if the reward is delivered later, the RPE is high as it was unexpected.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.55\linewidth]{./img/dopamine_hidden5.png}
\caption{
\parbox[t]{0.6\linewidth}{
Experiment represented as sub-states (top) and RPE to a reward over time (bottom)
}
}
\end{figure}
These results hint at the fact that:
\begin{itemize}
\item The brain is not limited to passively observing the environment but also makes latent inferences.
\item The results can be modeled using a temporal difference model that incorporates hidden-state inference.
\end{itemize}
\end{casestudy}
\subsection{Dopamine as generalized prediction error}
Dopamine might not be limited to reporting reward prediction errors but might also be involved in signalling a more general state prediction error.
\begin{casestudy}[Dopamine state change prediction \cite{dopamine_general}]
Rats are exposed to the following training steps:
\begin{descriptionlist}
\item[Conditioning]
The following stimuli are associated with some rewards
(it must be ensured that rewards are liked in the same way, i.e. same value but different identity):
\begin{itemize}
\item $V_B$ is associated with two units of banana.
\item $V_{UB}$ is associated with two units of chocolate.
\end{itemize}
\item[Compound training]
New stimuli are paired with the previously learned ones:
\begin{itemize}
\item $A_B$ is paired with $V_B$. Because of blocking, $A_B$ should not be learned as a rewarding stimulus.
\item $A_{UB}$ is paired with $V_{UB}$. The reward is changed to achieve identity unblocking.
\end{itemize}
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/dopamine_general1.png}
\end{figure}
It has been shown that the animal learns the new CS $A_{UB}$ and that dopamine responds to the change even though only the identity of the reward changed while its value remained the same.
\end{casestudy}
\subsection{Successor representation}
\begin{description}
\item[Sensory prediction error (SPE)] \marginnote{Sensory prediction error (SPE)}
Generalized prediction error over sensory features that estimates the successor representation of a state.
\item[Successor representation (SR)] \marginnote{Successor representation (SR)}
The SR of a state $s$ is a mapping $M(s, \cdot)$ where $M(s, s')$ indicates the expected discounted future occupancy of $s'$ when starting from $s$
(i.e. how often, discounted over time, the agent expects to visit $s'$ from $s$).
A SR learner predicts the value of a state by combining the reward function $R$ with the successor representation $M$ \cite{sr_learner}:
\[ V(s) = \sum_{s'} M(s, s') R(s') \]
\end{description}
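A minimal Python sketch of the idea, on a hypothetical five-state chain: $M$ is learned with a TD-like sensory prediction error over state features, and values are then read out as $V(s) = \sum_{s'} M(s, s') R(s')$, so that changing the rewards immediately changes the values without re-learning $M$.
\begin{verbatim}
import numpy as np

n_states, gamma, alpha = 5, 0.9, 0.1

def next_state(s):                 # hypothetical chain 0 -> 1 -> ... -> 4
    return min(s + 1, n_states - 1)

# M[s, s'] = expected discounted future occupancy of s' starting from s.
M = np.eye(n_states)
I = np.eye(n_states)

# Learn M with a TD-style sensory prediction error (SPE) over state features.
for _ in range(3000):
    s = np.random.randint(n_states)
    s_next = next_state(s)
    spe = I[s] + gamma * M[s_next] - M[s]    # error on future occupancies
    M[s] += alpha * spe

R = np.array([0.0, 0.0, 0.0, 1.0, 0.0])     # reward only in state 3
print(M @ R)        # V(s) = sum_s' M(s, s') R(s'): decays with distance

# Reward revaluation: a new R gives new values at once, M is untouched.
print(M @ np.array([0.0, 0.0, 0.0, 2.0, 0.0]))
\end{verbatim}
A transition revaluation, by contrast, would require re-learning $M$ itself, which is why SR learners handle reward changes more gracefully than state changes.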
\begin{remark}
SR learning might be a middle ground between model-free and model-based methods.
SR computes the future reward by combining the efficiency of model-free approaches with some of the flexibility of model-based RL (it caches long-run state transitions rather than a full model).
This makes it suited for tasks where the transition structure is more or less stable but rewards and goals change frequently.
\end{remark}
\begin{remark}
A criticism of this method is that it is expensive in terms of space, as it involves a prediction error for every pair of states (the matrix $M$).
This space requirement is at odds with the number of neurons available in the brain:
the dopaminergic neurons that are supposed to carry these update signals might not be numerous enough.
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/sr_mismatch.png}
\end{figure}
\end{remark}
\begin{casestudy}[SR in humans \cite{sr_learner}]
The experiment is divided into the following phases:
\begin{descriptionlist}
\item[Learning phase]
Participants are exposed to a sequence of three stimuli, where the third is associated with a reward.
After a number of trials (with different stimuli), they are asked to indicate which starting stimulus leads to the greater future reward.
\item[Re-learning phase]
Two types of revaluation are considered:
\begin{descriptionlist}
\item[Reward revaluation]
Final rewards are swapped with the stimuli unchanged (i.e. value change).
\item[Transition revaluation]
Third stimuli are swapped with the rewards unchanged (i.e. state change).
\end{descriptionlist}
Participants are again exposed to the sequence of stimuli starting from the middle one (i.e. the first stimulus is dropped).
Expected results are:
\begin{itemize}
\item Model-free approaches should fail on both changes.
\item Model-based approaches should succeed in both changes.
\item SR-based approaches should succeed in the reward change but not in the transition change.
\end{itemize}
\end{descriptionlist}
\begin{figure}[H]
\centering
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.9\linewidth]{./img/human_sr1.png}
\end{subfigure}
\begin{subfigure}{0.48\linewidth}
\centering
\includegraphics[width=0.95\linewidth]{./img/human_sr2.png}
\end{subfigure}
\end{figure}
Results show that:
\begin{itemize}
\item Revaluation scores (i.e. change in preference) on reward revaluation are slightly better than transition revaluation.
\item Reaction time for reward revaluation is faster (cached rewards can be easily updated) but
slower for the transition revaluation (cannot rely on cached states as they require more time to be updated).
\end{itemize}
This suggests that humans might use successor representation learning with some form of model-based approach.
Because of the differences in score and reaction time, learning cannot be fully model-based.
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/human_sr3.png}
\end{figure}
\end{casestudy}
\section{Distributional reinforcement learning}
\begin{description}
\item[Distributional reinforcement learning] \marginnote{Distributional reinforcement learning}
RL methods that aim to learn the full distribution of future reward instead of only its mean (the expected reward).
\end{description}
\begin{remark}
Certain deep RL algorithms improve with distributional RL.
\end{remark}
\begin{remark}
In traditional temporal-difference learning, predictors learn similar values.
In distributional temporal-difference learning, there are optimistic and pessimistic predictors with different scaling
that expect larger and smaller future rewards, respectively.
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/distr_rl1.png}
\caption{
\parbox[t]{0.9\linewidth}{
Traditional RL (left) and distributional RL (right). In distributional RL, red nodes are optimistic and blue nodes are pessimistic.
}
}
\end{figure}
\end{remark}
\begin{description}
\item[Reversal point] \marginnote{Reversal point}
$r_0$ is the reversal point of a dopaminergic neuron if:
\begin{itemize}
\item A reward $r < r_0$ elicits a negative prediction error.
\item A reward $r > r_0$ elicits a positive prediction error.
\end{itemize}
\begin{remark}
In traditional temporal-difference learning, the reversal point of individual neurons should be approximately identical.
\end{remark}
\end{description}
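A minimal sketch of how such a population could arise, assuming expectile-style distributional TD updates on a made-up reward distribution: each simulated unit has its own asymmetric scaling factor $\frac{\alpha^+}{\alpha^+ + \alpha^-}$, so its prediction settles at a different reversal point (pessimistic units low, optimistic units high), as in the case study below.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Made-up reward distribution (a set of magnitudes, equally likely).
magnitudes = np.array([0.1, 0.3, 1.2, 2.5, 5.0, 10.0, 20.0])

# One value predictor per simulated unit, each with its own asymmetry
# tau = alpha+ / (alpha+ + alpha-): small tau = pessimistic, large = optimistic.
taus = np.array([0.1, 0.3, 0.5, 0.7, 0.9])
V = np.zeros(len(taus))            # per-unit predictions (reversal points)
base = 0.02                        # overall learning-rate scale

for _ in range(20000):
    r = rng.choice(magnitudes)                   # sample a reward
    delta = r - V                                # per-unit prediction error
    lr = np.where(delta > 0, base * taus,        # alpha+ for positive RPEs
                  base * (1 - taus))             # alpha- for negative RPEs
    V += lr * delta

# Each unit settles where its positive and negative updates balance:
# pessimistic units reverse at low rewards, optimistic ones at high rewards,
# so the population jointly encodes the reward distribution, not just its mean.
print(np.round(V, 2))
\end{verbatim}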
\begin{casestudy}[Distributional RL in dopamine response \cite{distributional_rl_brain}]
Single dopaminergic neurons are recorded in rats.
Rats are trained on two different tasks:
\begin{descriptionlist}
\item[Variable-magnitude]
A random amount of reward is given to the rat. The reward is anticipated by an odor stimulus in half of the trials.
\item[Variable-probability]
Three odor stimuli are each associated with a probability of reward (90\%, 50\%, 10\%).
A control odor is associated with no reward.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/distr_rl2.png}
\end{figure}
Results on the variable-magnitude task show that:
\begin{itemize}
\item Neurons in simulated classical RL carry approximately the same RPE signal for any given reward magnitude and have similar reversal points ($\sim 0$).
\item Neurons in simulated distributional RL have different reversal points and there is more variety in responses
(e.g. RPEs of optimistic neurons are positive only for large magnitudes and vice versa for pessimistic neurons).
\item Measured neural data are more similar to the simulated distributional RL data.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/distr_rl3.png}
\caption{
\parbox[t]{0.7\linewidth}{
Simulated (a) and measured (b) neurons.
Points in the same row (same y-coordinate) represent the same neuron; neurons are sorted by reversal point.
The color of the dots represents the magnitude of the reward.
}
}
\end{figure}
Results on the variable-probability task show that:
\begin{itemize}
\item Neurons in simulated classical RL do not show differences when comparing the stimulus with 50\% reward against the 10\% and 90\% responses.
\item Neurons in simulated distributional RL vary a lot when responding to the 50\% reward probability stimulus.
\item Measured neural data are more similar to the simulated distributional RL data.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/distr_rl4.png}
\caption{
\parbox[t]{0.7\linewidth}{
Simulated (a) and measured (b) neurons.
T-statistics comparing each cell's response to the stimulus associated with the 50\% reward
against the mean stimulus response across cells.
}
}
\end{figure}
Responses of dopamine neurons show that some cells are in fact more optimistic and some more pessimistic
depending on how they respond to the 50\% stimulus.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/distr_rl5.png}
\caption{Activities of four dopaminergic neurons}
\end{figure}
An explanation for the different reversal points
is that the learning rates for positive ($\alpha^+$) and negative ($\alpha^-$) RPEs differ across neurons, or,
more specifically, that the asymmetric scaling factor $\frac{\alpha^+}{\alpha^+ + \alpha^-}$ differs.
This asymmetry creates a disequilibrium that is rebalanced by shifting the reversal point of each neuron.
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/distr_rl6.png}
\end{figure}
Indeed, measurements show that the reversal point and the asymmetric scaling factor are correlated,
indicating that reversal points shift to reach equilibrium.
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/distr_rl7.png}
\end{figure}
By decoding reward distributions from neural responses, it can be seen that:
\begin{itemize}
\item Classical RL is not able to predict the correct distribution.
\item Distributional RL and neuronal data are able to approximate the reward distribution.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/distr_rl8.png}
\end{figure}
\end{casestudy}