diff --git a/src/year1/cognition-and-neuroscience/module2/cn2.tex b/src/year1/cognition-and-neuroscience/module2/cn2.tex index 4046b0e..875361a 100644 --- a/src/year1/cognition-and-neuroscience/module2/cn2.tex +++ b/src/year1/cognition-and-neuroscience/module2/cn2.tex @@ -16,6 +16,7 @@ \input{./sections/_object_recognition.tex} \input{./sections/_nn_recognition.tex} + \input{./sections/_dopamine.tex} \printbibliography[heading=bibintoc] diff --git a/src/year1/cognition-and-neuroscience/module2/img/actor_critic.png b/src/year1/cognition-and-neuroscience/module2/img/actor_critic.png new file mode 100644 index 0000000..d5aef6b Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/actor_critic.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl1.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl1.png new file mode 100644 index 0000000..b79e40b Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl2.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl2.png new file mode 100644 index 0000000..0aa5567 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl3.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl3.png new file mode 100644 index 0000000..f191b4f Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl3.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl4.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl4.png new file mode 100644 index 0000000..a25bc29 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl4.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl5.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl5.png new file mode 100644 index 0000000..81faa62 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl5.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl6.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl6.png new file mode 100644 index 0000000..9b4d5b4 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl6.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl7.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl7.png new file mode 100644 index 0000000..63f9398 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl7.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/distr_rl8.png b/src/year1/cognition-and-neuroscience/module2/img/distr_rl8.png new file mode 100644 index 0000000..5570b5b Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/distr_rl8.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dm_processes.png b/src/year1/cognition-and-neuroscience/module2/img/dm_processes.png new file mode 100644 index 0000000..b81de83 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dm_processes.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dm_theories.png b/src/year1/cognition-and-neuroscience/module2/img/dm_theories.png new file mode 100644 index 0000000..1e62aa6 Binary files /dev/null and 
b/src/year1/cognition-and-neuroscience/module2/img/dm_theories.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_flow.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_flow.png new file mode 100644 index 0000000..6d949f7 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_flow.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_general1.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_general1.png new file mode 100644 index 0000000..df2eeaa Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_general1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden1.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden1.png new file mode 100644 index 0000000..45a4601 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden2.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden2.png new file mode 100644 index 0000000..e8cc2e1 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden3.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden3.png new file mode 100644 index 0000000..cd3d9ef Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden3.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden4.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden4.png new file mode 100644 index 0000000..7d129cd Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden4.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden5.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden5.png new file mode 100644 index 0000000..16ef4be Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_hidden5.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect1.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect1.png new file mode 100644 index 0000000..a2e622b Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect2.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect2.png new file mode 100644 index 0000000..cd6d4fd Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_indirect2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_pathways.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_pathways.png new file mode 100644 index 0000000..e467382 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_pathways.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/dopamine_transfer.png b/src/year1/cognition-and-neuroscience/module2/img/dopamine_transfer.png new file mode 100644 index 0000000..cbbed56 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/dopamine_transfer.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/human_sr1.png 
b/src/year1/cognition-and-neuroscience/module2/img/human_sr1.png new file mode 100644 index 0000000..d7c6746 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/human_sr1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/human_sr2.png b/src/year1/cognition-and-neuroscience/module2/img/human_sr2.png new file mode 100644 index 0000000..c8661ec Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/human_sr2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/human_sr3.png b/src/year1/cognition-and-neuroscience/module2/img/human_sr3.png new file mode 100644 index 0000000..d0ea467 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/human_sr3.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/perceptual_dm.png b/src/year1/cognition-and-neuroscience/module2/img/perceptual_dm.png new file mode 100644 index 0000000..03e1ecf Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/perceptual_dm.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/pfc_learning.png b/src/year1/cognition-and-neuroscience/module2/img/pfc_learning.png new file mode 100644 index 0000000..fd964f2 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/pfc_learning.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/pfc_vs_dopamine.png b/src/year1/cognition-and-neuroscience/module2/img/pfc_vs_dopamine.png new file mode 100644 index 0000000..a111712 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/pfc_vs_dopamine.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/rat_maze1.png b/src/year1/cognition-and-neuroscience/module2/img/rat_maze1.png new file mode 100644 index 0000000..cb3f522 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/rat_maze1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/rat_maze2.png b/src/year1/cognition-and-neuroscience/module2/img/rat_maze2.png new file mode 100644 index 0000000..bed7c81 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/rat_maze2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/saccade1.png b/src/year1/cognition-and-neuroscience/module2/img/saccade1.png new file mode 100644 index 0000000..b00ad6c Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/saccade1.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/saccade2.png b/src/year1/cognition-and-neuroscience/module2/img/saccade2.png new file mode 100644 index 0000000..1f1185e Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/saccade2.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/saccade3.png b/src/year1/cognition-and-neuroscience/module2/img/saccade3.png new file mode 100644 index 0000000..2ef3889 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/saccade3.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/sr_mismatch.png b/src/year1/cognition-and-neuroscience/module2/img/sr_mismatch.png new file mode 100644 index 0000000..93d6c4a Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/sr_mismatch.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/valuation_circuitry.png b/src/year1/cognition-and-neuroscience/module2/img/valuation_circuitry.png new file mode 100644 
index 0000000..c523274 Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/valuation_circuitry.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/img/value_dm.png b/src/year1/cognition-and-neuroscience/module2/img/value_dm.png new file mode 100644 index 0000000..a0cb16b Binary files /dev/null and b/src/year1/cognition-and-neuroscience/module2/img/value_dm.png differ diff --git a/src/year1/cognition-and-neuroscience/module2/references.bib b/src/year1/cognition-and-neuroscience/module2/references.bib index 84ce3fe..bdec98c 100644 --- a/src/year1/cognition-and-neuroscience/module2/references.bib +++ b/src/year1/cognition-and-neuroscience/module2/references.bib @@ -81,4 +81,157 @@ pages = {e2014196118}, year = {2021}, doi = {10.1073/pnas.2014196118}, -} \ No newline at end of file +} + + +@article{monkey_dopamine, + title = {Dopamine neurons of the monkey midbrain: contingencies of responses to active touch during self-initiated arm movements}, + author = {Romo, R. and Schultz, W.}, + journal = {Journal of Neurophysiology}, + volume = {63}, + number = {3}, + pages = {592-606}, + year = {1990}, + doi = {10.1152/jn.1990.63.3.592}, +} + + +@article{dopamine_transfer, + title = {A Neural Substrate of Prediction and Reward}, + author = {Wolfram Schultz and Peter Dayan and P. Read Montague }, + journal = {Science}, + volume = {275}, + number = {5306}, + pages = {1593-1599}, + year = {1997}, + doi = {10.1126/science.275.5306.1593}, +} + + +@article{dopamine_transfer2, + title = {Behavioral Theories and the Neurophysiology of Reward}, + author = {Schultz, Wolfram}, + journal= {Annual Review of Psychology}, + year = {2006}, + volume = {57}, + number = {Volume 57, 2006}, + pages = {87-115}, + publisher = {Annual Reviews}, + doi = {10.1146/annurev.psych.56.091103.070229}, +} + + +@article{dopamine_bidirectional, + title = {Adaptive Coding of Reward Value by Dopamine Neurons}, + author = {Philippe N. Tobler and Christopher D. Fiorillo and Wolfram Schultz }, + journal = {Science}, + volume = {307}, + number = {5715}, + pages = {1642-1645}, + year = {2005}, + doi = {10.1126/science.1105370}, +} + + +@article{dopamine_probability, + title = {Discrete Coding of Reward Probability and Uncertainty by Dopamine Neurons}, + author = {Christopher D. Fiorillo and Philippe N. Tobler and Wolfram Schultz }, + journal = {Science}, + volume = {299}, + number = {5614}, + pages = {1898-1902}, + year = {2003}, + doi = {10.1126/science.1077349}, +} + + +@article{dopamine_temporal, + title = {Dopamine neurons report an error in the temporal prediction of reward during learning}, + author = {Hollerman, Jeffrey R. 
and Schultz, Wolfram}, + journal = {Nature Neuroscience}, + year = {1998}, + day = {01}, + volume = {1}, + number = {4}, + pages = {304-309}, + doi = {10.1038/1124}, +} + + +@article{saccade, + title = {Dopamine neurons can represent context-dependent prediction error}, + author = {Nakahara, Hiroyuki and Itoh, Hideaki and Kawagoe, Reiko and Takikawa, Yoriko and Hikosaka, Okihide}, + journal = {Neuron}, + publisher = {Elsevier BV}, + volume = {41}, + numbe = {2}, + pages = {269--280}, + year = {2004}, + doi = {10.1016/S0896-6273(03)00869-9}, +} + + +@article {indirect_learning, + title = {Midbrain dopamine neurons compute inferred and cached value prediction errors in a common framework}, + author = {Sadacca, Brian F and Jones, Joshua L and Schoenbaum, Geoffrey}, + volume = {5}, + year = {2016}, + pages = {e13665}, + journal = {eLife}, + issn = {2050-084X}, + publisher = {eLife Sciences Publications, Ltd}, + doi = {10.7554/eLife.13665}, +} + + +@article{dopamine_hidden, + title = {Dopamine reward prediction errors reflect hidden-state inference across time}, + author = {Starkweather, Clara Kwon and Babayan, Benedicte M. and Uchida, Naoshige and Gershman, Samuel J.}, + journal = {Nature Neuroscience}, + year = {2017}, + volume = {20}, + number = {4}, + pages = {581-589}, + issn = {1546-1726}, + doi = {10.1038/nn.4520}, +} + + +@article{dopamine_general, + title = {Optogenetic blockade of dopamine transients prevents learning induced by changes in reward features}, + author = {Chang, Chun Yun and Gardner, Matthew and Di Tillio, Maria Gonzalez and Schoenbaum, Geoffrey}, + journal = {Curr. Biol.}, + publisher = {Elsevier BV}, + volume = {27}, + number = {22}, + pages = {3480--3486.e3}, + year = {2017}, + doi = {10.1016/j.cub.2017.09.049}, +} + + +@article{sr_learner, + title = {The successor representation in human reinforcement learning}, + author = {Momennejad, I. and Russek, E. M. and Cheong, J. H. and Botvinick, M. M. and Daw, N. D. and Gershman, S. J.}, + journal = {Nature Human Behaviour}, + year = {2017}, + volume = {1}, + number = {9}, + pages = {680-692}, + issn = {2397-3374}, + doi = {10.1038/s41562-017-0180-8}, +} + + +@article{distributional_rl_brain, + title = {A distributional code for value in dopamine-based reinforcement learning}, + author = {Dabney, Will and Kurth-Nelson, Zeb and Uchida, Naoshige and Starkweather, Clara Kwon and Hassabis, Demis and Munos, R{\'e}mi and Botvinick, Matthew}, + journal = {Nature}, + publisher = {Springer Science and Business Media LLC}, + volume = {577}, + number = {7792}, + pages = {671--675}, + year = {2020}, + doi = {10.1038/s41586-019-1924-6}, +} + diff --git a/src/year1/cognition-and-neuroscience/module2/sections/_dopamine.tex b/src/year1/cognition-and-neuroscience/module2/sections/_dopamine.tex new file mode 100644 index 0000000..645189c --- /dev/null +++ b/src/year1/cognition-and-neuroscience/module2/sections/_dopamine.tex @@ -0,0 +1,744 @@ +\chapter{Dopamine in reinforcement learning} + + +\section{Decision making} + +\begin{description} + \item[Decision-making] \marginnote{Decision-making} + Voluntary process that leads to the selection of an action based on sensory information. + + Decisions are inherently non-deterministic as: + \begin{itemize} + \item Agents make inconsistent choices. + \item Agents make choices unaware of the full consequences (uncertainty). + \item Internal and external signals are noisy. 
+ \end{itemize} + + \begin{remark} + From an evolutionary point of view, stochasticity in decisions increases the chances of survival. + \end{remark} + + \begin{remark} + Decision-making studied within the neuroscience field is actually studying cognition. + It involves studying the neural processes underlying a variety of mental functions. + \end{remark} + + + \item[Perceptual decision-making] \marginnote{Perceptual decision-making} + An agent selects between action $A$ and $B$ based on weak or noisy external signals (e.g. ``do you see $A$ or $B$?''). + + In this case, uncertainty comes from the external stimulus. + + \begin{figure}[H] + \centering + \includegraphics[width=0.55\linewidth]{./img/perceptual_dm.png} + \end{figure} + + \item[Value-based decision-making] \marginnote{Value-based decision-making} + An agent selects between action $A$ and $B$ based on its subjective preferences (e.g. ``do you prefer $A$ or $B$?''). + + In this case, uncertainty comes from the value associated with the action. + + \begin{figure}[H] + \centering + \includegraphics[width=0.55\linewidth]{./img/value_dm.png} + \end{figure} + + + \item[Decision-making processes] + Decision-making involves the following processes: + \begin{descriptionlist} + \item[Representation] \marginnote{Representation} + States, actions, and internal and external factors are identified. + + \item[Valuation] \marginnote{Valuation} + A value is assigned to the possible alternatives. + + \item[Choice] \marginnote{Choice} + Values are compared and a proper action is selected. + + \item[Outcome evaluation] \marginnote{Outcome evaluation} + After performing the action, the desirability of the outcome is measured (reward prediction error). + + \item[Learning] \marginnote{Learning} + Feedback signals are used to update the processes and improve the quality of future decisions. + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.55\linewidth]{./img/dm_processes.png} + \end{figure} + + + \item[Valuation circuitry] \marginnote{Valuation circuitry} + Involves neurons sensitive to reward value. + They are spread in the brain, both in the cortical and subcortical regions. + + \begin{figure}[H] + \centering + \includegraphics[width=0.38\linewidth]{./img/valuation_circuitry.png} + \end{figure} + + + \item[Decision-making theories] \marginnote{Decision-making theories} + \begin{descriptionlist} + \item[Economic learning] \marginnote{Economic learning} + Decision-making involving the selection of an action with the maximum utility. + + \item[Reinforcement learning] \marginnote{Reinforcement learning} + Decision-making involving the probabilistic selection of an action. + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.55\linewidth]{./img/dm_theories.png} + \end{figure} +\end{description} + + + +\section{Reinforcement learning} + +\begin{description} + \item[Reinforcement learning (RL)] \marginnote{Reinforcement learning (RL)} + Learn a mapping between states and actions aiming to maximize the expected cumulative future reward. + + \begin{description} + \item[Markov conditional independence] At any time step, all future states and rewards only depend on the current state and action. 
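+ Formally, the Markov property can be written as + \[ \prob{s_{t+1}, r_t \mid s_0, a_0, \dots, s_t, a_t} = \prob{s_{t+1}, r_t \mid s_t, a_t} \] + i.e. the current state-action pair is a sufficient summary of the whole history.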
\end{description} + + + \item[Bellman equation] \marginnote{Bellman equation} + Given an action $a_t$ performed in state $s_t$ following a policy $\pi$, + the expected cumulative future reward is given by the following equation: + \[ Q_\pi(s_t, a_t) = r_t + \gamma \sum_{s_{t+1}} \prob{s_{t+1} \mid s_t, a_t} Q_\pi(s_{t+1}, \pi(s_{t+1})) \] + where $\gamma$ is a discount factor. +\end{description} + + +\subsection{RL classes} + +\begin{description} + \item[Model-based] \marginnote{Model-based} + Aims to learn the right-hand side of the Bellman equation. + This requires knowing the state transition distribution $\mathcal{P}$, which is costly. + + \item[Model-free] \marginnote{Model-free} + Aims to directly learn the left-hand side of the Bellman equation by estimating $Q_\pi$ from experience. + Agents average over the states, actions, and rewards they experience to update a table of long-run reward predictions that + approximates the right-hand side of the Bellman equation. + + \begin{description} + \item[Temporal difference learning] \marginnote{Temporal difference learning} + The reward prediction error at time $t$ compares the value predicted at time $t$ with the reward received plus the discounted value predicted at the next time step $t+1$: + \[ \delta_t = r_t + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \] + \end{description} +\end{description} + +\begin{example}[Rat in maze] + A rat has to navigate a maze with two crossroads and two different outcomes. + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/rat_maze1.png} + \end{figure} + + Two strategies can be developed: + \begin{descriptionlist} + \item[Model-based] + By learning the model of the environment, the rat can decide its path by using a search tree. + The path can be changed depending on its motivational state (e.g. hungry or thirsty), showing goal-directed behavior. + + \item[Model-free] + The value of each state-action pair is stored and action selection consists of choosing the highest cached value at the current state. + Values do not consider the identity of the outcome and are therefore decoupled from the motivational state of the animal. + + Nevertheless, if the motivational state is stored as part of the environmental state, the animal would be able to account for it. + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/rat_maze2.png} + \end{figure} +\end{example} + + + +\section{Dopaminergic system} + +There is strong evidence that the dopaminergic system is highly involved in reinforcement learning +for predicting both natural rewards and addictive drugs. + +\begin{description} + \item[Dopamine pathways] \marginnote{Dopamine pathways} + Dopamine projections include: + \begin{descriptionlist} + \item[Nigrostriatal system] Mostly associated with motor functions (action policy). + \item[Meso-cortico-limbic system] Mostly associated with motivation (value function). + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/dopamine_pathways.png} + \end{figure} + + \item[Actor/critic architecture] \marginnote{Actor/critic architecture} + Model with two components: + \begin{descriptionlist} + \item[Critic] + Takes as input a state and is responsible for learning and storing state values. + It also receives the reward from the environment and + computes, through a temporal difference module, + the prediction error $\delta_t$, which is used to update its own state values and to train the actor.
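+ In a standard temporal-difference formulation, the critic maintains state values $V(s)$ and computes + \[ \delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad V(s_t) \leftarrow V(s_t) + \alpha \delta_t \] + where $\alpha$ is a learning rate; the same $\delta_t$ is then broadcast to the actor to strengthen or weaken the action just taken.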
+ + \item[Actor] + Takes as input a state and maps it to an action policy $\pi(a, s)$ that is used to determine the action to perform. + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/actor_critic.png} + \caption{ + \parbox[t]{0.6\linewidth}{ + Actor/critic architecture (A) and a possible mapping of the architecture onto neural substrates (B) + } + } + \end{figure} +\end{description} + + +\begin{description} + \item[Dopamine properties] + \phantom{} + + \begin{description} + \item[Phasic response] \marginnote{Dopamine phasic response} + Depending on the stimulus, dopamine neurons can show excitatory or inhibitory responses. + This can be interpreted as a reward prediction error. + + \begin{remark} + About 75\% of dopamine neurons are activated when there is a rewarding stimulus + and about 14\% of dopamine neurons are activated in response to an aversive stimulus. + \end{remark} + + \begin{casestudy}[Dopamine as reward prediction error \cite{monkey_dopamine}] + A monkey is required to touch the content of a box for which it does not have vision. + + It has been seen that dopamine neurons respond differently based on the content of the box. + This is consistent with the fact that dopamine is used as a prediction error signal. + + \begin{figure}[H] + \centering + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{../module1/img/dopamine_monkey1.png} + \end{subfigure} + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{../module1/img/dopamine_monkey2.png} + \end{subfigure} + \end{figure} + \end{casestudy} + + + \item[Bidirectional prediction] \marginnote{Dopamine bidirectional prediction} + Dopamine captures both an improvement (positive prediction error) and a worsening (negative prediction error) of the reward. + + \begin{casestudy}[Dopamine bidirectional prediction error \cite{dopamine_bidirectional}] + It has been observed that the dopaminergic response differs depending on the amount of reward. + + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{../module1/img/dopamine_expected2.png} + \caption{ + \parbox[t]{0.65\linewidth}{ + Dopamine response of a monkey trained on a medium amount of reward + } + } + \end{figure} + \end{casestudy} + + + \item[Transfer] \marginnote{Dopamine transfer} + Dopaminergic activity shifts from responding to the reward to responding to the conditioned stimulus that predicts it. + + \begin{casestudy}[Dopamine transfer \cite{dopamine_transfer, dopamine_transfer2}] + It has been seen that the dopaminergic response transfers from the moment of receiving the reward to the stimuli associated with it (CS). + This is in line with the temporal difference model. + + \begin{figure}[H] + \centering + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.8\linewidth]{../module1/img/dopamine_transfer_cs.png} + \end{subfigure} + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/dopamine_transfer.png} + \end{subfigure} + \end{figure} + \end{casestudy} + + + \item[Probability encoding] \marginnote{Probability encoding} + The dopaminergic response varies with the reward probability. + + \begin{casestudy}[Dopamine probability encoding \cite{dopamine_probability}] + It has been shown that dopamine responds differently based on the probability of receiving a reward. 
+ + For high uncertainty (50\% probability of reward), a tonic response that starts from the CS and grows up to the reward time has been observed. + + \begin{figure}[H] + \centering + \includegraphics[width=0.8\linewidth]{../module1/img/dopamine_probability.png} + \end{figure} + \end{casestudy} + + + \item[Temporal prediction] \marginnote{Dopamine temporal prediction} + Apart from encoding the unexpectedness of an event occurring, dopamine also accounts for the time the reward is expected to be delivered, + and responds accordingly if the delivery happens earlier or later. + + \begin{casestudy}[Dopamine temporal prediction \cite{dopamine_temporal}] + It has been shown that dopamine responds differently based on the time the reward is delivered. + + If the delivery happens earlier, the dopaminergic response increases. + On the other hand, if the delivery happens later, dopamine neurons first pass a depressed phase. + + \begin{figure}[H] + \centering + \includegraphics[width=0.45\linewidth]{../module1/img/dopamine_timing.png} + \end{figure} + \end{casestudy} + \end{description} +\end{description} + +\begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/dopamine_flow.png} + \caption{Dopamine flow in the dopaminergic system} +\end{figure} + +\begin{remark}[PFC neurons] + Differently from dopamine neurons, PFC neurons during learning fire in response to the reward and progressively start to fire also at the CS. + In addition, the strength of the response does not depend on the expectedness of the reward + (i.e. it only acts as a prediction for the reward and not for the error). + + \begin{figure}[H] + \centering + \begin{subfigure}{0.34\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/pfc_learning.png} + \caption{PFC response during learning} + \end{subfigure} + \begin{subfigure}{0.62\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/pfc_vs_dopamine.png} + \caption{PFC vs dopamine (DA)} + \end{subfigure} + \end{figure} +\end{remark} + + + +\section{Reward prediction error (RPE) theory of dopamine} + + +\subsection{Dopamine is not fully model-free} + +There is strong evidence that midbrain dopamine is used to report RPE as in model-free RL. +RPE theory of dopamine states that: +\begin{itemize} + \item Dopamine reflects the value of the observable state, which can be seen as a quantitative summary of future reward. + \item State values are directly learned through experience. + \item Dopamine only signals surprising events that bring a reward. + \item Dopamine does not make inferences on the model of the environment. +\end{itemize} + +However, individuals also learn about the model of the world (e.g. cognitive map) and this knowledge can affect the neuronal prediction error, +but, there is evidence that this acquisition involves cortical signals (e.g. PFC) rather than dopamine. +Despite that, dopamine still seems to integrate predictive information from models. + +\begin{casestudy}[Monkey saccade \cite{saccade}] + Monkeys are required to solve a memory-guided saccade task where, after fixation, + a light is flashed in one of the four directions indicating the saccade to be made after the fixation point went off. + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/saccade1.png} + \caption{Structure of the task} + \end{figure} + + Experiments are done in sub-blocks of four trials, each corresponding to a direction. 
+ Moreover, the probability of reward increases with the number of non-rewarded trials (post-reward trial number, \texttt{PNR}). + If $\texttt{PNR} = 1$, the probability of reward is the lowest ($0.0625)$, while if $\texttt{PNR} = 7$, the reward probability is $1.0$. + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/saccade2.png} + \caption{Structure of a block} + \end{figure} + + + It is expected that the animal's reward prediction increases after each non-rewarded trial. + In other words, as the reward is more likely after each non-rewarded trial, positive prediction error should decrease and + negative prediction error should be stronger (i.e. decrease). + + Results show that dopamine neurons are less active if the reward is delivered + and more depressed if the reward is omitted after each non-rewarded trial. + \begin{figure}[H] + \centering + \includegraphics[width=0.25\linewidth]{./img/saccade3.png} + \end{figure} + + The results are in contrast with an exclusive model-free view of dopamine as, if this were the case, + learning would only involve past non-rewarded trials causing positive prediction error to decrease and negative prediction error to be weaker (i.e. increase). + Therefore, dopamine might process prediction error in both model-free and model-based approaches. +\end{casestudy} + +\begin{casestudy}[Dopamine indirect learning \cite{indirect_learning}] + Rats are exposed to: + \begin{descriptionlist} + \item[Pre-conditioning] Sound stimuli $A \mapsto B$ and $C \mapsto D$ are paired together. + \item[Conditioning] The stimulus $B$ is paired with a reward. + \end{descriptionlist} + Results show that rats respond to both $B$ and $A$ in a correlated manner. + + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/dopamine_indirect1.png} + \end{figure} + + \begin{figure}[H] + \centering + \includegraphics[width=0.45\linewidth]{./img/dopamine_indirect2.png} + \end{figure} + + The results show that dopamine might also reflect values learned indirectly. + This is in contrast with the temporal difference learning of model-free RL in which only directly experienced states are learned. +\end{casestudy} + +\begin{casestudy}[Dopamine RPE reflects inference over hidden states \cite{dopamine_hidden}] + Rats are trained to associate odors with rewards. + Two types of tasks are considered: + \begin{descriptionlist} + \item[Task 1] + Odors are always associated with a reward. + Odor $A$ is delivered with a delay sampled from a Gaussian distribution, odors $B$ and $C$ are deterministic and odor $D$ is for control. + + \item[Task 2] + As above, but odors are associated with a reward $90\%$ of the time. + \end{descriptionlist} + The period in which no reward is expected is called ITI, while the period in which the animal expects a reward is the ISI (i.e. after the odor onset). + + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/dopamine_hidden1.png} + \caption{Tasks representation} + \end{figure} + + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/dopamine_hidden2.png} + \caption{ + \parbox[t]{0.7\linewidth}{ + Licking behavior in the two tasks. + It can be seen that, for each odor, licking depends on the time of arrival of the reward. + On task 2, licking is more uncertain. 
+ } + } + \end{figure} + + Considering odor $A$, results show that: + \begin{itemize} + \item For task 1, the dopaminergic response gets smaller over time within the ISI + \item For task 2, the dopaminergic response grows, hinting at the fact that some sort of inference about the state is being made. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/dopamine_hidden3.png} + \end{figure} + + An explanation is that the animal has to solve the problem of determining whether it is in an ITI or ISI period: + \begin{itemize} + \item For task 1, the rat can easily determine in which period it is. + \item For task 2, as the reward is not always delivered, being in ISI and ITI is not always clear. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/dopamine_hidden4.png} + \end{figure} + + The dopaminergic activity for the two tasks can be explained as follows: + \begin{itemize} + \item For task 1, after the stimulus, the probability of receiving a reward increases over time. + Therefore, the RPE is increasingly suppressed. + \item For task 2, as the reward fails to arrive, the belief state progressively shifts towards the ITI state. + Therefore, if the reward is delivered later, the RPE is high as it was unexpected. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.55\linewidth]{./img/dopamine_hidden5.png} + \caption{ + \parbox[t]{0.6\linewidth}{ + Experiment represented as sub-states (top) and RPE to a reward over time (bottom) + } + } + \end{figure} + + These results hint at the fact that: + \begin{itemize} + \item The brain is not limited to passively observing the environment but also makes latent inferences. + \item The results can be modeled using a temporal difference model that incorporates hidden-state inference. + \end{itemize} +\end{casestudy} + + +\subsection{Dopamine as generalized prediction error} + +Dopamine might not be limited to only predicting reward error but is also involved in a more general state prediction error. + +\begin{casestudy}[Dopamine state change prediction \cite{dopamine_general}] + Rats are exposed to the following training steps: + \begin{descriptionlist} + \item[Conditioning] + The following stimuli are associated with some rewards + (it must be ensured that rewards are liked in the same way, i.e. same value but different identity): + \begin{itemize} + \item $V_B$ is associated with two units of banana. + \item $V_{UB}$ is associated with two units of chocolate. + \end{itemize} + + \item[Compound training] + New stimuli are paired with the previously learned ones: + \begin{itemize} + \item $A_B$ is paired with $V_B$. Because of blocking, $A_B$ should not be learned as a rewarding stimulus. + \item $A_{UB}$ is paired with $V_{UB}$. The reward is changed to achieve identity unblocking. + \end{itemize} + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/dopamine_general1.png} + \end{figure} + + It has been shown that the animal learns the new CS $A_{UB}$ and dopamine responds to the change even if only the identity of the reward changed and the value remained the same. +\end{casestudy} + + +\subsection{Successor representation} + +\begin{description} + \item[Sensory prediction error (SPE)] \marginnote{Sensory prediction error (SPE)} + Generalized prediction error over sensory features that estimates the successor representation of a state. 
+ \item[Successor representation (SR)] \marginnote{Successor representation (SR)} + The SR of a state $s$ is a mapping $M(s, \cdot)$ where $M(s, s')$ indicates the expected future occupancy of $s'$ when starting from $s$ + (i.e. how likely it is to end up in $s'$ from $s$). + + An SR learner predicts the value of a state by combining the successor representation $M$ with the reward function $R$ \cite{sr_learner}: + \[ V(s_t) = \sum_{s'} M(s_t, s') R(s') \] +\end{description} + +\begin{remark} + SR learning might be a middle ground between model-free and model-based methods. + SR computes the future reward with the efficiency of model-free approaches while keeping some of the flexibility of model-based RL (state transitions, rather than values alone, are cached). + + This is well suited for tasks where state transitions are more or less stable but rewards and goals change frequently. +\end{remark} + +\begin{remark} + A criticism of this method is that it is expensive in space, as it involves a prediction error for every entry of the matrix $M$. + This space requirement is hard to reconcile with the number of neurons available in the brain, as the + dopaminergic neurons that would be responsible for carrying these update signals might not be enough. + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/sr_mismatch.png} + \end{figure} +\end{remark} + +\begin{casestudy}[SR in humans \cite{sr_learner}] + The experiment is divided into the following phases: + \begin{descriptionlist} + \item[Learning phase] + Participants are exposed to a sequence of three stimuli, where the third is associated with a reward. + After a number of trials (with different stimuli), they are asked to indicate which starting stimulus leads to the greater future reward. + + \item[Re-learning phase] + Two types of revaluation are considered: + \begin{descriptionlist} + \item[Reward revaluation] + Final rewards are swapped with the stimuli unchanged (i.e. value change). + \item[Transition revaluation] + The third stimuli are swapped with the rewards unchanged (i.e. transition change). + \end{descriptionlist} + Participants are again exposed to the sequence of stimuli starting from the middle one (i.e. the first stimulus is dropped). + + Expected results are: + \begin{itemize} + \item Model-free approaches should fail on both changes. + \item Model-based approaches should succeed on both changes. + \item SR-based approaches should succeed on the reward change but not on the transition change. + \end{itemize} + \end{descriptionlist} + + \begin{figure}[H] + \centering + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.9\linewidth]{./img/human_sr1.png} + \end{subfigure} + \begin{subfigure}{0.48\linewidth} + \centering + \includegraphics[width=0.95\linewidth]{./img/human_sr2.png} + \end{subfigure} + \end{figure} + + Results show that: + \begin{itemize} + \item Revaluation scores (i.e. change in preference) are slightly higher for reward revaluation than for transition revaluation. + \item Reaction times are faster for reward revaluation (cached rewards can easily be updated) but + slower for transition revaluation (cached state transitions cannot be relied upon and take more time to update). + \end{itemize} + This suggests that humans might use successor representation learning with some form of model-based approach.
+ Because of the differences in score and reaction time, learning cannot be fully model-based. + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/human_sr3.png} + \end{figure} +\end{casestudy} + + + +\section{Distributional reinforcement learning} + +\begin{description} + \item[Distributional reinforcement learning] \marginnote{Distributional reinforcement learning} + RL methods that aim to learn the full distribution of the expected reward instead of the mean expected reward. +\end{description} + +\begin{remark} + Certain deep RL algorithms improve with distributional RL. +\end{remark} + +\begin{remark} + In traditional temporal-difference learning, predictors learn similar values. + + In distributional temporal-difference learning, there are optimistic and pessimistic predictors with different scaling + that expect larger and smaller future rewards, respectively. + + \begin{figure}[H] + \centering + \includegraphics[width=0.9\linewidth]{./img/distr_rl1.png} + \caption{ + \parbox[t]{0.9\linewidth}{ + Traditional RL (left) and distributional RL (right). In distributional RL, red nodes are optimistic and blue nodes are pessimistic. + } + } + \end{figure} +\end{remark} + +\begin{description} + \item[Reversal point] \marginnote{Reversal point} + $r_0$ is the reversal point of a dopaminergic neuron if: + \begin{itemize} + \item A reward $r < r_0$ expresses a negative error. + \item A reward $r > r_0$ expresses a positive error. + \end{itemize} + + \begin{remark} + In traditional temporal-difference learning, the reversal point of individual neurons should be approximately identical. + \end{remark} +\end{description} + +\begin{casestudy}[Distributional RL in dopamine response \cite{distributional_rl_brain}] + Single dopaminergic neurons are recorded in rats. + Rats are trained on two different tasks: + \begin{descriptionlist} + \item[Variable-magnitude] + A random amount of reward is given to the rat. The reward is anticipated by an odor stimulus in half of the trials. + + \item[Variable-probability] + Three odor stimuli are each associated with a probability of reward (90\%, 50\%, 10\%). + A control odor is associated with no reward. + \end{descriptionlist} + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/distr_rl2.png} + \end{figure} + + Results on the variable-magnitude task show that: + \begin{itemize} + \item Neurons in simulated classical RL carry approximately the same RPE signal for each magnitude and have similar reversal points ($\sim 0$). + \item Neurons in simulated distributional RL have different reversal points and there is more variety in responses + (e.g. RPEs of optimistic neurons are positive only for large magnitudes and vice versa for pessimistic neurons). + \item Measured neural data are more similar to the simulated distributional RL data. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/distr_rl3.png} + \caption{ + \parbox[t]{0.7\linewidth}{ + Simulated (a) and measured (b) neurons. + Points at the same y-axis represent the same neuron and are sorted by reversal point. + The color of the dots represents the magnitude of the reward. + } + } + \end{figure} + + Results on the variable-probability task show that: + \begin{itemize} + \item Neurons in simulated classical RL do not show differences when comparing the stimulus with 50\% reward against the 10\% and 90\% responses. 
+ \item Neurons in simulated distributional RL vary a lot when responding to the 50\% reward probability stimulus. + \item Measured neural data are more similar to the simulated distributional RL data. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/distr_rl4.png} + \caption{ + \parbox[t]{0.7\linewidth}{ + Simulated (a) and measured (b) neurons. + T-statistics comparing each cell's response to the stimulus associated with the 50\% reward + against the mean stimulus response across cells. + } + } + \end{figure} + + Responses of dopamine neurons show that some cells are in fact more optimistic and some more pessimistic + depending on how they respond to the 50\% stimulus. + \begin{figure}[H] + \centering + \includegraphics[width=0.8\linewidth]{./img/distr_rl5.png} + \caption{Activities of four dopaminergic neurons} + \end{figure} + + An explanation for this behavior of having different reversal points + is that the weights for positive ($\alpha^+$) and negative ($\alpha^-$) RPEs are different, or, + more specifically, the asymmetric scaling factor $\frac{\alpha^+}{\alpha^+ + \alpha^-}$ is different. + This creates a disequilibrium that can be rebalanced by changing the reversal points of the neurons. + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/distr_rl6.png} + \end{figure} + + Indeed, measurements show that the reversal point and the asymmetric scaling factor are correlated + indicating the need to shift the reversal point to reach equilibrium. + \begin{figure}[H] + \centering + \includegraphics[width=0.65\linewidth]{./img/distr_rl7.png} + \end{figure} + + By decoding reward distributions from neural responses, it can be seen that: + \begin{itemize} + \item Classical RL is not able to predict the correct distribution. + \item Distributional RL and neuronal data are able to approximate the reward distribution. + \end{itemize} + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/distr_rl8.png} + \end{figure} +\end{casestudy} \ No newline at end of file