diff --git a/src/year1/deep-learning/dl.tex b/src/year1/deep-learning/dl.tex
index 748fac8..323ac1b 100644
--- a/src/year1/deep-learning/dl.tex
+++ b/src/year1/deep-learning/dl.tex
@@ -11,5 +11,6 @@
     \input{./sections/_training.tex}
     \input{./sections/_computer_vision.tex}
     \input{./sections/_generative_models.tex}
+    \input{./sections/_rnn.tex}
 \end{document}
\ No newline at end of file
diff --git a/src/year1/deep-learning/img/cline.png b/src/year1/deep-learning/img/cline.png
new file mode 100644
index 0000000..ce79157
Binary files /dev/null and b/src/year1/deep-learning/img/cline.png differ
diff --git a/src/year1/deep-learning/img/cline_update.png b/src/year1/deep-learning/img/cline_update.png
new file mode 100644
index 0000000..1bb3798
Binary files /dev/null and b/src/year1/deep-learning/img/cline_update.png differ
diff --git a/src/year1/deep-learning/img/forget_gate.png b/src/year1/deep-learning/img/forget_gate.png
new file mode 100644
index 0000000..faa8ed9
Binary files /dev/null and b/src/year1/deep-learning/img/forget_gate.png differ
diff --git a/src/year1/deep-learning/img/lstm.png b/src/year1/deep-learning/img/lstm.png
new file mode 100644
index 0000000..e962a3c
Binary files /dev/null and b/src/year1/deep-learning/img/lstm.png differ
diff --git a/src/year1/deep-learning/img/output_gate.png b/src/year1/deep-learning/img/output_gate.png
new file mode 100644
index 0000000..8513516
Binary files /dev/null and b/src/year1/deep-learning/img/output_gate.png differ
diff --git a/src/year1/deep-learning/img/rnn.png b/src/year1/deep-learning/img/rnn.png
new file mode 100644
index 0000000..22afb33
Binary files /dev/null and b/src/year1/deep-learning/img/rnn.png differ
diff --git a/src/year1/deep-learning/img/update_gate.png b/src/year1/deep-learning/img/update_gate.png
new file mode 100644
index 0000000..4934b45
Binary files /dev/null and b/src/year1/deep-learning/img/update_gate.png differ
diff --git a/src/year1/deep-learning/sections/_rnn.tex b/src/year1/deep-learning/sections/_rnn.tex
new file mode 100644
index 0000000..fcd4d4c
--- /dev/null
+++ b/src/year1/deep-learning/sections/_rnn.tex
@@ -0,0 +1,124 @@
+\chapter{Sequence modeling}
+
+
+\section{Memoryless approach}
+\marginnote{Memoryless approach}
+
+Neural network that takes as input a fixed window of elements of the sequence.
+
+\begin{remark}
+    As the input window is fixed, memoryless models are not ideal for capturing long-term dependencies.
+\end{remark}
+
+
+
+\section{Recurrent neural network}
+
+\begin{description}
+    \item[Recurrent neural network (RNN)] \marginnote{Recurrent neural network}
+    Neural network in which the hidden states have backward connections, in such a way that each state depends on the past history.
+
+    Inputs are processed one time step at a time: the computation cannot be parallelized over time, since each step needs the hidden state of the previous time step (a minimal sketch of this recurrence is given at the end of this section).
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.65\linewidth]{./img/rnn.png}
+        \caption{Example of RNN (left) and its unfolded version (right)}
+    \end{figure}
+
+    \item[Backpropagation] \marginnote{RNN backpropagation}
+    Weight updates in RNNs are computed by averaging the gradients over the time steps (i.e. a single forward pass processes an entire sequence). This procedure is known as backpropagation through time.
+
+    Seen on the unfolded network, this way of updating the weights guarantees that the parameters of the network remain the same (i.e. shared) at each time step.
+
+    \begin{remark}
+        For long sequences, it is very easy for the gradient to explode or vanish.
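+
+To make the recurrence concrete, the following is a minimal NumPy sketch of a vanilla RNN forward pass. The parameter names and sizes are illustrative assumptions, not part of the course material.
+
+\begin{verbatim}
+# Minimal vanilla RNN forward pass (illustrative sketch).
+# Assumed shapes: x_t has size n_in, h_t has size n_hid.
+import numpy as np
+
+rng = np.random.default_rng(0)
+n_in, n_hid, T = 4, 8, 10
+
+# Parameters are shared across all time steps.
+W_xh = rng.normal(scale=0.1, size=(n_hid, n_in))   # input -> hidden
+W_hh = rng.normal(scale=0.1, size=(n_hid, n_hid))  # hidden -> hidden
+b_h = np.zeros(n_hid)
+
+xs = rng.normal(size=(T, n_in))  # input sequence
+h = np.zeros(n_hid)              # initial hidden state (zero init)
+
+for x_t in xs:
+    # Each step needs the previous hidden state, which is why
+    # the loop cannot be parallelized over time.
+    h = np.tanh(W_xh @ x_t + W_hh @ h + b_h)
+\end{verbatim}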
+    \end{remark}
+
+    \item[Hidden state initialization] \marginnote{Hidden state initialization}
+    There are different ways to set the initial hidden state at $t=0$:
+    \begin{itemize}
+        \item Initialize it to zero.
+        \item Sample it from a known distribution.
+        \item Learn it during training.
+    \end{itemize}
+\end{description}
+
+
+
+\subsection{Long short-term memory}
+\marginnote{Long short-term memory}
+
+Traditional RNNs usually only carry to the next time step the output of the current step.
+
+Long short-term memory (LSTM) is an RNN architecture that, alongside the output of the previous step, carries an explicit memory and allows the model itself to learn what to ``remember''.
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.6\linewidth]{./img/lstm.png}
+    \caption{Overview of an LSTM cell}
+\end{figure}
+
+
+Let:
+\begin{itemize}
+    \item $W_g$ and $b_g$ be the weights and biases of the component $g$,
+    \item $h_t$ be the output at time step $t$,
+    \item $x_t$ be the input at time step $t$.
+\end{itemize}
+An LSTM has the following components (a code sketch of a full step follows the list):
+\begin{descriptionlist}
+    \item[Forget gate]
+    Computes a mask $f_t$ that decides which parts of the memory to preserve.\\
+    \begin{minipage}{0.6\linewidth}
+        \[ f_t = \sigma( W_f \cdot [h_{t-1}, x_t] + b_f ) \]
+    \end{minipage}
+    \begin{minipage}{0.35\linewidth}
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/forget_gate.png}
+    \end{minipage}
+
+    \item[Update gate]
+    It is composed of two parts:
+    \begin{descriptionlist}
+        \item[Input gate] Computes a mask $i_t$ that decides which parts of the input to preserve.
+        \item[$\tanh$ layer] Creates a vector $\tilde{C}_t$ of new candidate values to potentially be stored in the memory.
+    \end{descriptionlist}
+    \begin{minipage}{0.6\linewidth}
+        \[
+            \begin{split}
+                i_t &= \sigma( W_i \cdot [h_{t-1}, x_t] + b_i ) \\
+                \tilde{C}_t &= \tanh( W_C \cdot [h_{t-1}, x_t] + b_C ) \\
+            \end{split}
+        \]
+    \end{minipage}
+    \begin{minipage}{0.35\linewidth}
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/update_gate.png}
+    \end{minipage}
+
+    \item[C-line]
+    Represents the memory of the network.
+    At each step, the memory $C_{t-1}$ of the previous step is updated and the new state $C_t$ is passed to the next step ($*$ denotes element-wise multiplication).\\
+    \begin{minipage}{0.6\linewidth}
+        \[ C_t = f_t * C_{t-1} + i_t * \tilde{C}_t \]
+    \end{minipage}
+    \begin{minipage}{0.35\linewidth}
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/cline_update.png}
+    \end{minipage}
+
+    \item[Output gate]
+    The output $h_t$ at step $t$ is determined by the current input and the updated memory.\\
+    \begin{minipage}{0.6\linewidth}
+        \[
+            \begin{split}
+                o_t &= \sigma( W_o \cdot [h_{t-1}, x_t] + b_o ) \\
+                h_t &= o_t * \tanh(C_t) \\
+            \end{split}
+        \]
+    \end{minipage}
+    \begin{minipage}{0.35\linewidth}
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/output_gate.png}
+    \end{minipage}
+\end{descriptionlist}
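+
+The gate equations above map directly to code. The following is a minimal NumPy sketch of a single LSTM step; the function name, the packing of the parameters, and the sizes are illustrative assumptions.
+
+\begin{verbatim}
+# One LSTM step implementing the gate equations above
+# (illustrative sketch; names and sizes are assumptions).
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+def lstm_step(x_t, h_prev, C_prev, W_f, b_f, W_i, b_i, W_C, b_C, W_o, b_o):
+    z = np.concatenate([h_prev, x_t])    # [h_{t-1}, x_t]
+    f_t = sigmoid(W_f @ z + b_f)         # forget gate
+    i_t = sigmoid(W_i @ z + b_i)         # input gate
+    C_tilde = np.tanh(W_C @ z + b_C)     # candidate values
+    C_t = f_t * C_prev + i_t * C_tilde   # C-line update
+    o_t = sigmoid(W_o @ z + b_o)         # output gate
+    h_t = o_t * np.tanh(C_t)             # new output
+    return h_t, C_t
+
+# Usage: process a sequence by threading (h, C) through the steps.
+rng = np.random.default_rng(0)
+n_in, n_hid = 4, 8
+Ws = [rng.normal(scale=0.1, size=(n_hid, n_hid + n_in)) for _ in range(4)]
+bs = [np.zeros(n_hid) for _ in range(4)]
+h, C = np.zeros(n_hid), np.zeros(n_hid)
+for x_t in rng.normal(size=(10, n_in)):
+    h, C = lstm_step(x_t, h, C, Ws[0], bs[0], Ws[1], bs[1],
+                     Ws[2], bs[2], Ws[3], bs[3])
+\end{verbatim}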