Add NLP LSTM, GRU, CNN, Transformer

This commit is contained in:
2024-11-09 13:30:41 +01:00
parent 17b02d219c
commit 605ff0b812
13 changed files with 338 additions and 9 deletions


View File

@ -13,5 +13,6 @@
\include{./sections/_classification.tex}
\include{./sections/_semantics.tex}
\include{./sections/_rnn.tex}
\include{./sections/_attention.tex}
\end{document}

View File

@ -0,0 +1,172 @@
\chapter{Attention-based architectures}
\begin{description}
\item[Sequence-to-sequence (seq2seq) model] \marginnote{Sequence-to-sequence (seq2seq) model}
Encoder-decoder architecture where:
\begin{descriptionlist}
\item[Encoder]
Processes the whole input sequence and outputs a representation of it.
\item[Decoder]
Processes the output of the encoder and produces the output sequence.
\end{descriptionlist}
\begin{remark}
Training is usually done with teacher forcing, averaging the loss of each output distribution (a minimal sketch follows this definition).
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_seq2seq.pdf}
\caption{Example of seq2seq network with RNNs}
\end{figure}
\end{description}
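As a rough sketch of the training procedure mentioned in the remark above, the following NumPy snippet (not from the original notes; a toy tanh RNN with random parameters and made-up token ids) runs one teacher-forcing pass and averages the per-token cross-entropy losses:
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d, h = 12, 8, 16          # vocabulary size, embedding size, hidden size

# Toy parameters (randomly initialized; a real model would learn them).
E = rng.normal(size=(V, d))                   # shared embedding matrix
W_enc, U_enc = rng.normal(size=(h, h)), rng.normal(size=(h, d))
W_dec, U_dec = rng.normal(size=(h, h)), rng.normal(size=(h, d))
W_out = rng.normal(size=(V, h))               # hidden state -> vocabulary logits

def rnn_step(W, U, h_prev, x):
    return np.tanh(W @ h_prev + U @ x)

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

src = [3, 5, 1, 7]           # input token ids
tgt = [2, 6, 4]              # ground-truth output token ids

# Encoder: process the whole input sequence.
h_t = np.zeros(h)
for tok in src:
    h_t = rnn_step(W_enc, U_enc, h_t, E[tok])

# Decoder with teacher forcing: feed the ground-truth previous token
# and accumulate the cross-entropy of each output distribution.
s_t, prev, losses = h_t, 0, []   # token id 0 acts as a <bos> symbol here
for gold in tgt:
    s_t = rnn_step(W_dec, U_dec, s_t, E[prev])
    p = softmax(W_out @ s_t)
    losses.append(-np.log(p[gold]))
    prev = gold                  # teacher forcing: use the gold token
print("average loss:", np.mean(losses))
\end{verbatim}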
\section{Encoder-decoder RNN with attention}
\begin{description}
\item[Seq2seq RNN with attention] \marginnote{Seq2seq RNN with attention}
Architecture where the decoder can interact with each token processed by the encoder to determine dot-product attention scores (i.e., based on vector similarity).
The overall flow is the following:
\begin{enumerate}
\item The encoder computes its hidden states $\vec{h}^{(1)}, \dots, \vec{h}^{(N)} \in \mathbb{R}^{h}$.
\item The decoder processes its input tokens one at a time. Its hidden state is initialized with $\vec{h}^{(N)}$. Considering the token at position $t$, the output is determined as follows:
\begin{enumerate}
\item The decoder outputs the hidden state $\vec{s}^{(t)}$.
\item Attention scores $\vec{e}^{(t)}$ are computed as the dot product between $\vec{s}^{(t)}$ and each $\vec{h}^{(i)}$:
\[
    \vec{e}^{(t)} =
    \begin{bmatrix}
        \vec{s}^{(t)} \cdot \vec{h}^{(1)} &
        \cdots &
        \vec{s}^{(t)} \cdot \vec{h}^{(N)}
    \end{bmatrix} \in \mathbb{R}^{N}
\]
$\vec{e}^{(t)}$ is used to determine the attention distribution $\vec{\alpha}^{(t)}$, which in turn yields the attention output $\vec{a}^{(t)}$ as a weighted sum of the encoder hidden states:
\[
\begin{gathered}
\mathbb{R}^{N} \ni \vec{\alpha}^{(t)} = \texttt{softmax}(\vec{e}^{(t)}) \\
\mathbb{R}^{h} \ni \vec{a}^{(t)} = \sum_{i=1}^{N} \vec{\alpha}^{(t)}_i \vec{h}^{(i)}
\end{gathered}
\]
\item The overall representation of the $t$-th token is the concatenation of the attention output and the decoder hidden state:
\[ \begin{bmatrix} \vec{a}^{(t)} \mid \vec{s}^{(t)} \end{bmatrix} \in \mathbb{R}^{2h} \]
\end{enumerate}
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_rnn_attention.pdf}
\end{figure}
\end{description}
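As a rough illustration of a single decoder step described above, the following NumPy sketch (not part of the original notes; random toy values stand in for trained encoder and decoder states) computes the attention scores, the attention distribution, and the concatenated output:
\begin{verbatim}
import numpy as np

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

rng = np.random.default_rng(0)
N, h = 5, 4
H = rng.normal(size=(N, h))      # encoder hidden states h^(1..N)
s_t = rng.normal(size=h)         # decoder hidden state s^(t)

e_t = H @ s_t                    # dot-product attention scores, shape (N,)
alpha_t = softmax(e_t)           # attention distribution
a_t = alpha_t @ H                # attention output: weighted sum of encoder states
out = np.concatenate([a_t, s_t]) # [a^(t) | s^(t)], shape (2h,)
print(out.shape)                 # (8,)
\end{verbatim}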
\section{Convolutional neural networks for NLP}
\begin{description}
\item[1D convolution (NLP)] \marginnote{1D convolution}
Applies a kernel to a window of consecutive tokens in the sequence. A kernel of size $k$ over $d$-dimensional token embeddings is represented by a $k \times d$ weight matrix.
\begin{remark}
As in computer vision, multiple kernels can be stacked to increase the depth of the representation. Padding, stride, and dilation can be used to change the receptive field. Pooling is typically applied before passing the result to fully-connected layers.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_1d_convolution.pdf}
\caption{Example of three 1D convolutions with padding $1$}
\end{figure}
\end{description}
\begin{remark}
Convolutions are easy to parallelize.
\end{remark}
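A minimal NumPy sketch of the 1D convolution defined above (toy random embeddings and a single random kernel; zero-padding of 1 keeps the output length equal to the input length):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d, k = 6, 5, 3                      # sequence length, embedding size, kernel size
X = rng.normal(size=(N, d))            # token embeddings
W = rng.normal(size=(k, d))            # one 1D-convolution kernel

# Zero-padding of 1 on both sides keeps the output length equal to N.
Xp = np.vstack([np.zeros((1, d)), X, np.zeros((1, d))])

# Each output position is the sum of the element-wise product between the
# kernel and a window of k consecutive (padded) token embeddings.
feat = np.array([np.sum(W * Xp[i:i + k]) for i in range(N)])
print(feat.shape)                      # (6,)
\end{verbatim}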
\begin{example}[CNN for sentence classification]
A possible multichannel CNN architecture for sentence classification works as follows:
\begin{itemize}
\item The input sequence is encoded by stacking both static and learned embeddings (of the same dimensionality).
\item Convolutions are applied to each channel and can use kernels of different widths (see the sketch after this example).
\item Pooling is used to flatten the activations and avoid shape mismatch before passing through fully-connected layers.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/cnn_sentence_classification.png}
\end{figure}
\end{example}
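A possible NumPy sketch of this multichannel architecture (toy random embeddings; the kernel widths 3, 4, and 5 and the binary classifier are arbitrary choices, not taken from the original example):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d = 7, 5                              # tokens, embedding size
static_ch = rng.normal(size=(N, d))      # frozen embeddings (channel 1)
learned_ch = rng.normal(size=(N, d))     # trainable embeddings (channel 2)
channels = [static_ch, learned_ch]

def conv1d(X, W):
    k = W.shape[0]
    return np.array([np.sum(W * X[i:i + k]) for i in range(X.shape[0] - k + 1)])

features = []
for k in (3, 4, 5):                      # kernels of different widths
    W = rng.normal(size=(k, d))
    for X in channels:                   # applied to every channel
        # max-over-time pooling gives one scalar per (kernel, channel),
        # independently of the sequence length
        features.append(conv1d(X, W).max())
features = np.array(features)            # fixed-size sentence representation

W_cls = rng.normal(size=(2, features.size))   # toy binary classifier weights
logits = W_cls @ features
print(logits)
\end{verbatim}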
\begin{example}[Character-aware neural LM]
\phantom{}\\
\begin{minipage}{0.6\linewidth}
RNN-LM that works at the character level:
\begin{itemize}
\item Given a token, its characters are embedded and the embeddings are concatenated.
\item Convolutions are used to refine the representation.
\item Pooling is used before passing the representation to the RNN.
\end{itemize}
\end{minipage}
\begin{minipage}{0.35\linewidth}
\centering
\includegraphics[width=0.95\linewidth]{./img/char_aware_rnn_cnn.png}
\end{minipage}
\end{example}
\section{Transformers}
\begin{description}
\item[Self-attention] \marginnote{Self-attention}
Component that computes the representation of a token by taking into account the other tokens in the input sequence.
Given an input embedding $\vec{x}_i \in \mathbb{R}^{1 \times d_\text{model}}$, self-attention relies on the following values:
\begin{descriptionlist}
\item[Queries] \marginnote{Queries}
Used as the reference point for attention:
\[ \mathbb{R}^{1 \times d_k} \ni \vec{q}_i = \vec{x}_i \matr{W}_Q \]
\item[Keys] \marginnote{Keys}
Used as values to compare against the query:
\[ \mathbb{R}^{1 \times d_k} \ni \vec{k}_i = \vec{x}_i \matr{W}_K \]
\item[Values] \marginnote{Values}
Used to determine the output:
\[ \mathbb{R}^{1 \times d_v} \ni \vec{v}_i = \vec{x}_i \matr{W}_V \]
\end{descriptionlist}
where $\matr{W}_Q \in \mathbb{R}^{d_\text{model} \times d_k}$, $\matr{W}_K \in \mathbb{R}^{d_\text{model} \times d_k}$, and $\matr{W}_V \in \mathbb{R}^{d_\text{model} \times d_v}$ are parameters.
Then, the attention weights $\vec{\alpha}_{i,j}$ between two embeddings $\vec{x}_i$ and $\vec{x}_j$ are computed as:
\[
\begin{gathered}
\texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \cdot \vec{k}_j}{\sqrt{d_k}} \\
\vec{\alpha}_{i,j} = \texttt{softmax}_j\left( \texttt{scores}(\vec{x}_i, \vec{x}_j) \right)
\end{gathered}
\]
The output $\vec{a}_i \in \mathbb{R}^{1 \times d_v}$ is a weighted sum of the values of each token:
\[ \vec{a}_i = \sum_{t} \vec{\alpha}_{i,t} \vec{v}_t \]
To maintain the input dimension, a final projection $\matr{W}_O \in \mathbb{R}^{d_v \times d_\text{model}}$ is applied.
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_self_attention.pdf}
\end{figure}
\item[Causal attention] \marginnote{Causal attention}
Self-attention mechanism where only the current and past tokens can be used to determine the representation of the token at a specific position. It is computed by modifying the standard self-attention as:
\[
\begin{gathered}
    \forall j \leq i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \cdot \vec{k}_j}{\sqrt{d_k}} \qquad
    \forall j > i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = -\infty \\
    \vec{\alpha}_{i,j} = \texttt{softmax}_j\left( \texttt{scores}(\vec{x}_i, \vec{x}_j) \right) \\
    \vec{a}_i = \sum_{t: t \leq i} \vec{\alpha}_{i,t} \vec{v}_t
\end{gathered}
\]
so that the softmax assigns zero weight to future positions. A sketch of both the plain and the causal variant follows this list.
\end{description}
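The following NumPy sketch (not part of the original notes; random toy parameters) implements the scaled dot-product self-attention above, with an optional causal mask that sets the scores of future positions to $-\infty$ before the softmax:
\begin{verbatim}
import numpy as np

def softmax_rows(Z):
    Z = Z - Z.max(axis=-1, keepdims=True)
    E = np.exp(Z)
    return E / E.sum(axis=-1, keepdims=True)

def self_attention(X, W_Q, W_K, W_V, W_O, causal=False):
    Q, K, V = X @ W_Q, X @ W_K, X @ W_V          # (N, d_k), (N, d_k), (N, d_v)
    scores = Q @ K.T / np.sqrt(W_K.shape[1])     # (N, N) scaled dot products
    if causal:
        N = X.shape[0]
        mask = np.triu(np.ones((N, N), dtype=bool), k=1)
        scores = np.where(mask, -np.inf, scores) # future positions get -inf
    A = softmax_rows(scores) @ V                 # weighted sum of the values
    return A @ W_O                               # project back to d_model

rng = np.random.default_rng(0)
N, d_model, d_k, d_v = 4, 8, 6, 6
X = rng.normal(size=(N, d_model))
W_Q, W_K = rng.normal(size=(d_model, d_k)), rng.normal(size=(d_model, d_k))
W_V = rng.normal(size=(d_model, d_v))
W_O = rng.normal(size=(d_v, d_model))
print(self_attention(X, W_Q, W_K, W_V, W_O, causal=True).shape)   # (4, 8)
\end{verbatim}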

View File

@ -33,7 +33,7 @@
\end{figure}
\begin{remark}
RNN-LMs generate the output autoregressively.
RNN-LMs allow generating the output autoregressively.
\end{remark}
\begin{description}
@ -52,20 +52,176 @@
\end{description}
\end{description}
\begin{remark}
RNNs grow in width (with the length of the sequence) rather than in depth, and their recurrence cannot be parallelized over time steps.
\end{remark}
\subsection{Long short-term memory}
\begin{remark}[Vanishing gradient]
In RNNs, the gradient coming from distant tokens vanishes as it is propagated back through time. Therefore, long-term dependencies are hard to model.
\end{remark}
\begin{description}
\item[Long short-term memory (LSTM)] \marginnote{Long short-term memory (LSTM)}
Architecture that at each step $t$ outputs:
\begin{descriptionlist}
\item[Hidden state] $\vec{h}^{(t)} \in \mathbb{R}^{n}$ as in RNNs.
\item[Cell state] $\vec{c}^{(t)} \in \mathbb{R}^{n}$ responsible for long-term memory.
\end{descriptionlist}
\begin{description}
\item[Gates]
Non-linear operators to manipulate the cell state (in the following part, $\matr{W}_*$, $\matr{U}_*$, and $\vec{b}_*$ are parameters).
\begin{description}
\item[Forget gate] \marginnote{Forget gate}
Controls what part of the cell state to keep:
\[ \vec{f}^{(t)} = \sigma\left( \matr{W}_f \vec{h}^{(t-1)} + \matr{U}_f \vec{x}^{(t)} + \vec{b}_f \right) \]
\item[Input gate] \marginnote{Input gate}
Controls what part of the input to write into the cell state:
\[ \vec{i}^{(t)} = \sigma\left( \matr{W}_i \vec{h}^{(t-1)} + \matr{U}_i \vec{x}^{(t)} + \vec{b}_i \right) \]
\item[Output gate] \marginnote{Output gate}
Controls what part of the cell state to include in the output hidden state:
\[ \vec{o}^{(t)} = \sigma\left( \matr{W}_o \vec{h}^{(t-1)} + \matr{U}_o \vec{x}^{(t)} + \vec{b}_o \right) \]
\end{description}
Updates are done as follows:
\begin{descriptionlist}
\item[New cell state content] $\tilde{\vec{c}}^{(t)} = \texttt{tanh}\left( \matr{W}_c \vec{h}^{(t-1)} + \matr{U}_c \vec{x}^{(t)} + \vec{b}_c \right)$.
\item[Cell state] $\vec{c}^{(t)} = \vec{f}^{(t)} \cdot \vec{c}^{(t-1)} + \vec{i}^{(t)} \cdot \tilde{\vec{c}}^{(t)}$.
\item[Hidden state] $\vec{h}^{(t)} = \vec{o}^{(t)} \cdot \texttt{tanh}(\vec{c}^{(t)})$.
\end{descriptionlist}
\end{description}
\begin{remark}
LSTM makes it easier to preserve information over time, but it might still be affected by the vanishing gradient problem.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_lstm.pdf}
\end{figure}
\end{description}
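The gate and update equations above can be condensed into a short NumPy sketch (random toy parameters; a real model would learn $\matr{W}_*$, $\matr{U}_*$, and $\vec{b}_*$):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x, h_prev, c_prev, P):
    """One LSTM step following the gate equations above."""
    f = sigmoid(P["W_f"] @ h_prev + P["U_f"] @ x + P["b_f"])   # forget gate
    i = sigmoid(P["W_i"] @ h_prev + P["U_i"] @ x + P["b_i"])   # input gate
    o = sigmoid(P["W_o"] @ h_prev + P["U_o"] @ x + P["b_o"])   # output gate
    c_tilde = np.tanh(P["W_c"] @ h_prev + P["U_c"] @ x + P["b_c"])
    c = f * c_prev + i * c_tilde          # cell state: keep old + write new
    h = o * np.tanh(c)                    # hidden state: expose part of the cell
    return h, c

rng = np.random.default_rng(0)
n, d = 4, 3
P = {}
for g in "fioc":
    P[f"W_{g}"] = rng.normal(size=(n, n))
    P[f"U_{g}"] = rng.normal(size=(n, d))
    P[f"b_{g}"] = np.zeros(n)

h, c = np.zeros(n), np.zeros(n)
for x in rng.normal(size=(5, d)):         # process a toy sequence of 5 inputs
    h, c = lstm_step(x, h, c, P)
print(h, c)
\end{verbatim}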
\subsection{Gated recurrent units}
\begin{description}
\item[Gated recurrent units (GRU)] \marginnote{Gated recurrent units (GRU)}
Architecture simpler than LSTM with fewer gates and without the cell state.
\begin{description}
\item[Gates] \phantom{}
\begin{description}
\item[Update gate] \marginnote{Update gate}
Controls what part of the current hidden state to keep:
\[ \vec{u}^{(t)} = \sigma\left( \matr{W}_u \vec{h}^{(t-1)} + \matr{U}_u \vec{x}^{(t)} + \vec{b}_u \right) \]
\item[Reset gate] \marginnote{Reset gate}
Controls what part of the previous hidden state to use:
\[ \vec{r}^{(t)} = \sigma\left( \matr{W}_r \vec{h}^{(t-1)} + \matr{U}_r \vec{x}^{(t)} + \vec{b}_r \right) \]
\end{description}
\end{description}
Updates are done as follows:
\begin{descriptionlist}
\item[New hidden state content] $\tilde{\vec{h}}^{(t)} = \texttt{tanh}\left( \matr{W}_h(\vec{r}^{(t)} \cdot \vec{h}^{(t-1)}) + \matr{U}_h \vec{x}^{(t)} + \vec{b}_h \right)$.
\item[Hidden state] $\vec{h}^{(t)} = (1-\vec{u}^{(t)}) \cdot \vec{h}^{(t-1)} + \vec{u}^{(t)} \cdot \tilde{\vec{h}}^{(t)}$.
\end{descriptionlist}
\end{description}
\begin{remark}
Being faster to train than LSTMs, GRUs are usually a good starting point.
\end{remark}
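Analogously to the LSTM, a minimal NumPy sketch of a single GRU step (random toy parameters, not from the original notes):
\begin{verbatim}
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def gru_step(x, h_prev, P):
    """One GRU step following the update and reset gate equations above."""
    u = sigmoid(P["W_u"] @ h_prev + P["U_u"] @ x + P["b_u"])   # update gate
    r = sigmoid(P["W_r"] @ h_prev + P["U_r"] @ x + P["b_r"])   # reset gate
    h_tilde = np.tanh(P["W_h"] @ (r * h_prev) + P["U_h"] @ x + P["b_h"])
    return (1 - u) * h_prev + u * h_tilde  # interpolate old and new content

rng = np.random.default_rng(0)
n, d = 4, 3
P = {}
for g in "urh":
    P[f"W_{g}"] = rng.normal(size=(n, n))
    P[f"U_{g}"] = rng.normal(size=(n, d))
    P[f"b_{g}"] = np.zeros(n)

h = np.zeros(n)
for x in rng.normal(size=(5, d)):          # process a toy sequence of 5 inputs
    h = gru_step(x, h, P)
print(h)
\end{verbatim}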
\subsection{Bidirectional RNN}
\begin{description}
\item[Bidirectional RNN] \marginnote{Bidirectional RNN}
Two independent RNNs (of any architecture) that process the input left-to-right (forward) and right-to-left (backward), respectively. Usually, the output hidden state of a token $t$ is obtained as the concatenation of the hidden states $\vec{h}^{(t)}_{\text{forward}}$ and $\vec{h}^{(t)}_{\text{backward}}$ of the two networks.
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_birnn.pdf}
\end{figure}
\begin{remark}
This architecture is not suitable for language modelling (i.e., autoregressive models), as it assumes that the whole input sequence is available at once.
\end{remark}
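A compact NumPy sketch of a bidirectional RNN built from two simple tanh RNNs (toy random parameters; the per-token outputs are the concatenated forward and backward hidden states):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d, h = 5, 3, 4
X = rng.normal(size=(N, d))                       # input token embeddings

def run_rnn(X, W, U):
    """Simple tanh RNN; returns the hidden state of every position."""
    h_t, states = np.zeros(W.shape[0]), []
    for x in X:
        h_t = np.tanh(W @ h_t + U @ x)
        states.append(h_t)
    return np.stack(states)

W_f, U_f = rng.normal(size=(h, h)), rng.normal(size=(h, d))
W_b, U_b = rng.normal(size=(h, h)), rng.normal(size=(h, d))

H_fwd = run_rnn(X, W_f, U_f)                      # left-to-right pass
H_bwd = run_rnn(X[::-1], W_b, U_b)[::-1]          # right-to-left pass, re-aligned
H = np.concatenate([H_fwd, H_bwd], axis=1)        # per-token concatenation, (N, 2h)
print(H.shape)
\end{verbatim}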
\begin{example}[Sequence classification]
For sequence classification, the last hidden state of the forward and backward contexts can be used as the representation of the whole sequence to pass to the classifier.
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/_birnn_seq_classification.pdf}
\end{figure}
\end{example}
\subsection{Stacked multi-layer RNN}
\begin{description}
\item[Stacked RNN] \marginnote{Stacked RNN}
Stack of RNNs (of any architecture) where:
\begin{itemize}
\item The RNN at the first layer $l=1$ processes the input tokens.
\item The input of any following layer $l \geq 2$ is the hidden state $\vec{h}^{(t)}_{l-1}$ of the previous layer.
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_layered_rnn.pdf}
\end{figure}
\end{description}
\begin{remark}
Skip connections between different layers can help to stabilize the gradient.
\end{remark}
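A minimal NumPy sketch of a stacked RNN (simple tanh layers with random toy parameters; the skip connection is added only when the shapes match, as an illustration of the remark above):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
N, d, h, L = 5, 3, 4, 3
X = rng.normal(size=(N, d))                # layer 1 reads the token embeddings

def run_rnn(X, W, U):
    h_t, states = np.zeros(W.shape[0]), []
    for x in X:
        h_t = np.tanh(W @ h_t + U @ x)
        states.append(h_t)
    return np.stack(states)

inputs = X
for l in range(L):
    in_dim = inputs.shape[1]
    W, U = rng.normal(size=(h, h)), rng.normal(size=(h, in_dim))
    outputs = run_rnn(inputs, W, U)
    # layers l >= 2 read the hidden states of the previous layer;
    # a residual (skip) connection is added when the shapes match
    if inputs.shape == outputs.shape:
        outputs = outputs + inputs
    inputs = outputs
print(inputs.shape)                        # (N, h) hidden states of the top layer
\end{verbatim}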
\section{Applications}
\subsection{Autoregressive generation}
\begin{description}
\item[Autoregressive generation] \marginnote{Autoregressive generation}
Repeatedly sample a token and feed it back to the network.
\item[Decoding strategy] \marginnote{Decoding strategy}
Method to select the next output token from the output distribution (a minimal sketch is given at the end of this section). Possible approaches are:
\begin{descriptionlist}
\item[Greedy] Select the token with the highest probability.
\item[Sampling] Randomly sample the token following the probabilities of the output distribution.
\end{descriptionlist}
\begin{description}
\item[Conditioned generation] \marginnote{Conditioned generation}
Provide an initial hidden state to the RNN (e.g., speech-to-text).
\end{description}
\item[Sequence labelling] \marginnote{Sequence labelling}
Assign a class to each input token (e.g., POS-tagging, named-entity recognition, structure prediction, \dots).
\item[Sequence classification] \marginnote{Sequence classification}
Assign a class to the whole input sequence (e.g., sentiment analysis, document-topic classification, \dots).
\item[Sentence encoding] \marginnote{Sentence encoding}
Produce a vector representation for the whole input sequence.
Possible approaches are:
\begin{itemize}
\item Use the final hidden state of the RNN.
\item Aggregate all the hidden states (e.g., mean).
\end{itemize}
\begin{example}[Question answering]
An RNN encoder embeds the question, which is used alongside the context (i.e., the source from which the answer has to be extracted) to solve a labelling task (i.e., classifying each token of the context as relevant or non-relevant).
\end{example}
\end{description}
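As referenced above, a minimal sketch of the two decoding strategies (NumPy; the logits are random toy values standing in for the output of a language model):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

logits = rng.normal(size=10)     # output logits over a toy 10-token vocabulary
p = softmax(logits)              # output distribution

greedy_token = int(np.argmax(p))                # greedy: highest probability
sampled_token = int(rng.choice(len(p), p=p))    # sampling: draw from the distribution
print(greedy_token, sampled_token)
\end{verbatim}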