diff --git a/src/year2/natural-language-processing/img/_attention_block.pdf b/src/year2/natural-language-processing/img/_attention_block.pdf new file mode 100644 index 0000000..49d86f5 Binary files /dev/null and b/src/year2/natural-language-processing/img/_attention_block.pdf differ diff --git a/src/year2/natural-language-processing/img/_beam_search.pdf b/src/year2/natural-language-processing/img/_beam_search.pdf new file mode 100644 index 0000000..09be145 Binary files /dev/null and b/src/year2/natural-language-processing/img/_beam_search.pdf differ diff --git a/src/year2/natural-language-processing/img/_conditional_generation.pdf b/src/year2/natural-language-processing/img/_conditional_generation.pdf new file mode 100644 index 0000000..234f4d4 Binary files /dev/null and b/src/year2/natural-language-processing/img/_conditional_generation.pdf differ diff --git a/src/year2/natural-language-processing/img/_greedy_decoding_local_minimum.pdf b/src/year2/natural-language-processing/img/_greedy_decoding_local_minimum.pdf new file mode 100644 index 0000000..0978f2b Binary files /dev/null and b/src/year2/natural-language-processing/img/_greedy_decoding_local_minimum.pdf differ diff --git a/src/year2/natural-language-processing/img/_lm_head.pdf b/src/year2/natural-language-processing/img/_lm_head.pdf new file mode 100644 index 0000000..6bc6787 Binary files /dev/null and b/src/year2/natural-language-processing/img/_lm_head.pdf differ diff --git a/src/year2/natural-language-processing/img/_multi_head_attention.pdf b/src/year2/natural-language-processing/img/_multi_head_attention.pdf new file mode 100644 index 0000000..4dbab50 Binary files /dev/null and b/src/year2/natural-language-processing/img/_multi_head_attention.pdf differ diff --git a/src/year2/natural-language-processing/img/_positional_encoding.pdf b/src/year2/natural-language-processing/img/_positional_encoding.pdf new file mode 100644 index 0000000..4ffcb4c Binary files /dev/null and b/src/year2/natural-language-processing/img/_positional_encoding.pdf differ diff --git a/src/year2/natural-language-processing/img/_residual_stream.pdf b/src/year2/natural-language-processing/img/_residual_stream.pdf new file mode 100644 index 0000000..c97c668 Binary files /dev/null and b/src/year2/natural-language-processing/img/_residual_stream.pdf differ diff --git a/src/year2/natural-language-processing/img/_transformer_embedding.pdf b/src/year2/natural-language-processing/img/_transformer_embedding.pdf new file mode 100644 index 0000000..6dc1caf Binary files /dev/null and b/src/year2/natural-language-processing/img/_transformer_embedding.pdf differ diff --git a/src/year2/natural-language-processing/nlp.tex b/src/year2/natural-language-processing/nlp.tex index 2d1bbe4..4c4b073 100644 --- a/src/year2/natural-language-processing/nlp.tex +++ b/src/year2/natural-language-processing/nlp.tex @@ -14,5 +14,6 @@ \include{./sections/_semantics.tex} \include{./sections/_rnn.tex} \include{./sections/_attention.tex} + \include{./sections/_llm.tex} \end{document} \ No newline at end of file diff --git a/src/year2/natural-language-processing/sections/_attention.tex b/src/year2/natural-language-processing/sections/_attention.tex index aba1cd4..bf8bef5 100644 --- a/src/year2/natural-language-processing/sections/_attention.tex +++ b/src/year2/natural-language-processing/sections/_attention.tex @@ -121,12 +121,15 @@ -\section{Transformers} +\section{Transformer decoder (for language modelling)} + + +\subsection{Self-attention} \begin{description} \item[Self-attention] 
\marginnote{Self-attention} Component that computes the representation of a token by taking the other tokens of the input sequence into account. - + Given an input embedding $\vec{x}_i \in \mathbb{R}^{1 \times d_\text{model}}$, self-attention relies on the following values: \begin{descriptionlist} \item[Queries] \marginnote{Queries} @@ -169,4 +172,75 @@ \vec{a}_i = \sum_{t: t \leq i} \vec{\alpha}_{i,t} \vec{v}_t \end{gathered} \] +\end{description} + + +\subsection{Components} + +\begin{description} + \item[Input embedding] \marginnote{Input embedding} + The input is tokenized using standard tokenizers (e.g., BPE, SentencePiece, \dots). Each token is encoded using a learned embedding matrix. + + \begin{figure}[H] + \centering + \includegraphics[width=0.45\linewidth]{./img/_transformer_embedding.pdf} + \end{figure} + + \item[Positional encoding] \marginnote{Positional encoding} + Learned position embeddings, which encode positional information, are added to the input token embeddings. + + \begin{remark} + Without positional encoding, transformers are invariant to permutations of the input tokens. + \end{remark} + + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/_positional_encoding.pdf} + \end{figure} + + \item[Transformer block] \marginnote{Transformer block} + Module with the same input and output dimensionality (so that multiple blocks can be stacked), composed of: + \begin{descriptionlist} + \item[Multi-head attention] \marginnote{Multi-head attention} + Uses $h$ different self-attention heads, each with its own queries, keys, and values. Each head produces value vectors of size $\frac{d_v}{h}$. The final projection $W_O$ is applied to the concatenation of the outputs of all heads. + + \begin{figure}[H] + \centering + \includegraphics[width=0.5\linewidth]{./img/_multi_head_attention.pdf} + \end{figure} + + \item[Feedforward layer] + Fully-connected two-layer network applied independently at each position of the attention output: + \[ \texttt{FFN}(\vec{x}_i) = \texttt{ReLU}(\vec{x}_i\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \] + where the hidden dimension $d_\text{ff}$ is usually larger than $d_\text{model}$. + + \item[Normalization layer] + Applies token-wise normalization (i.e., layer norm) to improve training stability. + + \item[Residual connection] + Helps to propagate information during training. + + \begin{remark}[Residual stream] + An interpretation of residual connections is the residual stream, in which the representation of the input token is progressively enriched by the outputs of multi-head attention and the feedforward network. + + \begin{figure}[H] + \centering + \includegraphics[width=0.38\linewidth]{./img/_residual_stream.pdf} + \end{figure} + \end{remark} + \end{descriptionlist} + + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/_attention_block.pdf} + \caption{Overall attention block} + \end{figure} + + \item[Language modelling head] \marginnote{Language modelling head} + Takes the output of the stack of transformer blocks at a given token position and produces a probability distribution over the vocabulary. + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/_lm_head.pdf} + \end{figure}
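The following is a minimal NumPy sketch (an illustration added to these notes, not part of the original material) of a single attention head with a causal mask, followed by a language modelling head. Multi-head projection, the feedforward layer, normalization, and residual connections are omitted, and all sizes and weights are arbitrary placeholders.

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
n, d_model, d_k, d_v, vocab = 5, 16, 8, 8, 100    # toy sizes (assumed)

X = rng.normal(size=(n, d_model))                 # token + position embeddings
W_Q = rng.normal(size=(d_model, d_k))             # query projection
W_K = rng.normal(size=(d_model, d_k))             # key projection
W_V = rng.normal(size=(d_model, d_v))             # value projection

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)       # numerical stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

# Scaled dot-product scores with a causal mask: token i attends to t <= i.
Q, K, V = X @ W_Q, X @ W_K, X @ W_V
scores = Q @ K.T / np.sqrt(d_k)
scores[np.triu(np.ones((n, n), dtype=bool), k=1)] = -np.inf
A = softmax(scores) @ V                           # attention outputs a_i

# Language modelling head: project to the vocabulary and normalize.
W_U = rng.normal(size=(d_v, vocab))
next_token_probs = softmax(A[-1] @ W_U)           # distribution for the last token
\end{verbatim}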
\end{description} \ No newline at end of file diff --git a/src/year2/natural-language-processing/sections/_llm.tex b/src/year2/natural-language-processing/sections/_llm.tex new file mode 100644 index 0000000..2080d51 --- /dev/null +++ b/src/year2/natural-language-processing/sections/_llm.tex @@ -0,0 +1,114 @@ +\chapter{Large language models} + + +\begin{description} + \item[Conditional generation] \marginnote{Conditional generation} + Generate text conditioned on the input tokens (i.e., the prompt). + + \begin{figure}[H] + \centering + \includegraphics[width=0.45\linewidth]{./img/_conditional_generation.pdf} + \end{figure} + + \begin{example}[Sentiment analysis] + Given the prompt: + \[ p = \texttt{the sentiment of the sentence `I like Jackie Chan' is} \] + Determine the probability of the tokens \texttt{positive} and \texttt{negative}: + \[ + \prob{\texttt{positive} \mid p} \qquad \prob{\texttt{negative} \mid p} + \] + \end{example} + + \begin{example}[Question answering] + Given the prompt: + \[ p = \texttt{Q: who wrote the book `The origin of Species'? A:} \] + Determine the tokens of the answer autoregressively: + \[ + \arg\max_{w_1} \prob{w_1 \mid p}, \arg\max_{w_2} \prob{w_2 \mid pw_1}, \dots + \] + \end{example} +\end{description} + + +\section{Decoding strategies} + +\begin{description} + \item[Greedy decoding] \marginnote{Greedy decoding} + Select the next token as the most probable one in the output distribution. + + \begin{remark} + Greedy decoding risks getting stuck in a local optimum. + + \indenttbox + \begin{example} + Consider the following search tree of possible generated sequences: + \begin{figure}[H] + \centering + \includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf} + \end{figure} + + Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$. + \end{example} + \end{remark} + + \item[Beam search] \marginnote{Beam search} + Given a beam width $k$, perform a breadth-first search that, at each branching level, keeps only the top-$k$ partial sequences (hypotheses) ranked by their log-probability: + \[ \log\left( \prob{y \mid x} \right) = \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \] + + \begin{example} + Consider the following tree with beam width $k=2$: + \begin{figure}[H] + \centering + \includegraphics[width=0.7\linewidth]{./img/_beam_search.pdf} + \end{figure} + The selected sequence is \texttt{[BOS] the green witch arrived [EOS]}. + \end{example} + + \begin{remark} + As different paths might generate sequences of different lengths, the score is usually normalized by the number of tokens: + \[ \log\left( \prob{y \mid x} \right) = \frac{1}{t} \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \] + \end{remark} + + \begin{remark} + The likelihood of the sequence found by beam search is at least as high as that of the greedy one. However, beam search is still not guaranteed to return the most probable sequence. + \end{remark} + + \item[Sampling] \marginnote{Sampling} + Sample the next token from the output distribution. + + \begin{description} + \item[Random sampling] + Sample from the distribution over the whole vocabulary. + + \begin{remark} + Although each low-probability word is individually unlikely (and most of them are unreasonable as the next token), their combined probability mass is relatively high, so the chance of selecting one of them is non-negligible. + \end{remark}
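As a small illustration (added to these notes, not part of the original material), the following NumPy sketch performs random sampling from a toy next-token distribution and measures how much probability mass the individually unlikely words carry in total; the distribution itself is an arbitrary placeholder.

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

# Toy next-token distribution over 1000 words: a few plausible words
# plus a long tail of individually unlikely ones.
probs = np.full(1000, 0.0004)                  # tail: 996 words, tiny probability each
probs[:4] = [0.35, 0.15, 0.07, 0.0316]         # a few plausible candidates
probs /= probs.sum()                           # make it sum exactly to 1

next_token = rng.choice(len(probs), p=probs)   # random sampling over the whole vocabulary

print("sampled token id:", next_token)
print("total mass of the tail:", round(probs[4:].sum(), 2))   # ~0.4
\end{verbatim}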
+ + \item[Temperature sampling] + Skew the distribution to emphasize the most likely words and decrease the probability of the less likely ones. Given the logits $\vec{u}$ and the temperature $\tau$, the output distribution $\vec{y}$ is determined as: + \[ \vec{y} = \texttt{softmax}\left( \frac{\vec{u}}{\tau} \right) \] + where: + \begin{itemize} + \item Higher temperatures (i.e., $\tau > 1$) flatten the distribution, giving low-probability words a higher chance of being selected. + \item Lower temperatures (i.e., $\tau \in (0, 1)$) concentrate the distribution on high-probability words, while $\tau = 1$ leaves it unchanged. + \begin{remark} + In the limit $\tau \to 0$, temperature sampling reduces to greedy decoding. + \end{remark} + \end{itemize} + + + \item[Top-k sampling] + Consider the top-$k$ most probable words and apply random sampling on their renormalized distribution. + + \begin{remark} + $k=1$ corresponds to greedy decoding. + \end{remark} + + \begin{remark} + $k$ is fixed and does not account for the shape of the distribution. + \end{remark} + + \item[Top-p sampling] + Consider the smallest set of most likely words whose cumulative probability mass reaches $p$. Then, apply random sampling on their renormalized distribution. A code sketch of these sampling strategies is given below. + \end{description} +\end{description} \ No newline at end of file
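As a closing illustration (added to these notes, not part of the original material), the following NumPy sketch implements temperature, top-$k$, and top-$p$ sampling for a toy logit vector; the logits and the values of $\tau$, $k$, and $p$ are arbitrary placeholders.

\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def softmax(z):
    z = z - z.max()                               # numerical stability
    e = np.exp(z)
    return e / e.sum()

logits = rng.normal(size=50)                      # toy next-token logits

def temperature_sample(logits, tau=0.7):
    # Sample from softmax(logits / tau): tau < 1 sharpens, tau > 1 flattens.
    return rng.choice(len(logits), p=softmax(logits / tau))

def top_k_sample(logits, k=10):
    # Keep the k most probable tokens, renormalize, then sample.
    probs = softmax(logits)
    top = np.argsort(probs)[-k:]
    return rng.choice(top, p=probs[top] / probs[top].sum())

def top_p_sample(logits, p=0.9):
    # Keep the smallest set of tokens whose cumulative mass reaches p,
    # renormalize, then sample (nucleus sampling).
    probs = softmax(logits)
    order = np.argsort(probs)[::-1]               # tokens by decreasing probability
    cutoff = np.searchsorted(np.cumsum(probs[order]), p) + 1
    nucleus = order[:cutoff]
    return rng.choice(nucleus, p=probs[nucleus] / probs[nucleus].sum())

print(temperature_sample(logits), top_k_sample(logits), top_p_sample(logits))
\end{verbatim}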