mirror of
https://github.com/NotXia/unibo-ai-notes.git
Add NLP transformer + decoding strategies
BIN  src/year2/natural-language-processing/img/_attention_block.pdf  (new file, binary file not shown)
BIN  src/year2/natural-language-processing/img/_beam_search.pdf  (new file, binary file not shown)
BIN  src/year2/natural-language-processing/img/_lm_head.pdf  (new file, binary file not shown)
BIN  src/year2/natural-language-processing/img/_residual_stream.pdf  (new file, binary file not shown)
@@ -14,5 +14,6 @@
\include{./sections/_semantics.tex}
\include{./sections/_rnn.tex}
\include{./sections/_attention.tex}
\include{./sections/_llm.tex}

\end{document}
@@ -121,12 +121,15 @@

\section{Transformers}
\section{Transformer decoder (for language modelling)}


\subsection{Self-attention}

\begin{description}
\item[Self-attention] \marginnote{Self-attention}
Component that allows computing the representation of a token by taking into account the other tokens in the input sequence.

Given an input embedding $\vec{x}_i \in \mathbb{R}^{1 \times d_\text{model}}$, self-attention relies on the following values:
\begin{descriptionlist}
\item[Queries] \marginnote{Queries}
@@ -169,4 +172,75 @@
\vec{a}_i = \sum_{t: t \leq i} \vec{\alpha}_{i,t} \vec{v}_t
\end{gathered}
\]
\end{description}
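
As an illustration, the following is a minimal NumPy sketch (not part of the original notes) of single-head causal self-attention in its usual scaled dot-product form; the weight matrices, shapes, and names are illustrative placeholders.
\begin{verbatim}
import numpy as np

def causal_self_attention(X, W_q, W_k, W_v):
    """Single-head causal self-attention.
    X: (n, d_model) input embeddings; W_q, W_k: (d_model, d_k); W_v: (d_model, d_v)."""
    Q, K, V = X @ W_q, X @ W_k, X @ W_v
    scores = Q @ K.T / np.sqrt(K.shape[-1])      # scaled dot-product scores
    mask = np.triu(np.ones_like(scores), k=1)    # 1 above the diagonal = future tokens
    scores = np.where(mask == 1, -np.inf, scores)
    alpha = np.exp(scores - scores.max(axis=-1, keepdims=True))
    alpha /= alpha.sum(axis=-1, keepdims=True)   # row-wise softmax over t <= i
    return alpha @ V                             # a_i = sum_{t <= i} alpha_{i,t} v_t

# Usage with random weights: 5 tokens, d_model=16, d_k=d_v=8
rng = np.random.default_rng(0)
X = rng.normal(size=(5, 16))
A = causal_self_attention(X, rng.normal(size=(16, 8)),
                          rng.normal(size=(16, 8)), rng.normal(size=(16, 8)))
print(A.shape)  # (5, 8)
\end{verbatim}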


\subsection{Components}

\begin{description}
\item[Input embedding] \marginnote{Input embedding}
The input is tokenized using standard tokenizers (e.g., BPE, SentencePiece, \dots). Each token is encoded using a learned embedding matrix.

\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_transformer_embedding.pdf}
\end{figure}

\item[Positional encoding] \marginnote{Positional encoding}
Learned position embeddings are added to the input token embeddings to encode positional information.

\begin{remark}
Without positional encoding, transformers are invariant to permutations of the input tokens.
\end{remark}

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_positional_encoding.pdf}
\end{figure}

\item[Transformer block] \marginnote{Transformer block}
Module with the same input and output dimensionality (which allows stacking multiple blocks), composed of the following components (a code sketch combining them is given at the end of this subsection):
\begin{descriptionlist}
\item[Multi-head attention] \marginnote{Multi-head attention}
Uses $h$ different self-attention blocks with different queries, keys, and values. Value vectors are of size $\frac{d_v}{h}$. The final projection $\matr{W}_O$ is applied to the concatenation of the outputs of each head.

\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/_multi_head_attention.pdf}
\end{figure}

\item[Feedforward layer]
Fully-connected 2-layer network applied independently at each position of the attention output:
\[ \texttt{FFN}(\vec{x}_i) = \texttt{ReLU}(\vec{x}_i\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \]
where the hidden dimension $d_\text{ff}$ is usually larger than $d_\text{model}$.

\item[Normalization layer]
Applies token-wise normalization (i.e., layer norm) to improve training stability.

\item[Residual connection]
Helps to propagate information during training.

\begin{remark}[Residual stream]
An interpretation of residual connections is the residual stream, in which the representation of the input token is progressively enhanced by the outputs of multi-head attention and of the feedforward network.

\begin{figure}[H]
\centering
\includegraphics[width=0.38\linewidth]{./img/_residual_stream.pdf}
\end{figure}
\end{remark}
\end{descriptionlist}

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_attention_block.pdf}
\caption{Overall attention block}
\end{figure}

\item[Language modelling head] \marginnote{Language modelling head}
Takes as input the output of the stack of transformer blocks at a given token position and produces a distribution over the vocabulary.

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_lm_head.pdf}
\end{figure}
\end{description}
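
As referenced above, the following is a minimal NumPy sketch (not part of the original notes) combining the components of this subsection: token and learned position embeddings, multi-head causal attention, the feedforward layer, layer normalization, residual connections, and the language modelling head. The pre-norm placement of layer normalization, all shapes, and all names are illustrative assumptions.
\begin{verbatim}
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def layer_norm(x, eps=1e-5):
    # Token-wise normalization (learned scale/shift omitted for brevity).
    return (x - x.mean(axis=-1, keepdims=True)) / np.sqrt(x.var(axis=-1, keepdims=True) + eps)

def embed(token_ids, E, P):
    # Learned token embeddings plus learned position embeddings.
    return E[token_ids] + P[np.arange(len(token_ids))]

def multi_head_attention(X, heads, W_O):
    # heads: list of (W_q, W_k, W_v); outputs are concatenated and projected by W_O.
    n = X.shape[0]
    causal = np.triu(np.full((n, n), -np.inf), k=1)   # mask future positions
    outs = []
    for W_q, W_k, W_v in heads:
        Q, K, V = X @ W_q, X @ W_k, X @ W_v
        alpha = softmax(Q @ K.T / np.sqrt(K.shape[-1]) + causal)
        outs.append(alpha @ V)
    return np.concatenate(outs, axis=-1) @ W_O

def transformer_block(X, heads, W_O, W_1, b_1, W_2, b_2):
    # Residual stream: X is enhanced by attention, then by the feedforward network.
    X = X + multi_head_attention(layer_norm(X), heads, W_O)   # pre-norm (assumption)
    H = np.maximum(0, layer_norm(X) @ W_1 + b_1)              # ReLU(x W_1 + b_1)
    return X + H @ W_2 + b_2

def lm_head(x_last, W_U):
    # Map a token's final representation to a distribution over the vocabulary.
    return softmax(x_last @ W_U)

# Tiny usage example with random parameters (vocab=10, d_model=16, 2 heads).
rng = np.random.default_rng(0)
vocab, n_ctx, d, h, d_ff = 10, 8, 16, 2, 64
E, P = rng.normal(size=(vocab, d)), rng.normal(size=(n_ctx, d))
heads = [(rng.normal(size=(d, d // h)), rng.normal(size=(d, d // h)),
          rng.normal(size=(d, d // h))) for _ in range(h)]
X = embed([3, 1, 4, 1, 5], E, P)
Y = transformer_block(X, heads, rng.normal(size=(d, d)),
                      rng.normal(size=(d, d_ff)), np.zeros(d_ff),
                      rng.normal(size=(d_ff, d)), np.zeros(d))
print(lm_head(Y[-1], rng.normal(size=(d, vocab))))  # next-token distribution
\end{verbatim}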
114  src/year2/natural-language-processing/sections/_llm.tex  (new file)
@@ -0,0 +1,114 @@
\chapter{Large language models}

\begin{description}
\item[Conditional generation] \marginnote{Conditional generation}
Generate text conditioned on the input tokens (i.e., the prompt). A code sketch covering both examples below is given after them.

\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/_conditional_generation.pdf}
\end{figure}

\begin{example}[Sentiment analysis]
Given the prompt:
\[ p = \texttt{the sentiment of the sentence `I like Jackie Chan' is} \]
Determine the probability of the tokens \texttt{positive} and \texttt{negative}:
\[
\prob{\texttt{positive} \mid p} \qquad \prob{\texttt{negative} \mid p}
\]
\end{example}

\begin{example}[Question answering]
Given the prompt:
\[ p = \texttt{Q: who wrote the book `The origin of Species'? A:} \]
Determine the tokens of the answer autoregressively:
\[
\arg\max_{w_1} \prob{w_1 \mid p}, \arg\max_{w_2} \prob{w_2 \mid pw_1}, \dots
\]
\end{example}
\end{description}
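
A minimal Python sketch of the two examples above. The function \texttt{next\_token\_distribution} is a hypothetical interface (not defined in these notes) that is assumed to return, for a given prefix, the probability of every vocabulary token as the next token.
\begin{verbatim}
# Hypothetical interface: given a prefix (list of tokens), return a dict
# mapping each vocabulary token to its probability as the next token.
def next_token_distribution(prefix):
    raise NotImplementedError  # provided by the language model

def classify_sentiment(prompt):
    # Sentiment analysis: compare P(positive | p) and P(negative | p).
    dist = next_token_distribution(prompt)
    return "positive" if dist["positive"] >= dist["negative"] else "negative"

def answer_question(prompt, max_tokens=20, eos="[EOS]"):
    # Question answering: generate the answer autoregressively (greedy choice at each step).
    answer, prefix = [], list(prompt)
    for _ in range(max_tokens):
        dist = next_token_distribution(prefix)
        token = max(dist, key=dist.get)   # argmax_w P(w | p w_1 ... w_{i-1})
        if token == eos:
            break
        answer.append(token)
        prefix.append(token)
    return answer
\end{verbatim}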


\section{Decoding strategies}

\begin{description}
\item[Greedy decoding] \marginnote{Greedy decoding}
Select as the next token the most probable one in the output distribution (a code sketch is given at the end of this section).

\begin{remark}
Greedy decoding risks getting stuck in a local optimum.

\indenttbox
\begin{example}
Consider the following search tree of possible generated sequences:
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}

Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\end{example}
\end{remark}

\item[Beam search] \marginnote{Beam search}
Given a beam width $k$, perform a breadth-first search keeping, at each step, the top-$k$ partial sequences ranked by their log-probability (a code sketch is given at the end of this section):
\[ \log\left( \prob{y \mid x} \right) = \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \]

\begin{example}
Consider the following tree with beam width $k=2$:
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_beam_search.pdf}
\end{figure}
The selected sequence is \texttt{[BOS] the green witch arrived [EOS]}.
\end{example}

\begin{remark}
As each path might generate sequences of different lengths, the score is usually normalized by the number of tokens:
\[ \log\left( \prob{y \mid x} \right) = \frac{1}{t} \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \]
\end{remark}

\begin{remark}
The likelihood of the sequences generated using beam search is typically higher than with greedy decoding. However, beam search is still not guaranteed to find the most likely sequence.
\end{remark}

\item[Sampling] \marginnote{Sampling}
Sample the next token from the output distribution (code sketches of the variants below are given at the end of this section).

\begin{description}
\item[Random sampling]
Sample from the distribution over the whole vocabulary.

\begin{remark}
Although each low-probability word is individually unlikely to be a reasonable next token, their combined probability mass is large, so the chance of selecting one of them is relatively high.
\end{remark}

\item[Temperature sampling]
Skew the distribution to emphasize the most likely words and decrease the probability of less likely words. Given the logits $\vec{u}$ and the temperature $\tau$, the output distribution $\vec{y}$ is determined as:
\[ \vec{y} = \texttt{softmax}\left( \frac{\vec{u}}{\tau} \right) \]
where:
\begin{itemize}
\item Higher temperatures (i.e., $\tau > 1$) allow for considering low-probability words.
\item Lower temperatures (i.e., $\tau \in (0, 1]$) focus on high-probability words.
\begin{remark}
In the limit $\tau \to 0$, temperature sampling corresponds to greedy decoding.
\end{remark}
\end{itemize}

\item[Top-k sampling]
Consider the top-$k$ most probable words and apply random sampling on their renormalized distribution.

\begin{remark}
$k=1$ corresponds to greedy decoding.
\end{remark}

\begin{remark}
$k$ is fixed and does not account for the shape of the distribution.
\end{remark}

\item[Top-p sampling]
Consider the smallest set of most likely words whose cumulative probability mass adds up to (at least) $p$. Then, apply random sampling on their renormalized distribution.
\end{description}

\end{description}
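
The sketches below illustrate the decoding strategies above; they are minimal Python illustrations (not part of the original notes) written against the same hypothetical \texttt{next\_token\_distribution(prefix)} interface introduced earlier, which maps each vocabulary token to its probability as the next token. First, greedy decoding:
\begin{verbatim}
def greedy_decode(prompt, next_token_distribution, max_tokens=50, eos="[EOS]"):
    # At each step, pick the single most probable next token.
    sequence = list(prompt)
    for _ in range(max_tokens):
        dist = next_token_distribution(sequence)   # dict: token -> probability
        token = max(dist, key=dist.get)
        sequence.append(token)
        if token == eos:
            break
    return sequence
\end{verbatim}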
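
Beam search with length-normalized log-probabilities, as in the remark above:
\begin{verbatim}
import math

def beam_search(prompt, next_token_distribution, k=2, max_tokens=50, eos="[EOS]"):
    # Each hypothesis is a pair (tokens, sum of log-probabilities).
    score = lambda tokens, logp: logp / (len(tokens) - len(prompt))  # length normalization
    beams = [(list(prompt), 0.0)]
    finished = []
    for _ in range(max_tokens):
        candidates = []
        for tokens, logp in beams:
            for token, p in next_token_distribution(tokens).items():
                if p > 0:
                    candidates.append((tokens + [token], logp + math.log(p)))
        # Keep only the top-k candidates according to the normalized score.
        candidates.sort(key=lambda c: score(*c), reverse=True)
        beams = []
        for tokens, logp in candidates[:k]:
            (finished if tokens[-1] == eos else beams).append((tokens, logp))
        if not beams:
            break
    finished.extend(beams)
    return max(finished, key=lambda c: score(*c))[0]
\end{verbatim}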
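
Temperature, top-$k$, and top-$p$ sampling applied to a vector of logits (NumPy); the default values of $\tau$, $k$, and $p$ are illustrative:
\begin{verbatim}
import numpy as np

def softmax(u):
    e = np.exp(u - u.max())
    return e / e.sum()

def temperature_sample(logits, tau=0.7, rng=np.random.default_rng()):
    # y = softmax(u / tau): tau > 1 flattens the distribution, tau < 1 sharpens it.
    probs = softmax(np.asarray(logits, dtype=float) / tau)
    return rng.choice(len(probs), p=probs)

def top_k_sample(logits, k=50, rng=np.random.default_rng()):
    # Keep the k most probable tokens and renormalize before sampling.
    logits = np.asarray(logits, dtype=float)
    top = np.argsort(logits)[-k:]
    probs = softmax(logits[top])
    return top[rng.choice(len(top), p=probs)]

def top_p_sample(logits, p=0.9, rng=np.random.default_rng()):
    # Keep the smallest set of tokens whose cumulative probability mass reaches p.
    probs = softmax(np.asarray(logits, dtype=float))
    order = np.argsort(probs)[::-1]
    cutoff = np.searchsorted(np.cumsum(probs[order]), p) + 1
    kept = order[:cutoff]
    return kept[rng.choice(len(kept), p=probs[kept] / probs[kept].sum())]
\end{verbatim}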