mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2026-06-21 23:42:24 +00:00
Add NLP transformer + decoding strategies
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -14,5 +14,6 @@
|
||||
\include{./sections/_semantics.tex}
|
||||
\include{./sections/_rnn.tex}
|
||||
\include{./sections/_attention.tex}
|
||||
\include{./sections/_llm.tex}
|
||||
|
||||
\end{document}
|
||||
@@ -121,12 +121,15 @@
|
||||
|
||||
|
||||
|
||||
\section{Transformers}
|
||||
\section{Transformer decoder (for language modelling)}
|
||||
|
||||
|
||||
\subsection{Self-attention}
|
||||
|
||||
\begin{description}
|
||||
\item[Self-attention] \marginnote{Self-attention}
|
||||
Component that computes the representation of a token by taking into account the other tokens in the input sequence.
|
||||
|
||||
|
||||
Given an input embedding $\vec{x}_i \in \mathbb{R}^{1 \times d_\text{model}}$, self-attention relies on the following values:
|
||||
\begin{descriptionlist}
|
||||
\item[Queries] \marginnote{Queries}
|
||||
@@ -169,4 +172,75 @@
|
||||
\vec{a}_i = \sum_{t: t \leq i} \vec{\alpha}_{i,t} \vec{v}_t
|
||||
\end{gathered}
|
||||
\]
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Components}
|
||||
|
||||
\begin{description}
|
||||
\item[Input embedding] \marginnote{Input embedding}
|
||||
The input is tokenized using standard tokenizers (e.g., BPE, SentencePiece, \dots). Each token is encoded using a learned embedding matrix.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.45\linewidth]{./img/_transformer_embedding.pdf}
|
||||
\end{figure}
|
||||
|
||||
\item[Positional encoding] \marginnote{Positional encoding}
|
||||
Learned position embeddings to encode positional information are added to the input token embeddings.
|
||||
|
||||
\begin{remark}
|
||||
Without positional encoding, transformers are invariant to permutations.
|
||||
\end{remark}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.4\linewidth]{./img/_positional_encoding.pdf}
|
||||
\end{figure}
|
||||
|
||||
\item[Transformer block] \marginnote{Transformer block}
|
||||
Module with the same input and output dimensionality (i.e., allows stacking multiple blocks) composed of:
|
||||
\begin{descriptionlist}
|
||||
\item[Multi-head attention] \marginnote{Multi-head attention}
|
||||
Uses $h$ different self-attention blocks with different queries, keys, and values. Value vectors are of size $\frac{d_v}{h}$. The final projection $W_O$ is applied on the concatenation of the outputs of each head.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/_multi_head_attention.pdf}
|
||||
\end{figure}
|
||||
|
||||
\item[Feedforward layer]
|
||||
Fully-connected 2-layer network applied at each position of the attention output:
|
||||
\[ \texttt{FFN}(\vec{x}_i) = \texttt{ReLU}(\vec{x}_i\matr{W}_1 + \vec{b}_1)\matr{W}_2 + \vec{b}_2 \]
|
||||
Where the hidden dimension $d_\text{ff}$ is usually larger than $d_\text{model}$.
|
||||
|
||||
\item[Normalization layer]
|
||||
Applies token-wise normalization (i.e., layer norm) to help training stability.
|
||||
|
||||
\item[Residual connection]
|
||||
Helps to propagate information during training.
|
||||
|
||||
\begin{remark}[Residual stream]
|
||||
An interpretation of residual connections is the residual stream, where the input token is enhanced by the output of multi-head attention and the feedforward network.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.38\linewidth]{./img/_residual_stream.pdf}
|
||||
\end{figure}
|
||||
\end{remark}
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.4\linewidth]{./img/_attention_block.pdf}
|
||||
\caption{Overall attention block}
|
||||
\end{figure}
|
||||
|
||||
\item[Language modelling head] \marginnote{Language modelling head}
|
||||
Takes as input the output of the stack of transformer blocks corresponding to a token and outputs a distribution over the vocabulary.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.6\linewidth]{./img/_lm_head.pdf}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
@@ -0,0 +1,114 @@
|
||||
\chapter{Large language models}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[Conditional generation] \marginnote{Conditional generation}
|
||||
Generate text conditioned on the input tokens (i.e., prompt).
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.45\linewidth]{./img/_conditional_generation.pdf}
|
||||
\end{figure}
|
||||
|
||||
\begin{example}[Sentiment analysis]
|
||||
Given the prompt:
|
||||
\[ p = \texttt{the sentiment of the sentence `I like Jackie Chan' is} \]
|
||||
Determine the probability of the tokens \texttt{positive} and \texttt{negative}:
|
||||
\[
|
||||
\prob{\texttt{positive} \mid p} \qquad \prob{\texttt{negative} \mid p}
|
||||
\]
|
||||
\end{example}
|
||||
|
||||
\begin{example}[Question answering]
|
||||
Given the prompt:
|
||||
\[ p = \texttt{Q: who wrote the book `The origin of Species'? A:} \]
|
||||
Determine the tokens of the answer autoregressively:
|
||||
\[
|
||||
\arg\max_{w_1} \prob{w_1 \mid p}, \arg\max_{w_2} \prob{w_2 \mid pw_1}, \dots
|
||||
\]
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Decoding strategies}
|
||||
|
||||
\begin{description}
|
||||
\item[Greedy decoding] \marginnote{Greedy decoding}
|
||||
Select the next token as the most probable of the output distribution.
|
||||
|
||||
\begin{remark}
|
||||
Greedy decoding risks getting stuck in a local optimum.
|
||||
|
||||
\indenttbox
|
||||
\begin{example}
|
||||
Consider the following search tree of possible generated sequences:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
|
||||
\end{figure}
|
||||
|
||||
Greedy search would select the sequence \texttt{yes yes} which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
|
||||
\end{example}
|
||||
\end{remark}
|
||||
|
||||
\item[Beam search] \marginnote{Beam search}
|
||||
Given a beam width $k$, perform a breadth-first search keeping at each branching level the top-$k$ partial sequences based on their log-probability:
|
||||
\[ \log\left( \prob{y \mid x} \right) = \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \]
|
||||
|
||||
\begin{example}
|
||||
Consider the following tree with beam width $k=2$:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/_beam_search.pdf}
|
||||
\end{figure}
|
||||
The selected sequence is \texttt{[BOS] the green witch arrived [EOS]}.
|
||||
\end{example}
|
||||
|
||||
\begin{remark}
|
||||
As each path might generate sequences of different length, the score is usually normalized by the number of tokens as:
|
||||
\[ \texttt{score}(y) = \frac{1}{t} \sum_{i=1}^{t} \log\left( \prob{ y_i \mid x, y_1, \dots, y_{i-1} } \right) \]
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
The likelihood of the sequences generated using beam search is generally higher than using greedy decoding. However, beam search is still not guaranteed to find the optimal sequence.
|
||||
\end{remark}
|
||||
|
||||
\item[Sampling] \marginnote{Sampling}
|
||||
Sample the next token based on the output distribution.
|
||||
|
||||
\begin{description}
|
||||
\item[Random sampling]
|
||||
Sample considering the distribution over the whole vocabulary.
|
||||
|
||||
\begin{remark}
|
||||
Although each low-probability word (which is most likely unreasonable as the next token) is individually unlikely, their probabilities add up, so the overall chance of selecting one of them is relatively high.
|
||||
\end{remark}
|
||||
|
||||
\item[Temperature sampling]
|
||||
Reshape the distribution to control how strongly the most likely words are emphasized over the less likely ones. Given the logits $\vec{u}$ and the temperature $\tau$, the output distribution $\vec{y}$ is determined as:
|
||||
\[ \vec{y} = \texttt{softmax}\left( \frac{\vec{u}}{\tau} \right) \]
|
||||
where:
|
||||
\begin{itemize}
|
||||
\item Higher temperatures (i.e., $\tau > 1$) allow for considering low-probability words.
|
||||
\item Lower temperatures (i.e., $\tau \in (0, 1)$) focus on high-probability words.
|
||||
\begin{remark}
|
||||
In the limit $\tau \to 0$, temperature sampling corresponds to greedy decoding.
|
||||
\end{remark}
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\item[Top-k sampling]
|
||||
Consider the top-$k$ most probable words and apply random sampling on their normalized distribution.
|
||||
|
||||
\begin{remark}
|
||||
$k=1$ corresponds to greedy decoding.
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
$k$ is fixed and does not account for the shape of the distribution.
|
||||
\end{remark}
|
||||
|
||||
\item[Top-p sampling]
|
||||
Consider the smallest set of most likely words whose cumulative probability mass is at least $p$. Then, apply random sampling on their normalized distribution.
|
||||
\end{description}
|
||||
\end{description}
|
||||
Reference in New Issue
Block a user