Add NLP encoder-decoder + distillation

2024-12-03 20:24:26 +01:00
parent 2236cc91fd
commit 03efe2bcca
7 changed files with 280 additions and 80 deletions

View File

@ -15,6 +15,6 @@
\include{./sections/_rnn.tex}
\include{./sections/_attention.tex}
\include{./sections/_llm.tex}
\include{./sections/_mlm.tex}
\include{./sections/_model_efficiency.tex}
\end{document}

View File

@ -1,6 +1,9 @@
\chapter{Large language models}
\section{Decoder-only architecture} \label{sec:llm}
\begin{description}
\item[Conditional generation] \marginnote{Conditional generation}
Generate text conditioned on the input tokens (i.e., prompt).
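\begin{remark}
Formally, generation is autoregressive: denoting the prompt tokens by $x$ and the generated tokens by $y_1, \dots, y_T$, the model factorizes the conditional probability as
\[ P(y_1, \dots, y_T \mid x) = \prod_{t=1}^{T} P(y_t \mid y_{<t}, x) \]
where each factor is the distribution over the vocabulary predicted at step $t$.
\end{remark}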
@ -30,7 +33,7 @@
\end{description}
\section{Decoding strategies}
\subsection{Decoding strategies}
\begin{description}
\item[Greedy decoding] \marginnote{Greedy decoding}
@ -42,12 +45,17 @@
\indenttbox
\begin{example}
Consider the following search tree of possible generated sequences:
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}
Greedy search would select the sequence \texttt{yes yes} which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[b]{0.6\linewidth}
Greedy search would select the sequence \texttt{yes yes} which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\end{minipage}
\end{example}
\end{remark}
@ -114,10 +122,6 @@
\end{description}
\section{Training}
\subsection{Pre-training}
\begin{description}
@ -168,20 +172,7 @@
\begin{description}
\item[Parameter-efficient fine-tuning (PEFT)] \marginnote{Parameter-efficient fine-tuning (PEFT)}
Continue training a selected subset of parameters.
\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.
Given a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$, LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]
\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/_lora.pdf}
\end{figure}
\end{description}
Continue training only a selected subset of parameters (e.g., LoRA, \Cref{sec:lora}).
\item[Task-specific fine-tuning] \marginnote{Task-specific fine-tuning}
Add a new trainable head on top of the model.
@ -190,4 +181,183 @@
\item[Supervised fine-tuning] \marginnote{Supervised fine-tuning}
Continue training using a supervised dataset to align the model with human expectations.
\end{description}
\end{description}
\section{Encoder-only architecture} \label{sec:mlm}
\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.
\begin{remark}
This architecture performs feature extraction and is better suited for classification tasks.
\end{remark}
\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}
\item[Contextual embedding] \marginnote{Contextual embedding}
Represent the meaning of individual word instances (i.e., the embedding depends dynamically on the surrounding context).
\begin{remark}[Sequence embedding]
Encoders usually prepend a special classifier token (e.g., \texttt{[CLS]}) whose embedding represents the whole sequence.
\end{remark}
\begin{example}[Word sense disambiguation]
Task of determining the sense of each word in a sequence. Senses usually come from an existing ontology (e.g., WordNet). One approach to solve the problem is the following:
\begin{enumerate}
\item Compute the embedding $\vec{v}_i$ of words using a pre-trained encoder (e.g., BERT).
\item Represent the embedding of a sense $s$ as the average of the embeddings of the tokens annotated with that sense:
\[ \vec{v}_s = \frac{1}{n} \sum_i \vec{v}_i \]
\item Predict the sense of a word with embedding $\vec{t}$ as the sense whose embedding is closest:
\[ \arg\min_{s \in \texttt{senses}(\vec{t})} \texttt{distance}(\vec{t}, \vec{v}_s) \]
\end{enumerate}
\end{example}
\end{description}
\begin{description}
\item[Tokenizer fertility] \marginnote{Tokenizer fertility}
Average number of tokens needed to represent a word (i.e., the total number of subword tokens produced divided by the number of words).
\begin{remark}
Tokenizer fertility affects inference speed: higher fertility means longer token sequences to process.
\end{remark}
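\begin{example}
As an illustrative computation, if a tokenizer splits a corpus of $100$ words into $130$ subword tokens, its fertility on that corpus is $\frac{130}{100} = 1.3$ tokens per word.
\end{example}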
\item[Curse of multilinguality] \marginnote{Curse of multilinguality}
The performance of a multilingual model on each language tends to be worse than that of its monolingual counterpart.
\end{description}
\subsection{Pre-training}
\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Task of predicting missing or corrupted tokens in a sequence.
\begin{remark}
Transformer encoders output embeddings. For training purposes, a head that outputs a distribution over the vocabulary is added on top.
\end{remark}
\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and, for each sampled token, either:
\begin{itemize}
\item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replace it with a different token ($10\%$ of the time).
\item Do nothing ($10\%$ of the time).
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}
\indenttbox
\begin{remark}
BERT's training approach is inefficient as masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) dynamically determine the masks at training time, allowing for more variety.
\end{remark}
\end{example}
\item[Span masking] \marginnote{Span masking}
Mask contiguous spans of words to obtain a harder training objective.
\begin{remark}
This approach generally produces better embeddings.
\end{remark}
\end{description}
\subsection{Fine-tuning}
\begin{description}
\item[Fine-tuning for classification]
Add a classification head on top of the classifier token.
\item[Fine-tuning for sequence-pair classification]
Use a model pre-trained to process pairs of sequences. This is usually done by means of a special separator token (e.g., \texttt{[SEP]} in BERT).
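\begin{example}
With BERT, for instance, a sequence pair (e.g., a premise and a hypothesis for natural language inference) is encoded as a single input of the form \texttt{[CLS] A [SEP] B [SEP]} (where \texttt{A} and \texttt{B} stand for the two sequences), and the classification head is applied to the \texttt{[CLS]} embedding.
\end{example}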
\item[Fine-tuning for sequence labeling]
Add a classification head on top of each token. A conditional random field (CRF) layer can also be added to produce globally more coherent tag sequences.
\begin{description}
\item[Named entity recognition (NER)] \marginnote{Named entity recognition (NER)}
Task of assigning to each word of a sequence its entity class. NER taggers usually also need to capture concepts spanning multiple tokens. To achieve this, additional positional information is attached to the entity class (see the example after the list):
\begin{descriptionlist}
\item[Begin] Starting token of a concept.
\item[Inside] Token belonging to the same span of the previous one.
\item[End] Last token of a span.
\item[Outside] Token that does not belong to any entity.
\end{descriptionlist}
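\begin{example}
Using the positional tags above (with \texttt{PER} and \texttt{LOC} as example entity classes), the sequence \texttt{Barack Hussein Obama visited New York} could be tagged as:
\[ \texttt{Barack}_\text{B-PER} \quad \texttt{Hussein}_\text{I-PER} \quad \texttt{Obama}_\text{E-PER} \quad \texttt{visited}_\text{O} \quad \texttt{New}_\text{B-LOC} \quad \texttt{York}_\text{E-LOC} \]
\end{example}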
\begin{description}
\item[Metrics] \phantom{}
\begin{description}
\item[Recall] $\frac{\text{Correctly labeled responses}}{\text{Total that should have been labeled}}$
\item[Precision] $\frac{\text{Correctly labeled responses}}{\text{Total that has been labeled}}$
\end{description}
\begin{remark}
The entity (i.e., the whole span of text, not the single token) is the atomic unit for NER metrics.
\end{remark}
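\begin{example}
As an illustrative computation, suppose the ground truth contains $3$ entities and the tagger predicts $4$ entity spans, of which $2$ exactly match a ground-truth entity. Then precision is $\frac{2}{4} = 0.5$ and recall is $\frac{2}{3} \approx 0.67$.
\end{example}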
\end{description}
\end{description}
\end{description}
\begin{remark}[GLUE]
The General Language Understanding Evaluation (GLUE) benchmark is a common set of tasks used to evaluate natural language understanding models. It comprises single-sentence tasks, sentence-pair similarity and paraphrase tasks, and natural language inference tasks.
\end{remark}
\section{Encoder-decoder architecture}
\begin{description}
\item[Encoder-decoder architecture] \marginnote{Encoder-decoder architecture}
Model with both an encoder and decoder:
\begin{descriptionlist}
\item[Encoder]
Architecture as presented in \Cref{sec:mlm}. Its result is used to condition the output of the decoder.
\item[Decoder]
Architecture similar to the one presented in \Cref{sec:llm} with an additional cross-attention layer inserted after the causal self-attention.
\begin{description}
\item[Cross-attention] \marginnote{Cross-attention}
Attention layer that uses the output of the encoder as keys and values, while the queries come from the decoder (see the formulation after this list).
\end{description}
\end{descriptionlist}
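Concretely, cross-attention uses the standard scaled dot-product attention, where the queries are projected from the decoder hidden states $\matr{H}_\text{dec}$ and the keys and values from the encoder output $\matr{H}_\text{enc}$ ($\matr{W}^Q$, $\matr{W}^K$, $\matr{W}^V$ denote learned projection matrices and $d_k$ the key dimension):
\[ \matr{Q} = \matr{H}_\text{dec} \matr{W}^Q \qquad \matr{K} = \matr{H}_\text{enc} \matr{W}^K \qquad \matr{V} = \matr{H}_\text{enc} \matr{W}^V \]
\[ \text{CrossAttention}(\matr{Q}, \matr{K}, \matr{V}) = \text{softmax}\left( \frac{\matr{Q} \matr{K}^\top}{\sqrt{d_k}} \right) \matr{V} \]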
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_encoder_decoder.pdf}
\end{figure}
\end{description}
\subsection{Pre-training}
\begin{description}
\item[Span corruption] \marginnote{Span corruption}
Given an input sequence, replace spans of text of varying length with unique placeholder tokens (one per span). The encoder takes as input the corrupted sequence, while the decoder has to predict the missing words.
\begin{remark}
It has been observed that targeted span masking works better than random span masking.
\end{remark}
\begin{example}
Given the sequence:
\[ \texttt{<bos> thank you \underline{for inviting} me to your party \underline{last} week <eos>} \]
Some spans of text are masked with placeholder tokens as follows:
\[ \texttt{<bos> thank you <X> me to your party <Y> week <eos>} \]
The masked sequence is passed through the encoder, while the decoder has to predict the masked tokens:
\[ \texttt{<bos> <X> for inviting <Y> last <Z> <eos>} \]
\end{example}
\end{description}

View File

@ -1,55 +0,0 @@
\chapter{Masked language models}
\section{Bidirectional transformer encoder}
\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.
\begin{remark}
This architecture performs feature extraction and is better suited for classification tasks.
\end{remark}
\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}
\end{description}
\subsection{Masked language modelling}
\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Main training task of transformer encoders. It consists of predicting missing or corrupted tokens in a sequence.
\begin{remark}
Transformer encoders output embeddings. For training purposes, a head that outputs a distribution over the vocabulary is added on top.
\end{remark}
\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and, for each sampled token, either:
\begin{itemize}
\item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replace it with a different token ($10\%$ of the time).
\item Do nothing ($10\%$ of the time).
\end{itemize}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}
\indenttbox
\begin{remark}
BERT's training approach is inefficient as masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) dynamically determine the masks at training time, allowing for more variety.
\end{remark}
\end{example}
\end{description}

View File

@ -0,0 +1,85 @@
\chapter{Efficient model utilization}
\section{Low-rank adaptation} \label{sec:lora}
\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.
Given a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$, LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]
\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_lora.pdf}
\end{figure}
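\begin{example}
As an illustration with arbitrarily chosen dimensions, for $d = k = 4096$ and rank $r = 8$, a full update of $\matr{W}$ would train $4096 \cdot 4096 \approx 16.8\text{M}$ parameters, while LoRA only trains $4096 \cdot 8 + 8 \cdot 4096 = 65\,536$ parameters for the same matrix.
\end{example}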
\end{description}
\section{Model compression}
\subsection{Parameters compression}
\begin{description}
\item[Parameter sharing] \marginnote{Parameter sharing}
Reuse the same parameters across multiple layers.
\item[Pruning] \marginnote{Pruning}
Remove weights with small impact on the loss.
\begin{remark}
Dropping individual weights produces sparse matrices that are not well optimized for parallel hardware. Therefore, this approach does not always improve efficiency in practice.
\end{remark}
\item[Quantization] \marginnote{Quantization}
Store weights and perform operations using lower-precision floating-point formats (e.g., from FP32 down to FP4).
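\begin{example}
As an indicative calculation, a model with $7$ billion parameters requires about $28$ GB of memory in FP32 ($4$ bytes per parameter), but only about $3.5$ GB when quantized to $4$ bits ($0.5$ bytes per parameter), ignoring activations and quantization metadata.
\end{example}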
\end{description}
\subsection{Training compression}
\begin{description}
\item[Mixture of experts] \marginnote{Mixture of experts}
Specialize smaller expert models on subsets of the data and train a router to forward each input to the appropriate expert (a common gating formulation is given below).
\begin{remark}
This approach can be easily deployed on distributed systems.
\end{remark}
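\begin{remark}
In a common formulation, given experts $e_1, \dots, e_N$ and a router producing gating scores $g_1(\vec{x}), \dots, g_N(\vec{x})$ (e.g., a softmax over the experts, often sparsified to keep only the top-$k$ scores), the output of the layer is the gated combination:
\[ \vec{y} = \sum_{i=1}^{N} g_i(\vec{x}) \, e_i(\vec{x}) \]
\end{remark}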
\item[Knowledge distillation] \marginnote{Knowledge distillation}
Train a smaller student model to emulate a larger teacher model (e.g., by matching its hidden states). In the general setting, the output distribution of the teacher is used to train the student. Two losses are used:
\begin{descriptionlist}
\item[Distillation loss]
Matches the output distribution of the student to that of the teacher. A softmax with a higher temperature is usually used so that the training signal does not come only from the highest-probability class (see the formulation after the figure).
\item[Student loss]
Matches the output distribution of the student with the ground truth (i.e., the same loss as the original training task).
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_distillation.pdf}
\end{figure}
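\begin{remark}
A common formulation softens both output distributions with a temperature $T$ applied to the logits $z_i$:
\[ p_i^{(T)} = \frac{\exp(z_i / T)}{\sum_j \exp(z_j / T)} \]
and combines the two losses with a weighting hyperparameter $\alpha$:
\[ \mathcal{L} = \alpha \, \mathcal{L}_\text{distillation}\left( p_\text{teacher}^{(T)}, p_\text{student}^{(T)} \right) + (1 - \alpha) \, \mathcal{L}_\text{student}\left( y, p_\text{student}^{(1)} \right) \]
where $y$ is the ground truth and $p^{(1)}$ denotes the distribution at the standard temperature $T = 1$. The distillation term is often additionally scaled by $T^2$ to balance gradient magnitudes.
\end{remark}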
\item[Vocabulary transfer] \marginnote{Vocabulary transfer}
Use a domain-specific tokenizer to reduce the number of tokens needed to represent complex or domain-specific words and to reduce the size of the embedding matrix.
\begin{description}
\item[Fast vocabulary transfer (FVT)] \marginnote{Fast vocabulary transfer (FVT)}
Given:
\begin{itemize}
\item A starting embedding model with tokenizer $\mathcal{T}_\text{s}$, vocabulary $V_\text{s}$, and embedding matrix $\matr{E}_\text{s}$,
\item A new tokenizer $\mathcal{T}_\text{dom}$ trained on a domain-specific corpus,
\end{itemize}
The embedding matrix $\matr{E}_\text{dom}$ for the vocabulary $V_\text{dom}$ of $\mathcal{T}_\text{dom}$ is built as follows:
\[
\forall t_i \in V_\text{dom}: \matr{E}_\text{dom}(t_i) = \frac{1}{|\mathcal{T}_\text{s}(t_i)|} \sum_{t_j \in \mathcal{T}_\text{s}(t_i)}\matr{E}_\text{s}(t_j)
\]
In other words, each token in $V_\text{dom}$ is encoded as the average of the embeddings of the tokens into which the starting tokenizer splits it (if a token appears in both vocabularies, its embedding is kept unchanged).
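\begin{example}
As a hypothetical illustration (assuming a WordPiece-style starting tokenizer), suppose the domain vocabulary contains the single token \texttt{oncology} while the starting tokenizer splits it as $\mathcal{T}_\text{s}(\texttt{oncology}) = [\texttt{onc}, \texttt{\#\#ology}]$. Then:
\[ \matr{E}_\text{dom}(\texttt{oncology}) = \frac{1}{2} \left( \matr{E}_\text{s}(\texttt{onc}) + \matr{E}_\text{s}(\texttt{\#\#ology}) \right) \]
\end{example}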
\end{description}
\end{description}