mirror of https://github.com/NotXia/unibo-ai-notes.git
Add NLP encoder-decoder + distillation
BIN  src/year2/natural-language-processing/img/_distillation.pdf (new file, binary file not shown)
BIN  src/year2/natural-language-processing/img/_encoder_decoder.pdf (new file, binary file not shown)
@@ -15,6 +15,6 @@
\include{./sections/_rnn.tex}
\include{./sections/_attention.tex}
\include{./sections/_llm.tex}
\include{./sections/_mlm.tex}
\include{./sections/_model_efficiency.tex}

\end{document}
@@ -1,6 +1,9 @@
\chapter{Large language models}



\section{Decoder-only architecture} \label{sec:llm}

\begin{description}
\item[Conditional generation] \marginnote{Conditional generation}
Generate text conditioned on the input tokens (i.e., the prompt).
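
\begin{remark}
A sketch of the formal objective (notation introduced here for illustration): given a prompt $w_1, \dots, w_k$, the model generates the continuation $w_{k+1}, \dots, w_n$ autoregressively according to
\[ p(w_{k+1}, \dots, w_n \mid w_1, \dots, w_k) = \prod_{t=k+1}^{n} p(w_t \mid w_1, \dots, w_{t-1}) \]
where each factor is the next-token distribution produced by the decoder.
\end{remark}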
@@ -30,7 +33,7 @@
\end{description}


\section{Decoding strategies}
\subsection{Decoding strategies}

\begin{description}
\item[Greedy decoding] \marginnote{Greedy decoding}
@@ -42,12 +45,17 @@
\indenttbox
\begin{example}
Consider the following search tree of possible generated sequences:
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}

Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[b]{0.6\linewidth}
Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\end{minipage}
\end{example}
\end{remark}

@@ -114,10 +122,6 @@
\end{description}



\section{Training}


\subsection{Pre-training}

\begin{description}
@@ -168,20 +172,7 @@

\begin{description}
\item[Parameter-efficient fine-tuning (PEFT)] \marginnote{Parameter-efficient fine-tuning (PEFT)}
Continue training a selected subset of parameters.

\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.

Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$. LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]

\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/_lora.pdf}
\end{figure}
\end{description}
Continue training a selected subset of parameters (e.g., LoRA, \Cref{sec:lora}).

\item[Task-specific fine-tuning] \marginnote{Task-specific fine-tuning}
Add a new trainable head on top of the model.
@@ -190,4 +181,183 @@
\item[Supervised fine-tuning] \marginnote{Supervised fine-tuning}
Continue training using a supervised dataset to align the model with human expectations.
\end{description}
\end{description}



\section{Encoder-only architecture} \label{sec:mlm}

\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.

\begin{remark}
This architecture performs feature extraction and is better suited for classification tasks.
\end{remark}

\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.

\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}

\item[Contextual embedding] \marginnote{Contextual embedding}
Embedding that represents the meaning of a word instance (i.e., it changes dynamically depending on the surrounding context).

\begin{remark}[Sequence embedding]
Encoders usually have a classifier token (e.g., \texttt{[CLS]}) to model the whole sentence.
\end{remark}

\begin{example}[Word sense disambiguation]
Task of determining the sense of each word in a sequence. Senses usually come from an existing ontology (e.g., WordNet). An approach to solving the problem is the following:
\begin{enumerate}
\item Compute the embedding $\vec{v}_i$ of words using a pre-trained encoder (e.g., BERT).
\item Represent the embedding of a sense as the average of the tokens of that sense:
\[ \vec{v}_s = \frac{1}{n} \sum_i \vec{v}_i \]
\item Predict the sense of a word $\vec{t}$ as:
\[ \arg\min_{s \in \texttt{senses}(\vec{t})} \texttt{distance}(\vec{t}, \vec{v}_s) \]
\end{enumerate}
\end{example}
\end{description}

\begin{description}
\item[Tokenizer fertility] \marginnote{Tokenizer fertility}
Average number of tokens needed to represent a word.
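
\begin{example}
Purely illustrative (the tokenization is made up): if a tokenizer splits \texttt{tokenization} into \texttt{[token, \#\#ization]} and leaves \texttt{the} whole, its fertility over these two words is $\frac{2 + 1}{2} = 1.5$.
\end{example}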

\begin{remark}
Tokenizer fertility is relevant for inference speed.
\end{remark}

\item[Curse of multilinguality] \marginnote{Curse of multilinguality}
The per-language performance of a multilingual model tends to be worse than that of its monolingual counterpart.
\end{description}


\subsection{Pre-training}

\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Task of predicting missing or corrupted tokens in a sequence.

\begin{remark}
Transformer encoders output embeddings. For training purposes, a head that outputs a distribution over the vocabulary is added.
\end{remark}

\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and, for each sampled token, either:
\begin{itemize}
\item Masking it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replacing it with a different token ($10\%$ of the time).
\item Leaving it unchanged ($10\%$ of the time).
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}

\indenttbox
\begin{remark}
BERT's training approach is inefficient as the masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) dynamically determine the masks at training time, allowing for more variety.
\end{remark}
\end{example}

\item[Span masking] \marginnote{Span masking}
Mask contiguous spans of words to obtain a harder training objective.

\begin{remark}
This approach generally produces better embeddings.
\end{remark}
\end{description}


\subsection{Fine-tuning}

\begin{description}
\item[Fine-tuning for classification]
Add a classification head on top of the classifier token.

\item[Fine-tuning for sequence-pair classification]
Use a model pre-trained to process pairs of sequences. This is usually done by means of a special separator token (e.g., \texttt{[SEP]} in BERT).

\item[Fine-tuning for sequence labeling]
Add a classification head on top of each token. A conditional random field (CRF) layer can also be added to produce globally more coherent tags.

\begin{description}
\item[Named entity recognition (NER)] \marginnote{Named entity recognition (NER)}
Task of assigning to each word of a sequence its entity class. NER taggers usually also capture concepts spanning multiple tokens. To achieve this, additional information is provided with the entity class (see the example after this list):
\begin{descriptionlist}
\item[Begin] Starting token of a concept.
\item[Inside] Token belonging to the same span as the previous one.
\item[End] Last token of a span.
\item[Outside] Token outside the scope of the tagger.
\end{descriptionlist}
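
\begin{example}
One possible tagging of the sentence ``Marie Curie was born in Warsaw'' (class names are illustrative and conventions for single-token entities vary across tag sets):
\begin{center}
\texttt{Marie}/Begin-Person \quad \texttt{Curie}/End-Person \quad \texttt{was}/Outside \quad \texttt{born}/Outside \quad \texttt{in}/Outside \quad \texttt{Warsaw}/Begin-Location
\end{center}
\end{example}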

\begin{description}
\item[Metrics] \phantom{}
\begin{description}
\item[Recall] $\frac{\text{Correctly labeled responses}}{\text{Total that should have been labeled}}$
\item[Precision] $\frac{\text{Correctly labeled responses}}{\text{Total that has been labeled}}$
\end{description}
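
\begin{example}
Illustrative numbers (chosen only for the sake of the computation): if a tagger predicts $8$ entities, of which $6$ are correct, and the gold annotation contains $10$ entities, then precision is $\frac{6}{8} = 0.75$ and recall is $\frac{6}{10} = 0.6$.
\end{example}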

\begin{remark}
The entity (i.e., possibly a whole span of text) is the atomic unit for NER metrics.
\end{remark}
\end{description}
\end{description}
\end{description}


\begin{remark}[GLUE]
The General Language Understanding Evaluation (GLUE) benchmark is a common set of tasks used to evaluate natural language understanding models. It comprises tasks based on single sentences, multiple sentences, and inference from a sequence.
\end{remark}



\section{Encoder-decoder architecture}

\begin{description}
\item[Encoder-decoder architecture] \marginnote{Encoder-decoder architecture}
Model with both an encoder and a decoder:
\begin{descriptionlist}
\item[Encoder]
Architecture as presented in \Cref{sec:mlm}. Its output is used to condition the output of the decoder.
\item[Decoder]
Architecture similar to the one presented in \Cref{sec:llm}, with an additional cross-attention layer inserted after the causal self-attention (a sketch in matrix notation is given after the figure).

\begin{description}
\item[Cross-attention] \marginnote{Cross-attention}
Attention layer that uses the output of the encoder as keys and values, while the queries come from the decoder.
\end{description}
\end{descriptionlist}

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_encoder_decoder.pdf}
\end{figure}
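
\begin{remark}
As a sketch in matrix notation (symbols introduced here for illustration): let $\matr{H}_\text{enc}$ be the encoder outputs and $\matr{H}_\text{dec}$ the decoder hidden states. Cross-attention computes
\[ \texttt{softmax}\left( \frac{\matr{Q}\matr{K}^\top}{\sqrt{d_k}} \right) \matr{V} \qquad \text{with } \matr{Q} = \matr{H}_\text{dec}\matr{W}^Q,\ \matr{K} = \matr{H}_\text{enc}\matr{W}^K,\ \matr{V} = \matr{H}_\text{enc}\matr{W}^V \]
\end{remark}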
\end{description}


\subsection{Pre-training}

\begin{description}
\item[Span corruption] \marginnote{Span corruption}
Given an input sequence, replace spans of text of different lengths, each with a unique placeholder. The encoder takes as input the corrupted sequence, while the decoder has to predict the missing words.

\begin{remark}
It has been observed that targeted span masking works better than random span masking.
\end{remark}

\begin{example}
Given the sequence:
\[ \texttt{<bos> thank you \underline{for inviting} me to your party \underline{last} week <eos>} \]
Some spans of text are masked with placeholder tokens as follows:
\[ \texttt{<bos> thank you <X> me to your party <Y> week <eos>} \]
The masked sequence is passed through the encoder, while the decoder has to predict the masked tokens:
\[ \texttt{<bos> <X> for inviting <Y> last <Z> <eos>} \]
\end{example}
\end{description}
@@ -1,55 +0,0 @@
\chapter{Masked language models}



\section{Bidirectional transformer encoder}

\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.

\begin{remark}
This architecture does feature extraction and is more suited for classification tasks.
\end{remark}

\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.

\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}
\end{description}


\subsection{Masked language modelling}

\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Main training task of transformer encoders. It consists of predicting missing or corrupted tokens in a sequence.

\begin{remark}
Transformer encoders output embeddings. For training purposes, a head to output a distribution over the vocabulary is added.
\end{remark}

\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and either:
\begin{itemize}
\item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replace it with a different token ($10\%$ of the time).
\item Do nothing ($10\%$ of the time).
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}

\indenttbox
\begin{remark}
BERT's training approach is inefficient as masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa), dynamically determine the mask at training time, allowing for more variety.
\end{remark}
\end{example}
\end{description}
@@ -0,0 +1,85 @@
\chapter{Efficient model utilization}



\section{Low-rank adaptation} \label{sec:lora}

\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.

Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$. LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_lora.pdf}
\end{figure}
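
\begin{example}
Illustrative numbers (chosen arbitrarily): for $d = k = 4096$ and $r = 8$, fully updating $\matr{W}$ means training $d \cdot k \approx 16.8$ million parameters, while LoRA only trains $d \cdot r + r \cdot k = 65\,536$ parameters ($\approx 0.4\%$ of the original amount).
\end{example}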
\end{description}



\section{Model compression}


\subsection{Parameter compression}

\begin{description}
\item[Parameter sharing] \marginnote{Parameter sharing}
Use the same parameters across layers.

\item[Pruning] \marginnote{Pruning}
Remove weights with small impact on the loss.
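
\begin{remark}
A simple and common criterion (one possible choice, not the only one) is magnitude pruning: weights with absolute value below a threshold are assumed to have little impact on the loss and are set to zero.
\end{remark>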

\begin{remark}
Dropping some weights produces sparse matrices, which are not well optimized for parallel hardware. Therefore, this approach does not always improve efficiency.
\end{remark}

\item[Quantization] \marginnote{Quantization}
Store and perform operations with lower-precision floating-point formats (e.g., from FP32 to FP4).
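
\begin{example}
Rough illustrative estimate (ignoring activations and other overhead): a model with $7$ billion parameters takes about $7 \cdot 4 = 28$ GB of memory in FP32, about $14$ GB in FP16, and about $3.5$ GB with $4$-bit quantization.
\end{example}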
\end{description}


\subsection{Training compression}

\begin{description}
\item[Mixture of experts] \marginnote{Mixture of experts}
Specialize smaller models on subsets of the data and train a router that forwards each input to the correct expert.
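
\begin{remark}
A sketch of a common formulation (notation introduced here for illustration): with experts $E_1, \dots, E_n$ and a router producing gating scores $\vec{g}(\vec{x}) = \texttt{softmax}(\matr{W}_\text{router} \vec{x})$, the output is computed as
\[ \vec{y} = \sum_{i \in \texttt{top-}k(\vec{g}(\vec{x}))} g_i(\vec{x}) \cdot E_i(\vec{x}) \]
so that only the $k$ selected experts are evaluated for each input.
\end{remark}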

\begin{remark}
This approach can be easily deployed on distributed systems.
\end{remark}

\item[Knowledge distillation] \marginnote{Knowledge distillation}
Train a student model to emulate a larger teacher model, for instance by matching its hidden states. In the more general setting, the output distribution of the teacher is used to train the student. Two losses are used (a common way to combine them is sketched after the list):
\begin{descriptionlist}
\item[Distillation loss]
Matches the output distribution of the student to that of the teacher. A softmax with a higher temperature is usually used so that the training signal does not come only from the highest-probability class.

\item[Student loss]
Matches the output distribution of the student with the ground truth (i.e., the same loss as the training task).
\end{descriptionlist}
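
\begin{remark}
A common combined objective (the hyperparameters $\alpha$ and temperature $T$ are assumed here and are not fixed by these notes):
\[ \mathcal{L} = \alpha \cdot \mathcal{L}_\text{distillation} + (1 - \alpha) \cdot \mathcal{L}_\text{student} \]
where $\mathcal{L}_\text{distillation}$ compares the temperature-scaled distributions $\texttt{softmax}(\vec{z}_\text{teacher} / T)$ and $\texttt{softmax}(\vec{z}_\text{student} / T)$ of the two models (e.g., with cross-entropy or KL divergence), and $\mathcal{L}_\text{student}$ is the usual task loss on the ground-truth labels.
\end{remark}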

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_distillation.pdf}
\end{figure}

\item[Vocabulary transfer] \marginnote{Vocabulary transfer}
Use a domain-specific tokenizer to reduce the number of tokens needed to represent complex or domain-specific words and to reduce the size of the embedding matrix.

\begin{description}
\item[Fast vocabulary transfer (FVT)] \marginnote{Fast vocabulary transfer (FVT)}
Given:
\begin{itemize}
\item A starting embedding model with tokenizer $\mathcal{T}_\text{s}$, vocabulary $V_\text{s}$, and embedding matrix $\matr{E}_\text{s}$,
\item A new tokenizer $\mathcal{T}_\text{dom}$ trained on a domain-specific corpus,
\end{itemize}
The embedding matrix $\matr{E}_\text{dom}$ for the vocabulary $V_\text{dom}$ of $\mathcal{T}_\text{dom}$ is built as follows:
\[
\forall t_i \in V_\text{dom}: \matr{E}_\text{dom}(t_i) = \frac{1}{|\mathcal{T}_\text{s}(t_i)|} \sum_{t_j \in \mathcal{T}_\text{s}(t_i)} \matr{E}_\text{s}(t_j)
\]
In other words, each token in $V_\text{dom}$ is encoded as the average of the embeddings of the tokens into which the starting tokenizer splits it (if a token appears in both vocabularies, its embedding stays the same).
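
\begin{example}
Purely illustrative (the split is hypothetical): if the domain token \texttt{cardiomyopathy} is split by $\mathcal{T}_\text{s}$ into \texttt{[card, \#\#iomy, \#\#opathy]}, then
\[ \matr{E}_\text{dom}(\texttt{cardiomyopathy}) = \frac{1}{3} \left( \matr{E}_\text{s}(\texttt{card}) + \matr{E}_\text{s}(\texttt{\#\#iomy}) + \matr{E}_\text{s}(\texttt{\#\#opathy}) \right) \]
\end{example}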
\end{description}

\end{description}