mirror of https://github.com/NotXia/unibo-ai-notes.git
Add NLP encoder-decoder + distillation
BIN  src/year2/natural-language-processing/img/_distillation.pdf (new file, binary file not shown)
BIN  src/year2/natural-language-processing/img/_encoder_decoder.pdf (new file, binary file not shown)
@@ -15,6 +15,6 @@
\include{./sections/_rnn.tex}
\include{./sections/_attention.tex}
\include{./sections/_llm.tex}
\include{./sections/_mlm.tex}
\include{./sections/_model_efficiency.tex}

\end{document}
@@ -1,6 +1,9 @@
\chapter{Large language models}



\section{Decoder-only architecture} \label{sec:llm}

\begin{description}
\item[Conditional generation] \marginnote{Conditional generation}
Generate text conditioned on the input tokens (i.e., the prompt).
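
\begin{remark}
A sketch of the formal objective (notation introduced here for illustration): given a prompt $w_1, \dots, w_k$, the model generates the continuation $w_{k+1}, \dots, w_n$ autoregressively according to
\[ p(w_{k+1}, \dots, w_n \mid w_1, \dots, w_k) = \prod_{t=k+1}^{n} p(w_t \mid w_1, \dots, w_{t-1}) \]
where each factor is the next-token distribution produced by the decoder.
\end{remark}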
@@ -30,7 +33,7 @@
\end{description}


\section{Decoding strategies}
\subsection{Decoding strategies}

\begin{description}
\item[Greedy decoding] \marginnote{Greedy decoding}
@@ -42,12 +45,17 @@
\indenttbox
\begin{example}
Consider the following search tree of possible generated sequences:
\begin{figure}[H]
\centering
\includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}

Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\begin{minipage}{0.35\linewidth}
\begin{figure}[H]
\centering
\includegraphics[width=\linewidth]{./img/_greedy_decoding_local_minimum.pdf}
\end{figure}
\end{minipage}
\hfill
\begin{minipage}[b]{0.6\linewidth}
Greedy search would select the sequence \texttt{yes yes}, which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$.
\end{minipage}
\end{example}
\end{remark}

@@ -114,10 +122,6 @@
\end{description}



\section{Training}


\subsection{Pre-training}

\begin{description}
@@ -168,20 +172,7 @@

\begin{description}
\item[Parameter-efficient fine-tuning (PEFT)] \marginnote{Parameter-efficient fine-tuning (PEFT)}
Continue training a selected subset of parameters.

\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.

Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$. LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]

\begin{figure}[H]
\centering
\includegraphics[width=0.35\linewidth]{./img/_lora.pdf}
\end{figure}
\end{description}
Continue training a selected subset of parameters (e.g., LoRA, \Cref{sec:lora}).

\item[Task-specific fine-tuning] \marginnote{Task-specific fine-tuning}
Add a new trainable head on top of the model.
@@ -190,4 +181,183 @@
\item[Supervised fine-tuning] \marginnote{Supervised fine-tuning}
Continue training using a supervised dataset to align the model with human expectations.
\end{description}
\end{description}



\section{Encoder-only architecture} \label{sec:mlm}

\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.

\begin{remark}
This architecture performs feature extraction and is better suited for classification tasks.
\end{remark}

\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.

\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}

\item[Contextual embedding] \marginnote{Contextual embedding}
Embedding that represents the meaning of a word instance (i.e., it changes dynamically depending on the surrounding context).

\begin{remark}[Sequence embedding]
Encoders usually have a classifier token (e.g., \texttt{[CLS]}) to model the whole sentence.
\end{remark}

\begin{example}[Word sense disambiguation]
Task of determining the sense of each word in a sequence. Senses usually come from an existing ontology (e.g., WordNet). An approach to solving the problem is the following:
\begin{enumerate}
\item Compute the embedding $\vec{v}_i$ of words using a pre-trained encoder (e.g., BERT).
\item Represent the embedding of a sense as the average of the tokens of that sense:
\[ \vec{v}_s = \frac{1}{n} \sum_i \vec{v}_i \]
\item Predict the sense of a word $\vec{t}$ as:
\[ \arg\min_{s \in \texttt{senses}(\vec{t})} \texttt{distance}(\vec{t}, \vec{v}_s) \]
\end{enumerate}
\end{example}
\end{description}

\begin{description}
\item[Tokenizer fertility] \marginnote{Tokenizer fertility}
Average number of tokens needed to represent a word.
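
\begin{example}
Purely illustrative (the tokenization is made up): if a tokenizer splits \texttt{tokenization} into \texttt{[token, \#\#ization]} and leaves \texttt{the} whole, its fertility over these two words is $\frac{2 + 1}{2} = 1.5$.
\end{example}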

\begin{remark}
Tokenizer fertility is relevant for inference speed.
\end{remark}

\item[Curse of multilinguality] \marginnote{Curse of multilinguality}
The per-language performance of a multilingual model tends to be worse than that of its monolingual counterpart.
\end{description}


\subsection{Pre-training}

\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Task of predicting missing or corrupted tokens in a sequence.

\begin{remark}
Transformer encoders output embeddings. For training purposes, a head that outputs a distribution over the vocabulary is added.
\end{remark}

\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and, for each sampled token, either:
\begin{itemize}
\item Masking it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replacing it with a different token ($10\%$ of the time).
\item Leaving it unchanged ($10\%$ of the time).
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}

\indenttbox
\begin{remark}
BERT's training approach is inefficient as the masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) dynamically determine the masks at training time, allowing for more variety.
\end{remark}
\end{example}

\item[Span masking] \marginnote{Span masking}
Mask contiguous spans of words to obtain a harder training objective.

\begin{remark}
This approach generally produces better embeddings.
\end{remark}
\end{description}


\subsection{Fine-tuning}

\begin{description}
\item[Fine-tuning for classification]
Add a classification head on top of the classifier token.

\item[Fine-tuning for sequence-pair classification]
Use a model pre-trained to process pairs of sequences. This is usually done by means of a special separator token (e.g., \texttt{[SEP]} in BERT).

\item[Fine-tuning for sequence labeling]
Add a classification head on top of each token. A conditional random field (CRF) layer can also be added to produce globally more coherent tags.

\begin{description}
\item[Named entity recognition (NER)] \marginnote{Named entity recognition (NER)}
Task of assigning to each word of a sequence its entity class. NER taggers usually also capture concepts spanning multiple tokens. To achieve this, additional information is provided with the entity class (see the example after this list):
\begin{descriptionlist}
\item[Begin] Starting token of a concept.
\item[Inside] Token belonging to the same span as the previous one.
\item[End] Last token of a span.
\item[Outside] Token outside the scope of the tagger.
\end{descriptionlist}
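
\begin{example}
One possible tagging of the sentence ``Marie Curie was born in Warsaw'' (class names are illustrative and conventions for single-token entities vary across tag sets):
\begin{center}
\texttt{Marie}/Begin-Person \quad \texttt{Curie}/End-Person \quad \texttt{was}/Outside \quad \texttt{born}/Outside \quad \texttt{in}/Outside \quad \texttt{Warsaw}/Begin-Location
\end{center}
\end{example}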

\begin{description}
\item[Metrics] \phantom{}
\begin{description}
\item[Recall] $\frac{\text{Correctly labeled responses}}{\text{Total that should have been labeled}}$
\item[Precision] $\frac{\text{Correctly labeled responses}}{\text{Total that has been labeled}}$
\end{description}
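
\begin{example}
Illustrative numbers (chosen only for the sake of the computation): if a tagger predicts $8$ entities, of which $6$ are correct, and the gold annotation contains $10$ entities, then precision is $\frac{6}{8} = 0.75$ and recall is $\frac{6}{10} = 0.6$.
\end{example}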

\begin{remark}
The entity (i.e., possibly a whole span of text) is the atomic unit for NER metrics.
\end{remark}
\end{description}
\end{description}
\end{description}


\begin{remark}[GLUE]
The General Language Understanding Evaluation (GLUE) benchmark is a common set of tasks used to evaluate natural language understanding models. It comprises tasks based on single sentences, multiple sentences, and inference from a sequence.
\end{remark}



\section{Encoder-decoder architecture}

\begin{description}
\item[Encoder-decoder architecture] \marginnote{Encoder-decoder architecture}
Model with both an encoder and a decoder:
\begin{descriptionlist}
\item[Encoder]
Architecture as presented in \Cref{sec:mlm}. Its output is used to condition the output of the decoder.
\item[Decoder]
Architecture similar to the one presented in \Cref{sec:llm}, with an additional cross-attention layer inserted after the causal self-attention (a sketch in matrix notation is given after the figure).

\begin{description}
\item[Cross-attention] \marginnote{Cross-attention}
Attention layer that uses the output of the encoder as keys and values, while the queries come from the decoder.
\end{description}
\end{descriptionlist}

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_encoder_decoder.pdf}
\end{figure}
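
\begin{remark}
As a sketch in matrix notation (symbols introduced here for illustration): let $\matr{H}_\text{enc}$ be the encoder outputs and $\matr{H}_\text{dec}$ the decoder hidden states. Cross-attention computes
\[ \texttt{softmax}\left( \frac{\matr{Q}\matr{K}^\top}{\sqrt{d_k}} \right) \matr{V} \qquad \text{with } \matr{Q} = \matr{H}_\text{dec}\matr{W}^Q,\ \matr{K} = \matr{H}_\text{enc}\matr{W}^K,\ \matr{V} = \matr{H}_\text{enc}\matr{W}^V \]
\end{remark}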
\end{description}


\subsection{Pre-training}

\begin{description}
\item[Span corruption] \marginnote{Span corruption}
Given an input sequence, replace spans of text of different lengths, each with a unique placeholder. The encoder takes as input the corrupted sequence, while the decoder has to predict the missing words.

\begin{remark}
It has been observed that targeted span masking works better than random span masking.
\end{remark}

\begin{example}
Given the sequence:
\[ \texttt{<bos> thank you \underline{for inviting} me to your party \underline{last} week <eos>} \]
Some spans of text are masked with placeholder tokens as follows:
\[ \texttt{<bos> thank you <X> me to your party <Y> week <eos>} \]
The masked sequence is passed through the encoder, while the decoder has to predict the masked tokens:
\[ \texttt{<bos> <X> for inviting <Y> last <Z> <eos>} \]
\end{example}
\end{description}
@@ -1,55 +0,0 @@
\chapter{Masked language models}



\section{Bidirectional transformer encoder}

\begin{description}
\item[Transformer encoder] \marginnote{Transformer encoder}
Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.

\begin{remark}
This architecture does feature extraction and is more suited for classification tasks.
\end{remark}

\begin{description}
\item[Architecture]
Similar to a transformer decoder, but self-attention is not causal.

\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
\end{figure}
\end{description}
\end{description}


\subsection{Masked language modelling}

\begin{description}
\item[Masked language modelling] \marginnote{Masked language modelling}
Main training task of transformer encoders. It consists of predicting missing or corrupted tokens in a sequence.

\begin{remark}
Transformer encoders output embeddings. For training purposes, a head to output a distribution over the vocabulary is added.
\end{remark}

\begin{example}
Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and either:
\begin{itemize}
\item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
\item Replace it with a different token ($10\%$ of the time).
\item Do nothing ($10\%$ of the time).
\end{itemize}

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
\end{figure}

\indenttbox
\begin{remark}
BERT's training approach is inefficient as masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa), dynamically determine the mask at training time, allowing for more variety.
\end{remark}
\end{example}
\end{description}
@@ -0,0 +1,85 @@
\chapter{Efficient model utilization}



\section{Low-rank adaptation} \label{sec:lora}

\begin{description}
\item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
Method to update weights by learning an offset that uses fewer parameters.

Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$. LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
\[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]

\begin{figure}[H]
\centering
\includegraphics[width=0.4\linewidth]{./img/_lora.pdf}
\end{figure}
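
\begin{example}
Illustrative numbers (chosen arbitrarily): for $d = k = 4096$ and $r = 8$, fully updating $\matr{W}$ means training $d \cdot k \approx 16.8$ million parameters, while LoRA only trains $d \cdot r + r \cdot k = 65\,536$ parameters ($\approx 0.4\%$ of the original amount).
\end{example}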
\end{description}



\section{Model compression}


\subsection{Parameter compression}

\begin{description}
\item[Parameter sharing] \marginnote{Parameter sharing}
Use the same parameters across layers.

\item[Pruning] \marginnote{Pruning}
Remove weights with small impact on the loss.
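
\begin{remark}
A simple and common criterion (one possible choice, not the only one) is magnitude pruning: weights with absolute value below a threshold are assumed to have little impact on the loss and are set to zero.
\end{remark>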

\begin{remark}
Dropping some weights produces sparse matrices, which are not well optimized for parallel hardware. Therefore, this approach does not always improve efficiency.
\end{remark}

\item[Quantization] \marginnote{Quantization}
Store and perform operations with lower-precision floating-point formats (e.g., from FP32 to FP4).
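
\begin{example}
Rough illustrative estimate (ignoring activations and other overhead): a model with $7$ billion parameters takes about $7 \cdot 4 = 28$ GB of memory in FP32, about $14$ GB in FP16, and about $3.5$ GB with $4$-bit quantization.
\end{example}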
\end{description}


\subsection{Training compression}

\begin{description}
\item[Mixture of experts] \marginnote{Mixture of experts}
Specialize smaller models on subsets of the data and train a router that forwards each input to the correct expert.
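
\begin{remark}
A sketch of a common formulation (notation introduced here for illustration): with experts $E_1, \dots, E_n$ and a router producing gating scores $\vec{g}(\vec{x}) = \texttt{softmax}(\matr{W}_\text{router} \vec{x})$, the output is computed as
\[ \vec{y} = \sum_{i \in \texttt{top-}k(\vec{g}(\vec{x}))} g_i(\vec{x}) \cdot E_i(\vec{x}) \]
so that only the $k$ selected experts are evaluated for each input.
\end{remark}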

\begin{remark}
This approach can be easily deployed on distributed systems.
\end{remark}

\item[Knowledge distillation] \marginnote{Knowledge distillation}
Train a student model to emulate a larger teacher model, for instance by matching its hidden states. In the more general setting, the output distribution of the teacher is used to train the student. Two losses are used (a common way to combine them is sketched after the list):
\begin{descriptionlist}
\item[Distillation loss]
Matches the output distribution of the student to that of the teacher. A softmax with a higher temperature is usually used so that the training signal does not come only from the highest-probability class.

\item[Student loss]
Matches the output distribution of the student with the ground truth (i.e., the same loss as the training task).
\end{descriptionlist}
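
\begin{remark}
A common combined objective (the hyperparameters $\alpha$ and temperature $T$ are assumed here and are not fixed by these notes):
\[ \mathcal{L} = \alpha \cdot \mathcal{L}_\text{distillation} + (1 - \alpha) \cdot \mathcal{L}_\text{student} \]
where $\mathcal{L}_\text{distillation}$ compares the temperature-scaled distributions $\texttt{softmax}(\vec{z}_\text{teacher} / T)$ and $\texttt{softmax}(\vec{z}_\text{student} / T)$ of the two models (e.g., with cross-entropy or KL divergence), and $\mathcal{L}_\text{student}$ is the usual task loss on the ground-truth labels.
\end{remark}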

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_distillation.pdf}
\end{figure}

\item[Vocabulary transfer] \marginnote{Vocabulary transfer}
Use a domain-specific tokenizer to reduce the number of tokens needed to represent complex or domain-specific words and to reduce the size of the embedding matrix.

\begin{description}
\item[Fast vocabulary transfer (FVT)] \marginnote{Fast vocabulary transfer (FVT)}
Given:
\begin{itemize}
\item A starting embedding model with tokenizer $\mathcal{T}_\text{s}$, vocabulary $V_\text{s}$, and embedding matrix $\matr{E}_\text{s}$,
\item A new tokenizer $\mathcal{T}_\text{dom}$ trained on a domain-specific corpus,
\end{itemize}
The embedding matrix $\matr{E}_\text{dom}$ for the vocabulary $V_\text{dom}$ of $\mathcal{T}_\text{dom}$ is built as follows:
\[
\forall t_i \in V_\text{dom}: \matr{E}_\text{dom}(t_i) = \frac{1}{|\mathcal{T}_\text{s}(t_i)|} \sum_{t_j \in \mathcal{T}_\text{s}(t_i)} \matr{E}_\text{s}(t_j)
\]
In other words, each token in $V_\text{dom}$ is encoded as the average of the embeddings of the tokens into which the starting tokenizer splits it (if a token appears in both vocabularies, its embedding stays the same).
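
\begin{example}
Purely illustrative (the split is hypothetical): if the domain token \texttt{cardiomyopathy} is split by $\mathcal{T}_\text{s}$ into \texttt{[card, \#\#iomy, \#\#opathy]}, then
\[ \matr{E}_\text{dom}(\texttt{cardiomyopathy}) = \frac{1}{3} \left( \matr{E}_\text{s}(\texttt{card}) + \matr{E}_\text{s}(\texttt{\#\#iomy}) + \matr{E}_\text{s}(\texttt{\#\#opathy}) \right) \]
\end{example}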
\end{description}

\end{description}