diff --git a/src/year2/natural-language-processing/img/_distillation.pdf b/src/year2/natural-language-processing/img/_distillation.pdf new file mode 100644 index 0000000..f8a9870 Binary files /dev/null and b/src/year2/natural-language-processing/img/_distillation.pdf differ diff --git a/src/year2/natural-language-processing/img/_encoder_decoder.pdf b/src/year2/natural-language-processing/img/_encoder_decoder.pdf new file mode 100644 index 0000000..ad1129e Binary files /dev/null and b/src/year2/natural-language-processing/img/_encoder_decoder.pdf differ diff --git a/src/year2/natural-language-processing/nlp.tex b/src/year2/natural-language-processing/nlp.tex index 964230c..45bba82 100644 --- a/src/year2/natural-language-processing/nlp.tex +++ b/src/year2/natural-language-processing/nlp.tex @@ -15,6 +15,6 @@ \include{./sections/_rnn.tex} \include{./sections/_attention.tex} \include{./sections/_llm.tex} - \include{./sections/_mlm.tex} + \include{./sections/_model_efficiency.tex} \end{document} \ No newline at end of file diff --git a/src/year2/natural-language-processing/sections/_enc_dec_models.tex b/src/year2/natural-language-processing/sections/_enc_dec_models.tex new file mode 100644 index 0000000..e69de29 diff --git a/src/year2/natural-language-processing/sections/_llm.tex b/src/year2/natural-language-processing/sections/_llm.tex index b562352..e808475 100644 --- a/src/year2/natural-language-processing/sections/_llm.tex +++ b/src/year2/natural-language-processing/sections/_llm.tex @@ -1,6 +1,9 @@ \chapter{Large language models} + +\section{Decoder-only architecture} \label{sec:llm} + \begin{description} \item[Conditional generation] \marginnote{Conditional generation} Generate text conditioned on the input tokens (i.e., prompt). @@ -30,7 +33,7 @@ \end{description} -\section{Decoding strategies} +\subsection{Decoding strategies} \begin{description} \item[Greedy decoding] \marginnote{Greedy decoding} @@ -42,12 +45,17 @@ \indenttbox \begin{example} Consider the following search tree of possible generated sequences: - \begin{figure}[H] - \centering - \includegraphics[width=0.3\linewidth]{./img/_greedy_decoding_local_minimum.pdf} - \end{figure} - Greedy search would select the sequence \texttt{yes yes} which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$. + \begin{minipage}{0.35\linewidth} + \begin{figure}[H] + \centering + \includegraphics[width=\linewidth]{./img/_greedy_decoding_local_minimum.pdf} + \end{figure} + \end{minipage} + \hfill + \begin{minipage}[b]{0.6\linewidth} + Greedy search would select the sequence \texttt{yes yes} which has probability $0.5 \cdot 0.4 = 0.2$. However, the sequence \texttt{ok ok} has a higher probability of $0.4 \cdot 0.7 = 0.28$. + \end{minipage} \end{example} \end{remark} @@ -114,10 +122,6 @@ \end{description} - -\section{Training} - - \subsection{Pre-training} \begin{description} @@ -168,20 +172,7 @@ \begin{description} \item[Parameter-efficient fine-tuning (PEFT)] \marginnote{Parameter-efficient fine-tuning (PEFT)} - Continue training a selected subset of parameters. - - \begin{description} - \item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)} - Method to update weights by learning an offset that uses fewer parameters. - - Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$, LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). 
Weights update is performed as:
-              \[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]
-
-              \begin{figure}[H]
-                  \centering
-                  \includegraphics[width=0.35\linewidth]{./img/_lora.pdf}
-              \end{figure}
-     \end{description}
+        Continue training a selected subset of parameters (e.g., LoRA; see \Cref{sec:lora}).
 
     \item[Task-specific fine-tuning] \marginnote{Task-specific fine-tuning}
         Add a new trainable head on top of the model.
@@ -190,4 +181,183 @@
     \item[Supervised fine-tuning] \marginnote{Supervised fine-tuning}
         Continue training using a supervised dataset to align the model to human's expectation.
 \end{description}
+\end{description}
+
+
+
+\section{Encoder-only architecture} \label{sec:mlm}
+
+\begin{description}
+    \item[Transformer encoder] \marginnote{Transformer encoder}
+        Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.
+
+        \begin{remark}
+            This architecture does feature extraction and is more suited for classification tasks.
+        \end{remark}
+
+        \begin{description}
+            \item[Architecture]
+                Similar to a transformer decoder, but self-attention is not causal.
+
+                \begin{figure}[H]
+                    \centering
+                    \includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
+                \end{figure}
+        \end{description}
+
+    \item[Contextual embedding] \marginnote{Contextual embedding}
+        Representation of the meaning of a word instance (i.e., computed dynamically depending on the surrounding context).
+
+        \begin{remark}[Sequence embedding]
+            Encoders usually have a classifier token (e.g., \texttt{[CLS]}) to model the whole sentence.
+        \end{remark}
+
+        \begin{example}[Word sense disambiguation]
+            Task of determining the sense of each word of a sequence. Senses usually come from an existing ontology (e.g., WordNet). An approach to solving the problem is the following:
+            \begin{enumerate}
+                \item Compute the embedding $\vec{v}_i$ of each word using a pre-trained encoder (e.g., BERT).
+                \item Represent the embedding of a sense as the average of the embeddings of the tokens labeled with that sense:
+                \[ \vec{v}_s = \frac{1}{n} \sum_i \vec{v}_i \]
+                \item Predict the sense of a word with embedding $\vec{t}$ as the one with the closest sense embedding:
+                \[ \arg\min_{s \in \texttt{senses}(\vec{t})} \texttt{distance}(\vec{t}, \vec{v}_s) \]
+            \end{enumerate}
+        \end{example}
+\end{description}
+
+\begin{description}
+    \item[Tokenizer fertility] \marginnote{Tokenizer fertility}
+        Average number of tokens used to represent a word.
+
+        \begin{remark}
+            Tokenizer fertility is relevant for inference speed.
+        \end{remark}
+
+    \item[Curse of multilinguality] \marginnote{Curse of multilinguality}
+        The performance of a multilingual model on each language tends to be worse than that of its monolingual counterpart.
+\end{description}
+
+
+\subsection{Pre-training}
+
+\begin{description}
+    \item[Masked language modelling] \marginnote{Masked language modelling}
+        Task of predicting missing or corrupted tokens in a sequence.
+
+        \begin{remark}
+            Transformer encoders output embeddings. For training purposes, a head to output a distribution over the vocabulary is added.
+        \end{remark}
+
+        \begin{example}
+            Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data; each sampled token is either:
+            \begin{itemize}
+                \item Masked with a special \texttt{[MASK]} token ($80\%$ of the time).
+                \item Replaced with a different token ($10\%$ of the time).
+                \item Left unchanged ($10\%$ of the time).
+            \end{itemize}
+
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
+            \end{figure}
+
+            \indenttbox
+            \begin{remark}
+                BERT's training approach is inefficient as the masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) dynamically determine the masks at training time, allowing for more variety.
+            \end{remark}
+        \end{example}
+
+    \item[Span masking] \marginnote{Span masking}
+        Mask contiguous spans of words to obtain a harder training objective.
+
+        \begin{remark}
+            This approach generally produces better embeddings.
+        \end{remark}
+\end{description}
+
+
+\subsection{Fine-tuning}
+
+\begin{description}
+    \item[Fine-tuning for classification]
+        Add a classification head on top of the classifier token.
+
+    \item[Fine-tuning for sequence-pair classification]
+        Use a model pre-trained to process pairs of sequences. This is usually done by means of a special separator token (e.g., \texttt{[SEP]} in BERT).
+
+    \item[Fine-tuning for sequence labeling]
+        Add a classification head on top of each token. A conditional random field (CRF) layer can also be added to produce globally more coherent tags.
+
+        \begin{description}
+            \item[Named entity recognition (NER)] \marginnote{Named entity recognition (NER)}
+                Task of assigning to each word of a sequence its entity class. NER taggers usually also capture concepts spanning multiple tokens. To achieve this, additional information is provided with the entity class:
+                \begin{descriptionlist}
+                    \item[Begin] Starting token of a concept.
+                    \item[Inside] Token belonging to the same span as the previous one.
+                    \item[End] Last token of a span.
+                    \item[Outside] Token outside the scope of the tagger.
+                \end{descriptionlist}
+
+                \begin{description}
+                    \item[Metrics] \phantom{}
+                        \begin{description}
+                            \item[Recall] $\frac{\text{Correctly labeled responses}}{\text{Total that should have been labeled}}$
+                            \item[Precision] $\frac{\text{Correctly labeled responses}}{\text{Total that has been labeled}}$
+                        \end{description}
+
+                        \begin{remark}
+                            The entity (i.e., possibly a span of text) is the atomic unit for NER metrics.
+                        \end{remark}
+                \end{description}
+        \end{description}
+\end{description}
+
+
+\begin{remark}[GLUE]
+    The General Language Understanding Evaluation (GLUE) benchmark is a common set of tasks used to evaluate natural language understanding models. It comprises single-sentence tasks, sentence-pair similarity and paraphrase tasks, and natural language inference tasks.
+\end{remark}
+
+
+
+\section{Encoder-decoder architecture}
+
+\begin{description}
+    \item[Encoder-decoder architecture] \marginnote{Encoder-decoder architecture}
+        Model with both an encoder and a decoder:
+        \begin{descriptionlist}
+            \item[Encoder]
+                Architecture as presented in \Cref{sec:mlm}. Its output is used to condition the decoder.
+            \item[Decoder]
+                Architecture similar to the one presented in \Cref{sec:llm}, with an additional cross-attention layer inserted between the causal self-attention layer and the feed-forward layer.
+
+                \begin{description}
+                    \item[Cross-attention] \marginnote{Cross-attention}
+                        Attention layer that uses the output of the encoder as keys and values, while the queries come from the decoder.
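+
+                        \begin{remark}
+                            As a sketch (assuming the standard scaled dot-product formulation, and writing $\matr{H}_\text{enc}$, $\matr{H}_\text{dec}$ for the encoder and decoder hidden states), cross-attention computes:
+                            \[ \matr{Q} = \matr{H}_\text{dec} \matr{W}^Q \qquad \matr{K} = \matr{H}_\text{enc} \matr{W}^K \qquad \matr{V} = \matr{H}_\text{enc} \matr{W}^V \]
+                            \[ \mathrm{CrossAttention}(\matr{Q}, \matr{K}, \matr{V}) = \mathrm{softmax}\left( \frac{\matr{Q} \matr{K}^\top}{\sqrt{d_k}} \right) \matr{V} \]
+                        \end{remark}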
+                \end{description}
+        \end{descriptionlist}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.7\linewidth]{./img/_encoder_decoder.pdf}
+        \end{figure}
+\end{description}
+
+
+\subsection{Pre-training}
+
+\begin{description}
+    \item[Span corruption] \marginnote{Span corruption}
+        Given an input sequence, replace spans of text of different lengths, each with a unique placeholder token. The encoder takes as input the corrupted sequence, while the decoder has to predict the missing words.
+
+        \begin{remark}
+            It has been observed that targeted span masking works better than random span masking.
+        \end{remark}
+
+        \begin{example}
+            Given the sequence:
+            \[ \texttt{ thank you \underline{for inviting} me to your party \underline{last} week } \]
+            Some spans of text are masked with placeholder tokens as follows:
+            \[ \texttt{ thank you <X> me to your party <Y> week } \]
+            The masked sequence is passed through the encoder, while the decoder has to predict the masked tokens:
+            \[ \texttt{ <X> for inviting <Y> last } \]
+        \end{example}
 \end{description}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_mlm.tex b/src/year2/natural-language-processing/sections/_mlm.tex
index faafefe..e69de29 100644
--- a/src/year2/natural-language-processing/sections/_mlm.tex
+++ b/src/year2/natural-language-processing/sections/_mlm.tex
@@ -1,55 +0,0 @@
-\chapter{Masked language models}
-
-
-
-\section{Bidirectional transformer encoder}
-
-\begin{description}
-    \item[Transformer encoder] \marginnote{Transformer encoder}
-        Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context.
-
-        \begin{remark}
-            This architecture does feature extraction and is more suited for classification tasks.
-        \end{remark}
-
-        \begin{description}
-            \item[Architecture]
-                Similar to a transformer decoder, but self-attention is not causal.
-
-                \begin{figure}[H]
-                    \centering
-                    \includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf}
-                \end{figure}
-        \end{description}
-\end{description}
-
-
-\subsection{Masked language modelling}
-
-\begin{description}
-    \item[Masked language modelling] \marginnote{Masked language modelling}
-        Main training task of transformer encoders. It consists of predicting missing or corrupted tokens in a sequence.
-
-        \begin{remark}
-            Transformer encoders output embeddings. For training purposes, a head to output a distribution over the vocabulary is added.
-        \end{remark}
-
-        \begin{example}
-            Given a training corpus, BERT is trained by randomly sampling $15\%$ of the tokens in the training data and either:
-            \begin{itemize}
-                \item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
-                \item Replace it with a different token ($10\%$ of the time).
-                \item Do nothing ($10\%$ of the time).
-            \end{itemize}
-
-            \begin{figure}[H]
-                \centering
-                \includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf}
-            \end{figure}
-
-            \indenttbox
-            \begin{remark}
-                BERT's training approach is inefficient as masks are determined before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa), dynamically determine the mask at training time, allowing for more variety.
-            \end{remark}
-        \end{example}
-\end{description}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_model_efficiency.tex b/src/year2/natural-language-processing/sections/_model_efficiency.tex
new file mode 100644
index 0000000..fed668b
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_model_efficiency.tex
@@ -0,0 +1,85 @@
+\chapter{Efficient model utilization}
+
+
+
+\section{Low-rank adaptation} \label{sec:lora}
+
+\begin{description}
+    \item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)}
+        Method to update weights by learning an offset that uses fewer parameters.
+
+        Consider a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$. LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as:
+        \[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{AB} \]
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.4\linewidth]{./img/_lora.pdf}
+        \end{figure}
+\end{description}
+
+
+
+\section{Model compression}
+
+
+\subsection{Parameter compression}
+
+\begin{description}
+    \item[Parameter sharing] \marginnote{Parameter sharing}
+        Use the same parameters across layers.
+
+    \item[Pruning] \marginnote{Pruning}
+        Remove weights with small impact on the loss.
+
+        \begin{remark}
+            Dropping some weights produces sparse matrices that are not optimized for parallel hardware. Therefore, this approach does not always improve efficiency.
+        \end{remark}
+
+    \item[Quantization] \marginnote{Quantization}
+        Store weights and perform operations with lower-precision floating-point formats (e.g., FP32 to FP4).
+\end{description}
+
+
+\subsection{Training compression}
+
+\begin{description}
+    \item[Mixture of experts] \marginnote{Mixture of experts}
+        Specialize smaller models on subsets of the data and train a router to forward the input to the correct expert.
+
+        \begin{remark}
+            This approach can be easily deployed on distributed systems.
+        \end{remark}
+
+    \item[Knowledge distillation] \marginnote{Knowledge distillation}
+        Train a student model to emulate a larger teacher model (e.g., its hidden states or, in the general setting, its output distribution). Two losses are used:
+        \begin{descriptionlist}
+            \item[Distillation loss]
+                Matches the output distribution of the student to the one of the teacher. A softmax with a higher temperature is usually used so that the training signal does not come only from the highest-probability class.
+
+            \item[Student loss]
+                Matches the output distribution of the student with the ground truth (i.e., the same loss as the training task).
+        \end{descriptionlist}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/_distillation.pdf}
+        \end{figure}
+
+    \item[Vocabulary transfer] \marginnote{Vocabulary transfer}
+        Use a domain-specific tokenizer to reduce the number of tokens needed to represent complex/domain-specific words and to reduce the size of the embedding matrix.
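+
+        \begin{remark}
+            As a purely illustrative example (hypothetical numbers): a general-purpose tokenizer might split a clinical term such as \texttt{bradycardia} into three sub-word tokens, while a tokenizer trained on medical text can represent it with a single token. Moreover, shrinking the vocabulary from $50\,000$ to $30\,000$ entries with embedding dimension $768$ removes $20\,000 \cdot 768 \approx 15.4$M parameters from the embedding matrix.
+        \end{remark}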
+
+        \begin{description}
+            \item[Fast vocabulary transfer (FVT)] \marginnote{Fast vocabulary transfer (FVT)}
+                Given:
+                \begin{itemize}
+                    \item A starting embedding model with tokenizer $\mathcal{T}_\text{s}$, vocabulary $V_\text{s}$, and embedding matrix $\matr{E}_\text{s}$,
+                    \item A new tokenizer $\mathcal{T}_\text{dom}$ trained on a domain-specific corpus,
+                \end{itemize}
+                the embedding matrix $\matr{E}_\text{dom}$ for the vocabulary $V_\text{dom}$ of $\mathcal{T}_\text{dom}$ is built as follows:
+                \[
+                    \forall t_i \in V_\text{dom}: \matr{E}_\text{dom}(t_i) = \frac{1}{|\mathcal{T}_\text{s}(t_i)|} \sum_{t_j \in \mathcal{T}_\text{s}(t_i)} \matr{E}_\text{s}(t_j)
+                \]
+                In other words, each token in $V_\text{dom}$ is encoded as the average of the embeddings of the tokens into which the starting tokenizer $\mathcal{T}_\text{s}$ splits it (if a token appears in both vocabularies, its embedding stays the same).
+        \end{description}
+
+\end{description}
\ No newline at end of file