diff --git a/src/year2/natural-language-processing/img/_bert_training.pdf b/src/year2/natural-language-processing/img/_bert_training.pdf new file mode 100644 index 0000000..9e4bdb9 Binary files /dev/null and b/src/year2/natural-language-processing/img/_bert_training.pdf differ diff --git a/src/year2/natural-language-processing/img/_decoder_vs_encoder.pdf b/src/year2/natural-language-processing/img/_decoder_vs_encoder.pdf new file mode 100644 index 0000000..b0af59d Binary files /dev/null and b/src/year2/natural-language-processing/img/_decoder_vs_encoder.pdf differ diff --git a/src/year2/natural-language-processing/img/_lora.pdf b/src/year2/natural-language-processing/img/_lora.pdf new file mode 100644 index 0000000..28fa1bc Binary files /dev/null and b/src/year2/natural-language-processing/img/_lora.pdf differ diff --git a/src/year2/natural-language-processing/img/_masked_attention.pdf b/src/year2/natural-language-processing/img/_masked_attention.pdf new file mode 100644 index 0000000..8f2c70e Binary files /dev/null and b/src/year2/natural-language-processing/img/_masked_attention.pdf differ diff --git a/src/year2/natural-language-processing/nlp.tex b/src/year2/natural-language-processing/nlp.tex index 4c4b073..964230c 100644 --- a/src/year2/natural-language-processing/nlp.tex +++ b/src/year2/natural-language-processing/nlp.tex @@ -15,5 +15,6 @@ \include{./sections/_rnn.tex} \include{./sections/_attention.tex} \include{./sections/_llm.tex} + \include{./sections/_mlm.tex} \end{document} \ No newline at end of file diff --git a/src/year2/natural-language-processing/sections/_attention.tex b/src/year2/natural-language-processing/sections/_attention.tex index bf8bef5..a92d123 100644 --- a/src/year2/natural-language-processing/sections/_attention.tex +++ b/src/year2/natural-language-processing/sections/_attention.tex @@ -144,15 +144,15 @@ \end{descriptionlist} where $\matr{W}_Q \in \mathbb{R}^{d_\text{model} \times d_k}$, $\matr{W}_K \in \mathbb{R}^{d_\text{model} \times d_k}$, and $\matr{W}_V \in \mathbb{R}^{d_\text{model} \times d_v}$ are parameters. - Then, the attention weights $\vec{\alpha}_{i,j}$ between two embeddings $\vec{x}_i$ and $\vec{x}_j$ are computed as: + Then, the attention weights $\alpha_{i,j}$ between two embeddings $\vec{x}_i$ and $\vec{x}_j$ are computed as: \[ \begin{gathered} - \texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \cdot \vec{k}_j}{\sqrt{d_k}} \\ - \vec{\alpha}_{i,j} = \texttt{softmax}_j\left( \texttt{scores}(\vec{x}_i, \vec{x}_j) \right) + \texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \vec{k}_j}{\sqrt{d_k}} \\ + \alpha_{i,j} = \texttt{softmax}_j\left( \left[\texttt{scores}(\vec{x}_i, \vec{x}_1), \dots, \texttt{scores}(\vec{x}_i, \vec{x}_T)\right] \right) \end{gathered} \] The output $\vec{a}_i \in \mathbb{R}^{1 \times d_v}$ is a weighted sum of the values of each token: - \[ \vec{a}_i = \sum_{t} \vec{\alpha}_{i,t} \vec{v}_t \] + \[ \vec{a}_i = \sum_{t} \alpha_{i,t} \vec{v}_t \] To maintain the input dimension, a final projection $\matr{W}_O \in \mathbb{R}^{d_v \times d_\text{model}}$ is applied. @@ -166,12 +166,18 @@ Self-attention mechanism where only past tokens can be used to determine the representation of a token at a specific position.
It is computed by modifying the standard self-attention as: \[ \begin{gathered} - \forall j \leq i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \cdot \vec{k}_j}{\sqrt{d_k}} \qquad - \forall j > i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = \nullvec \\ - \vec{\alpha}_{i,j} = \texttt{softmax}_j\left( \texttt{scores}(\vec{x}_i, \vec{x}_j) \right) \\ - \vec{a}_i = \sum_{t: t \leq i} \vec{\alpha}_{i,t} \vec{v}_t + \forall j \leq i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = \frac{\vec{q}_i \vec{k}_j}{\sqrt{d_k}} \qquad + \forall j > i: \texttt{scores}(\vec{x}_i, \vec{x}_j) = -\infty \\ + \alpha_{i,j} = \texttt{softmax}_j\left( \left[\texttt{scores}(\vec{x}_i, \vec{x}_1), \dots, \texttt{scores}(\vec{x}_i, \vec{x}_T)\right] \right) \\ + \vec{a}_i = \sum_{t: t \leq i} \alpha_{i,t} \vec{v}_t \end{gathered} \] + + \begin{figure}[H] + \centering + \includegraphics[width=0.2\linewidth]{./img/_masked_attention.pdf} + \caption{Score matrix with causal attention} + \end{figure} \end{description} @@ -195,7 +201,7 @@ \begin{figure}[H] \centering - \includegraphics[width=0.4\linewidth]{./img/_positional_encoding.pdf} + \includegraphics[width=0.45\linewidth]{./img/_positional_encoding.pdf} \end{figure} \item[Transformer block] \marginnote{Transformer block} @@ -206,7 +212,7 @@ \begin{figure}[H] \centering - \includegraphics[width=0.5\linewidth]{./img/_multi_head_attention.pdf} + \includegraphics[width=0.6\linewidth]{./img/_multi_head_attention.pdf} \end{figure} \item[Feedforward layer] diff --git a/src/year2/natural-language-processing/sections/_llm.tex b/src/year2/natural-language-processing/sections/_llm.tex index 2080d51..adc57b8 100644 --- a/src/year2/natural-language-processing/sections/_llm.tex +++ b/src/year2/natural-language-processing/sections/_llm.tex @@ -108,7 +108,85 @@ $k$ is fixed and does not account for the shape of the distribution. \end{remark} - \item[Top-p sampling] + \item[Top-p/nucleus sampling] Consider the most likely words such that their probability mass adds up to $p$. Then, apply random sampling on their normalized distribution. \end{description} +\end{description} + + + +\section{Training} + + +\subsection{Pre-training} + +\begin{description} + \item[Pre-training] \marginnote{Pre-training} + Use self-supervision and teacher forcing to train all positions of the context window in parallel on a large text corpus. + + \begin{remark} + Results are highly dependent on the training corpora. Important aspects to consider are: + \begin{descriptionlist} + \item[Language] Most of the available data is in English. + \item[Data quality] Prefer high-quality sources such as Wikipedia or books. Boilerplate removal and deduplication might be needed. + \item[Safety filtering] Toxicity removal might be needed. + \item[Ethical and legal issues] Use of copyrighted material, permission from data owners, use of private information, \dots + \end{descriptionlist} + \end{remark} + + \item[Scaling laws] \marginnote{Scaling laws} + Empirical laws that relate: + \begin{itemize} + \item Non-embedding parameters $N$ ($N \approx 2 d_\text{model} n_\text{layer} (2 d_\text{attention} + d_\text{ff})$), + \item Training data size $D$, + \item Compute budget $C$ (i.e., the total amount of training computation).
+ \end{itemize} + By keeping two of the three factors constant, the loss $\mathcal{L}$ of an LLM can be estimated as a function of the third variable: + \[ + \mathcal{L}(N) = \left( \frac{N_c}{N} \right)^{\alpha_N} + \qquad + \mathcal{L}(D) = \left( \frac{D_c}{D} \right)^{\alpha_D} + \qquad + \mathcal{L}(C) = \left( \frac{C_c}{C} \right)^{\alpha_C} + \] +\end{description} + + +\subsection{Fine-tuning} + +\begin{description} + \item[Fine-tuning] \marginnote{Fine-tuning} + Specialize an LLM to a specific domain or task. + + \begin{description} + \item[Continued pre-training] \marginnote{Continued pre-training} + Continue pre-training with a domain-specific corpus. + + \item[Model adaptation] + Specialize a model by adding new learnable parameters. + + \begin{description} + \item[Parameter-efficient fine-tuning (PEFT)] \marginnote{Parameter-efficient fine-tuning (PEFT)} + Continue training a selected subset of parameters. + + \begin{description} + \item[Low-rank adaptation (LoRA)] \marginnote{Low-rank adaptation (LoRA)} + Method to update weights by learning a low-rank offset that uses far fewer parameters. + + Given a weight matrix $\matr{W} \in \mathbb{R}^{d \times k}$, LoRA decomposes the update into two learnable matrices $\matr{A} \in \mathbb{R}^{d \times r}$ and $\matr{B} \in \mathbb{R}^{r \times k}$ (with $r \ll d, k$). The weight update is performed as: + \[ \matr{W}_{\text{fine-tuned}} = \matr{W}_{\text{pre-trained}} + \matr{A}\matr{B} \] + + For example, with $d = k = 4096$ and $r = 8$, $\matr{A}$ and $\matr{B}$ together contain $2 \cdot 4096 \cdot 8 = 65\,536$ parameters, against the $16\,777\,216$ of the full matrix (a code sketch is given at the end of these notes). + + \begin{figure}[H] + \centering + \includegraphics[width=0.35\linewidth]{./img/_lora.pdf} + \end{figure} + \end{description} + + \item[Task-specific fine-tuning] \marginnote{Task-specific fine-tuning} + Add a new trainable head on top of the model. + \end{description} + + \item[Supervised fine-tuning] \marginnote{Supervised fine-tuning} + Continue training using a supervised dataset to align the model with human expectations. + \end{description} \end{description} \ No newline at end of file diff --git a/src/year2/natural-language-processing/sections/_mlm.tex b/src/year2/natural-language-processing/sections/_mlm.tex new file mode 100644 index 0000000..faafefe --- /dev/null +++ b/src/year2/natural-language-processing/sections/_mlm.tex @@ -0,0 +1,55 @@ +\chapter{Masked language models} + + + +\section{Bidirectional transformer encoder} + +\begin{description} + \item[Transformer encoder] \marginnote{Transformer encoder} + Architecture that produces contextual embeddings by considering both left-to-right and right-to-left context. + + \begin{remark} + This architecture acts as a feature extractor and is better suited for classification tasks. + \end{remark} + + \begin{description} + \item[Architecture] + Similar to a transformer decoder, but self-attention is not causal. + + \begin{figure}[H] + \centering + \includegraphics[width=0.75\linewidth]{./img/_decoder_vs_encoder.pdf} + \end{figure} + \end{description} +\end{description} + + +\subsection{Masked language modelling} + +\begin{description} + \item[Masked language modelling] \marginnote{Masked language modelling} + Main training task of transformer encoders. It consists of predicting missing or corrupted tokens in a sequence. + + \begin{remark} + Transformer encoders output embeddings. For training purposes, a head that outputs a distribution over the vocabulary is added. + \end{remark} + + \begin{example} + Given a training corpus, BERT is trained by randomly sampling $15\%$ of its tokens and, for each sampled token, doing one of the following: + \begin{itemize} + \item Mask it with a special \texttt{[MASK]} token ($80\%$ of the time).
+ \item Replace it with a random token ($10\%$ of the time). + \item Leave it unchanged ($10\%$ of the time). + \end{itemize} + In all three cases, the model is trained to predict the original token at the sampled position. + + \begin{figure}[H] + \centering + \includegraphics[width=0.6\linewidth]{./img/_bert_training.pdf} + \end{figure} + + \indenttbox + \begin{remark} + BERT's training approach is inefficient, as the masks are determined once before training and only $15\%$ of the corpus tokens are actually used for training. Other models (e.g., RoBERTa) determine the masks dynamically at training time, allowing for more variety across epochs. + \end{remark} + \end{example} +\end{description} \ No newline at end of file
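To make the $80/10/10$ corruption procedure above concrete, the following is a minimal Python sketch of the masking step, not BERT's actual implementation: the token ids, the vocabulary size, and the id of the \texttt{[MASK]} token (the placeholder \texttt{mask\_id} below) are assumed to be provided by the tokenizer, and the rates are the ones reported above. Re-sampling the corruption at every call, as done here, corresponds to the dynamic masking used by RoBERTa.

\begin{verbatim}
import random

def mask_for_mlm(token_ids, vocab_size, mask_id,
                 select_rate=0.15, mask_rate=0.8, random_rate=0.1, rng=None):
    """Return (corrupted_ids, targets) for one training sequence.

    targets[i] is the original id of a selected position (where the loss
    is computed) and None elsewhere.
    """
    rng = rng or random.Random()
    corrupted = list(token_ids)
    targets = [None] * len(token_ids)
    for i, tok in enumerate(token_ids):
        if rng.random() >= select_rate:
            continue                       # position not selected: no loss here
        targets[i] = tok                   # the original token is always the target
        r = rng.random()
        if r < mask_rate:                  # 80%: replace with [MASK]
            corrupted[i] = mask_id
        elif r < mask_rate + random_rate:  # 10%: replace with a random token
            corrupted[i] = rng.randrange(vocab_size)
        # remaining 10%: leave the token unchanged
    return corrupted, targets

# Calling the function at every training step (instead of once, offline)
# is what distinguishes dynamic from static masking.
corrupted, targets = mask_for_mlm([12, 7, 99, 3, 55, 8], vocab_size=30000,
                                  mask_id=103, rng=random.Random(0))
\end{verbatim}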
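As a complement to the LoRA description in the fine-tuning section, here is a minimal NumPy sketch of the low-rank update $\matr{W}_{\text{pre-trained}} + \matr{A}\matr{B}$. It omits the training loop (and the scaling factor that practical implementations often add); the initialization of $\matr{A}$ (Gaussian) and $\matr{B}$ (zeros) follows the LoRA paper, while the dimensions are only illustrative.

\begin{verbatim}
import numpy as np

def lora_forward(x, W, A, B):
    """Forward pass with a LoRA adapter: x W + (x A) B.

    Computing (x A) B instead of x (A B) avoids materialising
    the full d-by-k update matrix during training.
    """
    return x @ W + (x @ A) @ B

d, k, r = 4096, 4096, 8
rng = np.random.default_rng(0)
W = rng.normal(size=(d, k))           # frozen pre-trained weights
A = rng.normal(size=(d, r)) * 0.01    # trainable, Gaussian initialisation
B = np.zeros((r, k))                  # trainable, zeros: A @ B = 0 at the start

x = rng.normal(size=(1, d))           # a single input embedding
y = lora_forward(x, W, A, B)

# Trainable parameters: d*r + r*k = 65,536 instead of d*k = 16,777,216.
# For deployment, the update can be merged back: W_finetuned = W + A @ B.
\end{verbatim}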