Add NLP LLM tuning
New image files:
src/year2/natural-language-processing/img/induction_head.png (15 KiB)
src/year2/natural-language-processing/img/prefix_tuning.png (53 KiB)
src/year2/natural-language-processing/img/rlhf.png (287 KiB)
@@ -16,5 +16,6 @@
\include{./sections/_attention.tex}
\include{./sections/_llm.tex}
\include{./sections/_model_efficiency.tex}
\include{./sections/_llm_usage.tex}

\end{document}
@@ -0,0 +1,77 @@
\chapter{Language model alignment and applications}


\section{Model alignment}

\begin{remark}
Off-the-shelf pre-trained models tend to be good only at word completion. They are most likely unable to understand instructions and might generate harmful content.
\end{remark}


\subsection{Instruction tuning}

\begin{description}
\item[Instruction tuning] \marginnote{Instruction tuning}
Fine-tune a model on a dataset containing various tasks expressed in natural language in the form $(\text{description}, \text{examples}, \text{solution})$, all possibly formatted using multiple templates (a small formatting sketch is given below).

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_instruction_tuning.pdf}
\caption{Example of templates for entailment detection}
\end{figure}

\begin{remark}
If performed correctly, instruction tuning also makes the model able to solve tasks that were not present in the tuning dataset.
\end{remark}

\begin{figure}[H]
\centering
\begin{subfigure}[c]{0.34\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_tuning_comparison1.pdf}
\end{subfigure}
\hfill
\begin{subfigure}[c]{0.6\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_tuning_comparison2.pdf}
\end{subfigure}
\caption{Comparison of tuning approaches}
\end{figure}
\end{description}
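
As a minimal sketch of how such $(\text{description}, \text{examples}, \text{solution})$ entries could be turned into training text (in Python; the template wording and helper names are illustrative and not taken from any specific instruction-tuning dataset):
\begin{verbatim}
# Sketch: format (description, examples, solution) triples with one of
# several natural language templates before fine-tuning on the result.
import random

TEMPLATES = [
    "{description}\n\n{examples}\n\nInput: {input}\nAnswer: {solution}",
    "Task: {description}\nFor instance:\n{examples}\n{input} -> {solution}",
]

def format_instance(description, examples, input_text, solution):
    """Fill a randomly chosen template with one training instance."""
    template = random.choice(TEMPLATES)
    return template.format(description=description,
                           examples="\n".join(examples),
                           input=input_text,
                           solution=solution)

print(format_instance(
    description="Decide whether the hypothesis is entailed by the premise.",
    examples=["Premise: A dog runs. Hypothesis: An animal moves. -> yes"],
    input_text="Premise: A woman is singing. Hypothesis: A person is making sound.",
    solution="yes",
))
\end{verbatim}
The formatted strings are then used as ordinary training sequences for fine-tuning.
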
\subsection{Preference alignment}

\begin{description}
\item[Preference alignment] \marginnote{Preference alignment}
Align the output of a model with human values.

\item[Reinforcement learning with human feedback (RLHF)] \marginnote{Reinforcement learning with human feedback (RLHF)}
Align a language model using a policy-gradient reinforcement learning algorithm. The problem can be formulated as follows:
\begin{itemize}
\item The policy to learn represents the aligned model (i.e., a $\texttt{prompt} \mapsto \texttt{answer}$ model),
\item Prompts are the states,
\item Answers are the actions.
\end{itemize}
RLHF works as follows:
\begin{enumerate}
\item Start from a pre-trained language model that already works well.

\item Train a reward model $r_\theta$ that maps text sequences into rewards, using a human-annotated dataset. The architecture is usually based on transformers.

\item Fine-tune the language model (i.e., train the policy) using an RL algorithm (e.g., PPO) and the learned reward model.

Given a prompt $x$ and an answer $y$, the reward $r$ used for the RL update is computed as (a numerical sketch is given below):
\[ r = r_\theta(y \mid x) - \lambda_\text{KL} D_\text{KL}(\pi_{\text{PPO}}(y \mid x) \Vert \pi_{\text{base}}(y \mid x)) \]
where:
\begin{itemize}
\item $r_\theta(y \mid x)$ is the reward provided by the reward model.
\item $- \lambda_\text{KL} D_\text{KL}(\pi_{\text{PPO}}(y \mid x) \Vert \pi_{\text{base}}(y \mid x))$ is a penalty based on the Kullback-Leibler divergence that prevents the aligned model $\pi_\text{PPO}$ from moving too far away from the original model $\pi_\text{base}$ (i.e., it prevents the loss of language capabilities).
\end{itemize}
\end{enumerate}

\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/rlhf.png}
\end{figure}
\end{description}
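
As a minimal numerical sketch of the reward defined above (plain Python; it assumes the per-token log-probabilities of the sampled answer under $\pi_\text{PPO}$ and $\pi_\text{base}$ are already available, corresponding to the usual single-sample estimate of the KL term; the values are made up):
\begin{verbatim}
# Sketch: RL reward = reward model score minus a KL penalty estimated
# from the log-probabilities of the sampled answer's tokens.
def rlhf_reward(reward_model_score, logprobs_ppo, logprobs_base, kl_coeff=0.1):
    # Single-sample KL estimate: sum over tokens of (log pi_PPO - log pi_base).
    kl_penalty = sum(lp - lb for lp, lb in zip(logprobs_ppo, logprobs_base))
    return reward_model_score - kl_coeff * kl_penalty

# A well-rated answer whose policy drifted slightly from the base model:
print(rlhf_reward(2.3, [-0.5, -1.1, -0.2], [-0.6, -1.3, -0.3]))  # 2.3 - 0.1 * 0.4
\end{verbatim}
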
@@ -81,5 +81,72 @@
\]
In other words, each token in $V_\text{dom}$ is encoded as the average of the embeddings of the tokens that compose it in the starting embedding model (if the token appears in both vocabularies, the embedding is the same). A worked example is given below.
\end{description}
\end{description}
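
As a hypothetical worked example (the token and the notation $E(\cdot)$ and $E_\text{dom}(\cdot)$ for the starting and domain-specific embeddings are introduced here purely for illustration):
\begin{example}
Suppose the domain-specific token \texttt{bioinformatics} $\in V_\text{dom}$ is split by the starting tokenizer into \texttt{bio} and \texttt{informatics}. Its embedding is then initialized as:
\[ E_\text{dom}(\texttt{bioinformatics}) = \frac{E(\texttt{bio}) + E(\texttt{informatics})}{2} \]
A token such as \texttt{the}, which appears in both vocabularies, keeps its original embedding $E(\texttt{the})$.
\end{example}

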
\section{In-context learning}

\begin{description}
\item[Prompting] \marginnote{Prompting}
Pass a prompt to the language model to condition generation.

More formally, a prompt is defined by means of a prompting function $f_\text{prompt}(\cdot)$ that formats an input text $x$. $f_\text{prompt}$ typically has a slot for the input and a slot for the answer (e.g., the class in case of classification). The prompt is then fed to the language model, which searches for the highest-scoring word $\hat{z}$ to fill the answer slot as follows:
\[ \hat{z} = \arg\max_z \prob{ f_\text{fill}(f_\text{prompt}(x), z); \theta } \]
where $f_\text{fill}(f_\text{prompt}(x), z)$ inserts $z$ into the prompt. In other words, we are looking for the word that makes the model least perplexed (a scoring sketch is given below).

\begin{example}
A prompt for sentiment analysis of movie reviews might be:
\begin{center}
\texttt{[X] Overall, it was a [Z] movie.}
\end{center}
where \texttt{[X]} is the placeholder for the review and \texttt{[Z]} is for the class.
\end{example}

\begin{remark}
The prompt does not necessarily need to be text (i.e., discrete/hard prompts). Continuous/soft prompts (i.e., embeddings) can also be used to condition generation.
\end{remark}


\item[Zero-shot learning] \marginnote{Zero-shot learning}
Solve a task by providing a language model with the description of the problem in natural language.

\item[One-shot learning] \marginnote{One-shot learning}
Solve a task by providing a language model with the description of the problem in natural language and a single demonstration (i.e., an example).

\item[Few-shot learning] \marginnote{Few-shot learning}
Solve a task by providing a language model with the description of the problem in natural language and a few demonstrations.

\begin{remark}
Empirical results show that only a few examples are required. Moreover, using too many examples might reduce performance.
\end{remark}
\end{description}
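
As a minimal sketch of prompt-based classification using the template from the example above (Python with the Hugging Face \texttt{transformers} library; GPT-2 and the candidate words \texttt{good}/\texttt{bad} are used purely for illustration):
\begin{verbatim}
# Sketch: fill the answer slot with each candidate word and keep the one
# to which the language model assigns the highest score.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def sequence_logprob(text):
    """Total log-probability of the sequence under the language model."""
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)
    return -out.loss.item() * (ids.shape[1] - 1)  # loss is the mean NLL per token

def f_prompt(x):
    return f"{x} Overall, it was a [Z] movie."

def classify(x, candidates=("good", "bad")):
    # f_fill: replace the [Z] slot with the candidate z, then score the prompt.
    return max(candidates,
               key=lambda z: sequence_logprob(f_prompt(x).replace("[Z]", z)))

print(classify("The plot was dull and the acting was worse."))
\end{verbatim}
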
\begin{remark}
Some studies suggest that an explanation for in-context learning is that causal attention has the same effect as gradient updates (i.e., the left part of the prompt influences the right part).

Another possible explanation is based on the concept of induction heads, which are attention heads that specialize in predicting repeated sequences (i.e., in-context learning is seen as the capability of imitating past data). Ablation studies show that identifying and removing induction heads drastically reduces the in-context learning performance of a model.

\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/induction_head.png}
\caption{Example of induction head}
\end{figure}
\end{remark}

\begin{description}
\item[Prefix-tuning] \marginnote{Prefix-tuning}
Soft prompting technique that learns task-specific prefix embeddings to prepend to the prompt, while keeping the rest of the model frozen (a sketch is given at the end of this section).

\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/prefix_tuning.png}
\end{figure}

\item[Chain-of-thought prompting] \marginnote{Chain-of-thought prompting}
Provide examples of reasoning in the prompt to make the model produce its output step-by-step.

\begin{remark}
Empirical results show that an effective way to elicit chain-of-thought reasoning is to simply append \texttt{think step by step} to the prompt.
\end{remark}
\end{description}
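
As a simplified sketch of prefix-tuning (Python with PyTorch and a Hugging Face causal LM; for brevity the learned embeddings are prepended only at the input layer, whereas the original method learns prefixes for the activations of every layer; the model choice and sizes are illustrative):
\begin{verbatim}
# Sketch: only the prefix embeddings are trainable; the model is frozen.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
for p in model.parameters():
    p.requires_grad = False          # freeze the original model

n_prefix, hidden = 10, model.config.n_embd
prefix = torch.nn.Parameter(torch.randn(n_prefix, hidden) * 0.02)

def forward_with_prefix(input_ids):
    tok_emb = model.get_input_embeddings()(input_ids)        # (B, T, H)
    prefix_emb = prefix.unsqueeze(0).expand(input_ids.shape[0], -1, -1)
    inputs_embeds = torch.cat([prefix_emb, tok_emb], dim=1)  # prepend prefix
    return model(inputs_embeds=inputs_embeds).logits

# During task-specific training, only `prefix` receives gradient updates.
optimizer = torch.optim.Adam([prefix], lr=1e-3)
\end{verbatim}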