mirror of https://github.com/NotXia/unibo-ai-notes.git

Add NLP metrics, smoothing, naive Bayes

@@ -213,26 +213,26 @@
Model with a single token context ($k=1$):
\[ \prob{w_{1..n}} \approx \prod_i \prob{w_i | w_{i-1}} \]

\item[$\mathbf{N}$-gram model] \marginnote{$N$-gram model}
Model with a context of $k=N-1$ tokens:
\[ \prob{w_{1..n}} \approx \prod_i \prob{w_i | w_{i-N+1..i-1}} \]

\begin{remark}
$N$-gram models cannot capture long-range dependencies.
\end{remark}
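
\begin{example}
A trigram model ($N = 3$) approximates a four-token sequence as:
\[ \prob{w_{1..4}} \approx \prob{w_1} \cdot \prob{w_2 | w_1} \cdot \prob{w_3 | w_{1..2}} \cdot \prob{w_4 | w_{2..3}} \]
where the first factors condition on fewer tokens (or on start-of-sequence padding) since less context is available at the beginning of the sequence.
\end{example}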

\begin{description}
\item[Estimating $\mathbf{N}$-gram probabilities]
Consider the bigram case: the probability that a token $w_i$ follows $w_{i-1}$ can be estimated by counting (a code sketch is given after this definition):
\[ \prob{w_i | w_{i-1}} = \frac{\texttt{count}(w_{i-1} w_i)}{\texttt{count}(w_{i-1})} \]
\end{description}
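
A minimal Python sketch of this counting estimate (the corpus and function name below are illustrative):
\begin{verbatim}
from collections import Counter

def bigram_probabilities(tokens):
    # Count unigrams and bigrams in the training sequence.
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    # P(w_i | w_{i-1}) = count(w_{i-1} w_i) / count(w_{i-1})
    return {(prev, curr): count / unigram_counts[prev]
            for (prev, curr), count in bigram_counts.items()}

corpus = "<s> i like nlp </s> <s> i like pizza </s>".split()
probs = bigram_probabilities(corpus)
print(probs[("i", "like")])    # 1.0
print(probs[("like", "nlp")])  # 0.5
\end{verbatim}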

\begin{remark}
$N$-gram models cannot handle unknown tokens.
\end{remark}

\begin{remark}
$N$-gram models capture knowledge about:
\begin{itemize}
\item Grammar and syntax.
\item Some information about the dataset (e.g., domain, genre of corpus, cultural aspects, \dots).

@@ -244,6 +244,138 @@
Randomly sample tokens from the distribution of a language model (a sampling sketch is given after the remark below).

\begin{remark}
In $N$-gram models ($N \geq 2$), the distribution changes depending on the previously sampled tokens.
\end{remark}
\end{description}
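
A minimal Python sketch of this sampling procedure for a bigram model (illustrative; \texttt{probs} is assumed to map $(w_{i-1}, w_i)$ pairs to conditional probabilities, as in the counting sketch above):
\begin{verbatim}
import random

def sample_sequence(probs, max_len=20, start="<s>", end="</s>"):
    # probs maps (previous token, next token) -> P(next | previous).
    sequence = [start]
    while len(sequence) < max_len and sequence[-1] != end:
        prev = sequence[-1]
        # Distribution over the next token given the previous one:
        # it changes at every step depending on the sampled history.
        candidates = [(nxt, p) for (w, nxt), p in probs.items()
                      if w == prev]
        if not candidates:
            break  # No known continuation for this context.
        tokens, weights = zip(*candidates)
        sequence.append(random.choices(tokens, weights=weights)[0])
    return sequence
\end{verbatim}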



\section{Metrics}

\subsection{Extrinsic evaluation}

\begin{description}
\item[Extrinsic/downstream evaluation] \marginnote{Extrinsic evaluation}
Compare the performance of different models on specific tasks.
\end{description}

\begin{remark}
Extrinsic evaluation is the best approach for comparing different models, but it is often computationally expensive.
\end{remark}


\subsection{Intrinsic evaluation}

\begin{description}
\item[Intrinsic evaluation] \marginnote{Intrinsic evaluation}
Measure the quality of a model independently of the task.

\item[Perplexity (\texttt{PP})] \marginnote{Perplexity}
Metric based on the inverse probability of a sequence (usually the test set), normalized by the number of words (a computational sketch is given at the end of this subsection):
\[
\begin{split}
\prob{w_{1..N}} &= \prod_{i} \prob{w_i | w_{1..i-1}} \\
\texttt{PP}(w_{1..N}) &= \prob{w_{1..N}}^{-\frac{1}{N}} \in [1, +\infty]
\end{split}
\]
A lower perplexity generally indicates a better model.

\begin{example}
For bigram models, perplexity is computed as:
\[
\prob{w_{1..N}} \approx \prod_{i} \prob{w_i | w_{i-1}}
\qquad
\texttt{PP}(w_{1..N}) = \sqrt[N]{\prod_{i} \frac{1}{\prob{w_i | w_{i-1}}}}
\]
\end{example}

\begin{remark}[Perplexity intuition]
Perplexity can be seen as a measure of surprise of a language model when evaluating a sequence.

Alternatively, it can also be seen as a weighted average branching factor (i.e., average number of possible next words that follow any word, accounting for their probabilities). For instance, consider a vocabulary of digits and a training corpus where every digit appears with uniform probability $0.1$. The perplexity of any sequence using a 1-gram model is:
\[ \texttt{PP}(w_{1..N}) = \left( 0.1^{N} \right)^{-\frac{1}{N}} = 10 \]
Now consider a training corpus where $0$ occurs $91\%$ of the time and the other digits $1\%$ of the time. The perplexity of the sequence \texttt{0 0 0 0 0 3 0 0 0 0} is:
\[ \texttt{PP}(\texttt{0 0 0 0 0 3 0 0 0 0}) = \left( 0.91^9 \cdot 0.01 \right)^{-\frac{1}{10}} \approx 1.73 \]
\end{remark}

\begin{remark}
Minimizing perplexity is the same as maximizing the probability of the tokens.
\end{remark}

\begin{remark}
Perplexity can be artificially reduced by using a smaller vocabulary. Therefore, it is only reasonable to compare the perplexity of models with the same vocabulary.
\end{remark}

\begin{remark}
Perplexity is generally a bad approximation of extrinsic metrics and only works well if the test set is representative of the training data. Therefore, it is only useful to guide experiments, and the final evaluation should be done through extrinsic evaluation.
\end{remark}
\end{description}
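
A minimal Python sketch of the bigram perplexity computation (illustrative; probabilities are accumulated in log space to avoid numerical underflow on long sequences):
\begin{verbatim}
import math

def perplexity(tokens, probs):
    # probs maps (previous token, next token) -> P(next | previous).
    # PP = exp(-(1/N) * sum_i log P(w_i | w_{i-1}))
    log_prob = sum(math.log(probs[(prev, curr)])
                   for prev, curr in zip(tokens, tokens[1:]))
    return math.exp(-log_prob / len(tokens[1:]))

# Bigram analogue of the uniform-digit example: every digit is
# equally likely after any digit, so the perplexity is 10.
uniform = {(a, b): 0.1 for a in "0123456789" for b in "0123456789"}
print(perplexity(list("00305"), uniform))  # ~10
\end{verbatim}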



\section{$\mathbf{N}$-gram model problems}

\subsection{Overfitting}
\marginnote{Overfitting}

$N$-gram models become better at modeling the training corpus as $N$ increases. This risks overfitting and prevents the model from generalizing.

\begin{example}
A 4-gram model trained on Shakespeare's works can generate sentences that are nearly identical to the original text.
\end{example}


\subsection{Out-of-vocabulary tokens}

There are two types of vocabulary systems:
\begin{descriptionlist}
\item[Closed vocabulary system] \marginnote{Closed vocabulary}
All words that can occur are known.

\item[Open vocabulary system] \marginnote{Open vocabulary}
Unknown words are possible. They are usually handled with a dedicated token \texttt{<UNK>}, which turns an open vocabulary system into a closed one:
\begin{itemize}
\item Use a fixed vocabulary and model all other words as \texttt{<UNK>}.
\item Model infrequent words as \texttt{<UNK>} (a sketch of this strategy is given after this list).
\end{itemize}

\begin{remark}
The training set must contain \texttt{<UNK>} tokens so that their distribution can be estimated, as \texttt{<UNK>} is treated like any other token.
\end{remark}
\end{descriptionlist}
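
A minimal Python sketch of the second strategy (illustrative; the frequency threshold is an arbitrary choice):
\begin{verbatim}
from collections import Counter

def replace_rare_tokens(tokens, min_count=2, unk="<UNK>"):
    # Words seen fewer than min_count times are mapped to <UNK>,
    # so the model also learns a distribution for unknown words.
    counts = Counter(tokens)
    return [t if counts[t] >= min_count else unk for t in tokens]
\end{verbatim}
At inference time, any word outside the resulting vocabulary is mapped to \texttt{<UNK>} in the same way before querying the model.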


\subsection{Unseen sequences}

A representative probability can only be estimated for $N$-grams that occur often enough. As $N$ increases, sparsity grows: many $N$-grams are never observed and get a probability of $0$, with the risk of dividing by zero (e.g., when computing perplexity) or zeroing out entire products (e.g., when applying the chain rule).

\begin{description}
\item[Laplace smoothing] \marginnote{Laplace smoothing}
Adds $1$ to all counts and renormalizes them (a code sketch is given at the end of this subsection). Given a vocabulary $V$ and an $N$-gram model, smoothing is done as follows:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-N+1..i-1}) = \frac{\texttt{count}(w_{i-N+1..i-1}w_i) + 1}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}
\]
Alternatively, by changing only the numerator, it can be formulated using an adjusted count $c^*$:
\[
\begin{gathered}
\mathcal{P}_\text{Laplace}(w_i | w_{i-N+1..i-1}) = \frac{c^*}{\texttt{count}(w_{i-N+1..i-1})} \\
c^* = \big( \texttt{count}(w_{i-N+1..i-1}w_i) + 1 \big) \frac{\texttt{count}(w_{i-N+1..i-1})}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}
\end{gathered}
\]
where $\frac{\texttt{count}(w_{i-N+1..i-1})}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}$ is a normalization factor.

\begin{example}
For a 2-gram model, Laplace smoothing is computed as:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-1}) = \frac{\texttt{count}(w_{i-1}w_i) + 1}{\texttt{count}(w_{i-1}) + \vert V \vert}
\]
Or, using the adjusted count:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-1}) = \frac{c^*}{\texttt{count}(w_{i-1})}
\qquad
c^* = \big( \texttt{count}(w_{i-1}w_i) + 1 \big) \frac{\texttt{count}(w_{i-1})}{\texttt{count}(w_{i-1}) + \vert V \vert}
\]
\end{example}
\end{description}
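
A minimal Python sketch of Laplace smoothing for a bigram model (illustrative; counts are recomputed on the fly for brevity):
\begin{verbatim}
from collections import Counter

def laplace_bigram_probability(prev, curr, tokens):
    # P_Laplace(curr | prev) =
    #   (count(prev curr) + 1) / (count(prev) + |V|)
    unigram_counts = Counter(tokens)
    bigram_counts = Counter(zip(tokens, tokens[1:]))
    vocabulary_size = len(unigram_counts)
    # Unseen bigrams get a small non-zero probability instead of 0.
    return ((bigram_counts[(prev, curr)] + 1)
            / (unigram_counts[prev] + vocabulary_size))
\end{verbatim}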