Add NLP metrics, smoothing, naive Bayes

2024-10-11 20:06:10 +02:00
parent 88cde35721
commit 7de54e2693
3 changed files with 270 additions and 6 deletions

View File

@ -10,5 +10,6 @@
\makenotesfront
\input{./sections/_basic_text.tex}
\input{./sections/_language_models.tex}
\input{./sections/_classification.tex}
\end{document}

View File

@ -0,0 +1,131 @@
\chapter{Text classification}
\section{Common tasks}
\begin{description}
\item[Sentiment analysis/Opinion mining] \marginnote{Sentiment analysis/Opinion mining}
Detection of attitudes. It can involve detecting:
\begin{itemize}
\item The holder of the attitude (i.e., the source).
\item The target of the attitude (i.e., the aspect).
\item The type of attitude (e.g., positive or negative).
\item The text containing the attitude.
\end{itemize}
\item[Spam detection]
\item[Language identification]
\item[Authorship attribution]
\item[Subject category classification]
\end{description}
\section{Classification}
\begin{description}
\item[Classification task] \marginnote{Classification task}
Given an input $x$ and a set of possible classes $Y = \{ y_1, \dots, y_M \}$, a classifier determines the class $\hat{y} \in Y$ associated with $x$.
Classification can be:
\begin{descriptionlist}
\item[Rule-based] \marginnote{Rule-based}
Based on fixed (possibly handwritten) rules.
\begin{example}
Blacklist, whitelist, regex, \dots
\end{example}
\item[In-context learning] \marginnote{In-context learning}
Provide a decoder (i.e., generative) large language model with a prompt describing the task and the possible classes.
\begin{example}
Zero-shot learning, few-shot learning, \dots
\end{example}
\item[Supervised machine learning] \marginnote{Supervised machine learning}
Use a training set of $N$ labeled samples $\{ (d_i, c_i) \}$ to fit a classifier.
An ML model can be:
\begin{descriptionlist}
\item[Generative] Informally, it learns the distribution of the data.
\item[Discriminative] Informally, it learns to exploit the features to determine the class.
\end{descriptionlist}
\end{descriptionlist}
\end{description}
\section{Naive Bayes}
\begin{description}
\item[Bag-of-words (BoW)] \marginnote{Bag-of-words (BoW)}
Represents a document using the frequencies of its words.
Given a vocabulary $V$ and a document $d$, the bag-of-words embedding of $d$ is a vector in $\mathbb{N}^{\vert V \vert}$ where the $i$-th position contains the number of occurrences of the $i$-th token in $d$.
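\begin{example}
With vocabulary $V = (\texttt{good}, \texttt{bad}, \texttt{movie}, \texttt{fun})$, the document ``\texttt{good movie good fun}'' is represented as the vector $(2, 0, 1, 1)$.
\end{example}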
\item[Multinomial naive Bayes classifier] \marginnote{Multinomial naive Bayes classifier}
Probabilistic (i.e., generative) classifier based on the assumption that features are independent given the class.
Given a document $d = \{ w_1, \dots, w_n \}$, a naive Bayes classifier returns the class $\hat{c}$ with maximum posterior probability:
\[
\begin{split}
\hat{c} &= \arg\max_{c \in C} \prob{c | d} \\
&= \arg\max_{c \in C} \underbrace{\prob{d | c}}_{\text{likelihood}} \underbrace{\prob{c}}_{\text{prior}} \\
&= \arg\max_{c \in C} \prob{w_1, \dots, w_n | c} \prob{c} \\
&= \arg\max_{c \in C} \prod_{i} \prob{w_i | c} \prob{c} \\
&= \arg\max_{c \in C} \left( \log\prob{c} + \sum_{i} \log\prob{w_i | c} \right) \\
\end{split}
\]
Given a training set $D$ and a vocabulary $V$, $\prob{c}$ and $\prob{w_i | c}$ are estimated during training by maximum likelihood estimation as follows:
\[
\prob{c} = \frac{N_c}{\vert D \vert}
\qquad
\prob{w_i | c} = \frac{\texttt{count}(w_i, c)}{\sum_{v \in V} \texttt{count}(v, c)}
\]
where $N_c$ is the number of training documents with class $c$ and $\texttt{count}(w, c)$ counts the occurrences of the word $w$ in the training samples with class $c$.
\begin{remark}
Laplace smoothing is used to avoid zero probabilities for words that never occur with a class.
\end{remark}
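\begin{remark}
Concretely, the Laplace-smoothed likelihood estimate is:
\[ \prob{w_i | c} = \frac{\texttt{count}(w_i, c) + 1}{\sum_{v \in V} \texttt{count}(v, c) + \vert V \vert} \]
\end{remark}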
\begin{remark}
Stop words can be removed from the training set as they are usually not relevant.
\end{remark}
\begin{remark}
The likelihood part of the equation ($\sum_{i} \log\prob{w_i | c}$) can be seen as a set of class-specific 1-gram language models.
\end{remark}
\end{description}
\begin{example}
Given the following training set for sentiment analysis with two classes:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{cl}
\toprule
\textbf{Class} & \textbf{Document} \\
\midrule
\texttt{-} & \texttt{just plain boring} \\
\texttt{-} & \texttt{entirely predictable and lacks energy} \\
\texttt{-} & \texttt{no surprises and very few laughs} \\
\texttt{+} & \texttt{very powerful} \\
\texttt{+} & \texttt{the most fun film of the summer} \\
\bottomrule
\end{tabular}
\end{table}
We want to classify the sentence ``\texttt{predictable with no fun}''. Excluding stop words (i.e., \texttt{with}), we need to compute:
\[
\begin{split}
\prob{\texttt{+} | \texttt{predictable with no fun}} &\propto \prob{\texttt{+}} \prob{\texttt{predictable} | \texttt{+}} \prob{\texttt{no} | \texttt{+}} \prob{\texttt{fun} | \texttt{+}} \\
\prob{\texttt{-} | \texttt{predictable with no fun}} &\propto \prob{\texttt{-}} \prob{\texttt{predictable} | \texttt{-}} \prob{\texttt{no} | \texttt{-}} \prob{\texttt{fun} | \texttt{-}}
\end{split}
\]
The training set has a vocabulary of $\vert V \vert = 20$ tokens, with $9$ word occurrences in the positive documents and $14$ in the negative ones. The required likelihoods and priors with Laplace smoothing are computed as:
\[
\begin{gathered}
\prob{\texttt{+}} = \frac{2}{5} \qquad \prob{\texttt{predictable} | \texttt{+}} = \frac{0+1}{9+20} \quad \prob{\texttt{no} | \texttt{+}} = \frac{0+1}{9+20} \quad \prob{\texttt{fun} | \texttt{+}} = \frac{1+1}{9+20} \\
\prob{\texttt{-}} = \frac{3}{5} \qquad \prob{\texttt{predictable} | \texttt{-}} = \frac{1+1}{14+20} \quad \prob{\texttt{no} | \texttt{-}} = \frac{1+1}{14+20} \quad \prob{\texttt{fun} | \texttt{-}} = \frac{0+1}{14+20}
\end{gathered}
\]
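Combining priors and likelihoods, the negative class obtains the higher score:
\[
\begin{gathered}
\prob{\texttt{+}} \prob{\texttt{predictable} | \texttt{+}} \prob{\texttt{no} | \texttt{+}} \prob{\texttt{fun} | \texttt{+}} = \frac{2}{5} \cdot \frac{1 \cdot 1 \cdot 2}{29^3} \approx 3.3 \cdot 10^{-5} \\
\prob{\texttt{-}} \prob{\texttt{predictable} | \texttt{-}} \prob{\texttt{no} | \texttt{-}} \prob{\texttt{fun} | \texttt{-}} = \frac{3}{5} \cdot \frac{2 \cdot 2 \cdot 1}{34^3} \approx 6.1 \cdot 10^{-5}
\end{gathered}
\]
Therefore, the sentence is classified as negative.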
\end{example}
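The following is a minimal Python sketch of a multinomial naive Bayes classifier with Laplace smoothing, applied to the example above (function names are illustrative; words outside the training vocabulary, such as \texttt{with}, are simply skipped):
\begin{verbatim}
# Sketch of a multinomial naive Bayes classifier with Laplace smoothing.
import math
from collections import Counter, defaultdict

def train(docs):
    # docs: list of (tokens, label) pairs
    priors = Counter(label for _, label in docs)
    counts = defaultdict(Counter)      # counts[label][word]
    vocab = set()
    for tokens, label in docs:
        counts[label].update(tokens)
        vocab.update(tokens)
    return priors, counts, vocab

def predict(tokens, priors, counts, vocab):
    n_docs = sum(priors.values())
    best_label, best_score = None, -math.inf
    for label in priors:
        # log prior + sum of Laplace-smoothed log likelihoods
        score = math.log(priors[label] / n_docs)
        denom = sum(counts[label].values()) + len(vocab)
        for w in tokens:
            if w in vocab:             # out-of-vocabulary words are skipped
                score += math.log((counts[label][w] + 1) / denom)
        if score > best_score:
            best_label, best_score = label, score
    return best_label

docs = [("just plain boring".split(), "-"),
        ("entirely predictable and lacks energy".split(), "-"),
        ("no surprises and very few laughs".split(), "-"),
        ("very powerful".split(), "+"),
        ("the most fun film of the summer".split(), "+")]
print(predict("predictable with no fun".split(), *train(docs)))  # -
\end{verbatim}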

View File

@ -213,26 +213,26 @@
Model with a single token context ($k=1$):
\[ \prob{w_{1..n}} \approx \prod_i \prob{w_i | w_{i-1}} \]
\item[$\mathbf{N}$-gram model] \marginnote{$N$-gram model}
Model with a context of $k=N-1$ tokens:
\[ \prob{w_{1..n}} \approx \prod_i \prob{w_i | w_{i-N+1..i-1}} \]
\begin{remark}
$N$-gram models cannot capture long-range dependencies.
\end{remark}
\begin{description}
\item[Estimating $\mathbf{N}$-gram probabilities]
In the bigram case, the probability that a token $w_i$ follows $w_{i-1}$ can be estimated by counting:
\[ \prob{w_i | w_{i-1}} = \frac{\texttt{count}(w_{i-1} w_i)}{\texttt{count}(w_{i-1})} \]
\end{description}
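\begin{example}
Suppose a corpus contains $1000$ occurrences of \texttt{the} and $20$ occurrences of the bigram \texttt{the cat} (illustrative counts). Then $\prob{\texttt{cat} | \texttt{the}} = \frac{20}{1000} = 0.02$.
\end{example}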
\begin{remark}
$N$-gram models cannot handle unknown tokens.
\end{remark}
\begin{remark}
$N$-gram models capture knowledge about:
\begin{itemize}
\item Grammar and syntax.
\item Some information about the dataset (e.g., domain, genre of corpus, cultural aspects, \dots).
@ -244,6 +244,138 @@
Randomly sample tokens from the distribution of a language model.
\begin{remark}
In $N$-gram models ($N \geq 2$), the distribution changes depending on the previously sampled tokens.
\end{remark}
\end{description}
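A minimal Python sketch of sampling a sentence from a bigram model, assuming the conditional distributions are stored as nested dictionaries (the example distributions are illustrative):
\begin{verbatim}
# Sketch: generate tokens from a bigram model by repeatedly sampling
# the next token from the distribution conditioned on the last one.
import random

# bigram[w] maps each possible next token to P(next | w) (toy values)
bigram = {
    "<s>": {"the": 0.6, "a": 0.4},
    "the": {"cat": 0.5, "dog": 0.5},
    "a":   {"cat": 0.3, "dog": 0.7},
    "cat": {"</s>": 1.0},
    "dog": {"</s>": 1.0},
}

def sample_sentence(bigram, max_len=10):
    tokens, current = [], "<s>"
    while current != "</s>" and len(tokens) < max_len:
        candidates, probs = zip(*bigram[current].items())
        current = random.choices(candidates, weights=probs)[0]
        if current != "</s>":
            tokens.append(current)
    return " ".join(tokens)

print(sample_sentence(bigram))  # e.g., "the cat"
\end{verbatim}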
\section{Metrics}
\subsection{Extrinsic evaluation}
\begin{description}
\item[Extrinsic/downstream evaluation] \marginnote{Extrinsic evaluation}
Compare the performance of different models on specific tasks.
\end{description}
\begin{remark}
Extrinsic evaluation is the best approach for comparing different models, but it is often computationally expensive.
\end{remark}
\subsection{Intrinsic evaluation}
\begin{description}
\item[Intrinsic evaluation] \marginnote{Intrinsic evaluation}
Measure the quality of a model independently of the task.
\item[Perplexity (\texttt{PP})] \marginnote{Perplexity}
Metric based on the inverse probability of a sequence (usually the test set), normalized by the number of words:
\[
\begin{split}
\prob{w_{1..N}} &= \prod_{i} \prob{w_i | w_{1..i-1}} \\
\texttt{PP}(w_{1..N}) &= \prob{w_{1..N}}^{-\frac{1}{N}} \in [1, +\infty]
\end{split}
\]
A lower perplexity generally indicates a better model.
\begin{example}
For bigram models, perplexity is computed as:
\[
\prob{w_{1..N}} \approx \prod_{i} \prob{w_i | w_{i-1}}
\qquad
\texttt{PP}(w_{1..N}) = \sqrt[N]{\prod_{i} \frac{1}{\prob{w_i | w_{i-1}}}}
\]
\end{example}
\begin{remark}[Perplexity intuition]
Perplexity can be seen as a measure of surprise of a language model when evaluating a sequence.
Alternatively, it can also be seen as a weighted average branching factor (i.e., average number of possible next words that follow any word, accounting for their probabilities). For instance, consider a vocabulary of digits and a training corpus where every digit appears with uniform probability $0.1$. The perplexity of any sequence using a 1-gram model is:
\[ \texttt{PP}(w_{1..N}) = \left( 0.1^{N} \right)^{-\frac{1}{N}} = 10 \]
Now consider a training corpus where $0$ occurs $91\%$ of the time and the other digits $1\%$ of the time. The perplexity of the sequence \texttt{0 0 0 0 0 3 0 0 0 0} is:
\[ \texttt{PP}(\texttt{0 0 0 0 0 3 0 0 0 0}) = \left( 0.91^9 \cdot 0.01 \right)^{-\frac{1}{10}} \approx 1.73 \]
\end{remark}
\begin{remark}
Minimizing perplexity is the same as maximizing the probability of the tokens.
\end{remark}
\begin{remark}
Perplexity can be artificially reduced by using a smaller vocabulary. Therefore, it is only reasonable to compare the perplexities of models that share the same vocabulary.
\end{remark}
\begin{remark}
Perplexity is generally a bad approximation of extrinsic metrics and only works well if the test set is representative of the training data. Therefore, it is only useful to guide experiments and the final evaluation should be done through extrinsic evaluation.
\end{remark}
\end{description}
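A minimal Python sketch of computing perplexity in log space under a bigram model (\texttt{bigram\_prob} is an assumed probability function and the first token is taken as given):
\begin{verbatim}
# Sketch: perplexity of a token sequence under a bigram model,
# accumulated in log space to avoid numerical underflow.
import math

def perplexity(tokens, bigram_prob):
    # bigram_prob(prev, w) is assumed to return P(w | prev)
    log_prob = 0.0
    for prev, w in zip(tokens, tokens[1:]):
        log_prob += math.log(bigram_prob(prev, w))
    n = len(tokens) - 1          # number of predicted tokens
    return math.exp(-log_prob / n)

# Uniform model over 10 digits: perplexity ~10, as in the remark above.
print(perplexity(list("00000300000"), lambda prev, w: 0.1))
\end{verbatim}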
\section{$\mathbf{N}$-gram model problems}
\subsection{Overfitting}
\marginnote{Overfitting}
$N$-gram models become better at modeling the training corpus as $N$ increases. This risks overfitting and prevents the model from generalizing.
\begin{example}
A 4-gram model trained on Shakespeare's works generates sentences that are nearly identical to the original text.
\end{example}
\subsection{Out-of-vocabulary tokens}
There are two types of vocabulary systems:
\begin{descriptionlist}
\item[Closed vocabulary system] \marginnote{Closed vocabulary}
All words that can occur are known.
\item[Open vocabulary system] \marginnote{Open vocabulary}
Unknown words are possible. They are usually handled using a dedicated token \texttt{<UNK>}, which allows turning an open vocabulary system into a closed one:
\begin{itemize}
\item Fix a vocabulary in advance and model all other words as \texttt{<UNK>}.
\item Model infrequent words as \texttt{<UNK>}.
\end{itemize}
\begin{remark}
The training set must contain \texttt{<UNK>} tokens so that their distribution can be estimated, as \texttt{<UNK>} is treated like any other token.
\end{remark}
\end{descriptionlist}
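A minimal Python sketch of the second strategy, where tokens below a frequency threshold are replaced with \texttt{<UNK>} (the threshold and names are illustrative):
\begin{verbatim}
# Sketch: build a closed vocabulary by mapping infrequent tokens to <UNK>.
from collections import Counter

def close_vocabulary(corpus, min_count=2):
    # corpus: list of token lists
    freq = Counter(t for sentence in corpus for t in sentence)
    return [[t if freq[t] >= min_count else "<UNK>" for t in sentence]
            for sentence in corpus]

corpus = [["the", "cat", "sleeps"], ["the", "dog", "sleeps"]]
print(close_vocabulary(corpus))
# [['the', '<UNK>', 'sleeps'], ['the', '<UNK>', 'sleeps']]
\end{verbatim}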
\subsection{Unseen sequences}
A representative probability can only be estimated for $N$-grams that occur enough times. As $N$ increases, sparsity grows and many unseen $N$-grams are assigned a probability of $0$, with the risk of dividing by zero (e.g., in perplexity) or zeroing out probabilities (e.g., when applying the chain rule).
\begin{description}
\item[Laplace smoothing] \marginnote{Laplace smoothing}
Adds $1$ to all counts and renormalizes them. Given a vocabulary $V$ and an $N$-gram model, smoothing is done as follows:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-N+1..i-1}) = \frac{\texttt{count}(w_{i-N+1..i-1}w_i) + 1}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}
\]
Alternatively, by only changing the numerator, it can be formulated using an adjusted count as:
\[
\begin{gathered}
\mathcal{P}_\text{Laplace}(w_i | w_{i-N+1..i-1}) = \frac{c^*}{\texttt{count}(w_{i-N+1..i-1})} \\
c^* = \big( \texttt{count}(w_{i-N+1..i-1}w_i) + 1 \big) \frac{\texttt{count}(w_{i-N+1..i-1})}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}
\end{gathered}
\]
where $\frac{\texttt{count}(w_{i-N+1..i-1})}{\texttt{count}(w_{i-N+1..i-1}) + \vert V \vert}$ is a normalization factor.
\begin{example}
For a 2-gram model, Laplace smoothing is computed as:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-1}) = \frac{\texttt{count}(w_{i-1}w_i) + 1}{\texttt{count}(w_{i-1}) + \vert V \vert}
\]
Or by using the adjusted count as:
\[
\mathcal{P}_\text{Laplace}(w_i | w_{i-1}) = \frac{c^*}{\texttt{count}(w_{i-1})}
\qquad
c^* = \big( \texttt{count}(w_{i-1}w_i) + 1 \big) \frac{\texttt{count}(w_{i-1})}{\texttt{count}(w_{i-1}) + \vert V \vert}
\]
\end{example}
\end{description}
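A minimal Python sketch of estimating Laplace-smoothed bigram probabilities from raw counts (the toy corpus and function names are illustrative):
\begin{verbatim}
# Sketch: Laplace-smoothed bigram probabilities estimated from raw counts.
from collections import Counter

def laplace_bigram(corpus):
    # corpus: list of token lists
    unigrams, bigrams = Counter(), Counter()
    vocab = set()
    for sentence in corpus:
        vocab.update(sentence)
        unigrams.update(sentence[:-1])           # contexts w_{i-1}
        bigrams.update(zip(sentence, sentence[1:]))
    V = len(vocab)
    def prob(prev, w):
        return (bigrams[(prev, w)] + 1) / (unigrams[prev] + V)
    return prob

prob = laplace_bigram([["the", "cat", "sleeps"], ["the", "dog", "sleeps"]])
print(prob("the", "cat"))  # (1 + 1) / (2 + 4) = 0.333...
print(prob("cat", "the"))  # unseen bigram: (0 + 1) / (1 + 4) = 0.2
\end{verbatim}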