Add NLP TF-IDF, PPMI, dense embeddings

2024-10-22 17:31:22 +02:00
parent 7e429b7711
commit 916fccea43
3 changed files with 214 additions and 3 deletions

Binary image file added (not shown); size: 50 KiB.

@@ -1,4 +1,4 @@
\chapter{Semantics}
\chapter{Semantics embedding}
\section{Traditional semantic representation}
@@ -166,7 +166,7 @@
\end{description}
\subsection{Co-occurrence (sparse) embeddings}
\subsection{Sparse embeddings}
\begin{description}
\item[Co-occurrence matrix] \marginnote{Co-occurrence matrix}
@@ -238,12 +238,145 @@
\end{description}
\item[Reweighting]
Rescale the value of the components of the vectors (e.g., make a probability, length normalization, TF-IDF, \dots).
Rescale the vectors to emphasize important features and down-weight irrelevant words.
\begin{remark}[Frequency paradox]
Raw frequencies are not an ideal representation for words as they are skewed and not discriminative. Moreover, overly frequent words (e.g., stop words) do not provide context information.
\end{remark}
\begin{description}
\item[Term frequency-inverse document frequency (TF-IDF)] \marginnote{Term frequency-inverse document frequency (TF-IDF)}
Based on term-document occurrences. Given a word $t$ and a document $d$, it is computed as:
\[ \texttt{tf-idf}(t, d) = \texttt{tf}(t, d) \cdot \texttt{idf}(t) \]
where:
\begin{descriptionlist}
\item[Term frequency (\texttt{tf})]
Log-transformed frequency count of a word $t$ in a document $d$:
\[
\texttt{tf}(t, d) = \begin{cases}
1 + \log_{10}\left( \texttt{count}(t, d) \right) & \text{if $\texttt{count}(t, d) > 0$} \\
0 & \text{otherwise}
\end{cases}
\]
\item[Inverse document frequency (\texttt{idf})]
Inverse occurrence count of a word $t$ across all documents:
\[ \texttt{idf}(t) = \log_{10}\left( \frac{N}{\texttt{df}_t} \right) \]
where $\texttt{df}_t$ is the number of documents in which the term $t$ occurs.
\begin{remark}
Words that occur in only a few documents have a high \texttt{idf}. Conversely, stop words, which appear in most documents, have a low \texttt{idf}.
\end{remark}
\end{descriptionlist}
\begin{example}
Consider the term-document matrix with \texttt{tf} in parentheses:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{ccccc}
\toprule
& \textit{As You Like It} & \textit{Twelfth Night} & \textit{Julius Caesar} & \textit{Henry V} \\
\midrule
\texttt{battle} & 1 ($1$) & 0 ($0$) & 7 ($1.845$) & 13 ($2.114$) \\
\texttt{good} & 114 ($3.057$) & 80 ($2.903$) & 62 ($2.792$) & 89 ($2.949$) \\
\texttt{fool} & 36 ($2.556$) & 58 ($2.763$) & 1 ($1$) & 4 ($1.602$) \\
\texttt{wit} & 20 ($2.301$) & 15 ($2.176$) & 2 ($1.301$) & 3 ($1.477$) \\
\bottomrule
\end{tabular}
\end{table}
Assume that the \texttt{df} and \texttt{idf} of the words are:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{ccc}
\toprule
\textbf{Word} & \texttt{df} & \texttt{idf} \\
\midrule
\texttt{battle} & $21$ & $0.246$ \\
\texttt{good} & $37$ & $0$ \\
\texttt{fool} & $36$ & $0.012$ \\
\texttt{wit} & $34$ & $0.037$ \\
\bottomrule
\end{tabular}
\end{table}
The resulting TF-IDF weighted matrix is:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{ccccc}
\toprule
& \textit{As You Like It} & \textit{Twelfth Night} & \textit{Julius Caesar} & \textit{Henry V} \\
\midrule
\texttt{battle} & $0.246$ & 0 & $0.454$ & $0.520$ \\
\texttt{good} & 0 & 0 & 0 & 0 \\
\texttt{fool} & $0.030$ & $0.033$ & $0.012$ & $0.019$ \\
\texttt{wit} & $0.085$ & $0.081$ & $0.048$ & $0.054$ \\
\bottomrule
\end{tabular}
\end{table}
\end{example}
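The following is a minimal Python sketch of the computation above. It assumes the collection has $N = 37$ documents in total (consistent with $\texttt{idf}(\texttt{good}) = 0$, since \texttt{good} occurs in $37$ documents); all names are illustrative.
\begin{verbatim}
import math

# Term-document counts from the example (four plays).
counts = {
    "battle": [1, 0, 7, 13],
    "good":   [114, 80, 62, 89],
    "fool":   [36, 58, 1, 4],
    "wit":    [20, 15, 2, 3],
}
# Document frequencies over the whole collection of N = 37 plays.
df = {"battle": 21, "good": 37, "fool": 36, "wit": 34}
N = 37

def tf(count):
    # Log-scaled term frequency: 1 + log10(count), or 0 if the term is absent.
    return 1 + math.log10(count) if count > 0 else 0.0

def idf(term):
    # Inverse document frequency: log10(N / df_t).
    return math.log10(N / df[term])

tfidf = {t: [tf(c) * idf(t) for c in docs] for t, docs in counts.items()}
print(tfidf["battle"])  # ~[0.246, 0.0, 0.454, 0.520]
\end{verbatim}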
\item[Positive point-wise mutual information (PPMI)] \marginnote{Positive point-wise mutual information (PPMI)}
Based on term-term occurrences. Given a word $w$ and a context word $c$, it determines whether they are correlated or occur by chance as follows:
\[ \texttt{PPMI}(w, c) = \max\left\{ \texttt{PMI}(w, c), 0 \right\} \]
where:
\begin{descriptionlist}
\item[Point-wise mutual information (\texttt{PMI})]
\[ \texttt{PMI}(w, c) = \log_2\left( \frac{\prob{w, c}}{\prob{w}\prob{c}} \right) \in (-\infty, +\infty) \]
where:
\begin{itemize}
\item The numerator is the observed probability that $w$ and $c$ co-occur.
\item The denominator is the probability that $w$ and $c$ would co-occur by chance if they were independent.
\end{itemize}
\begin{remark}
$\texttt{PMI} > 0$ indicates a correlated co-occurrence (more frequent than expected by chance). Otherwise, the co-occurrence is at or below chance level.
\end{remark}
\end{descriptionlist}
\begin{remark}[Weighting \texttt{PPMI}]
\texttt{PMI} is biased towards infrequent events and returns very high values for them. This can be solved by either:
\begin{itemize}
\item Using add-$k$ smoothing (typically, $k \in [0.1, 3]$).
\item Slightly increasing the probability of rare context words such that $\mathcal{P}_\alpha(c) = \frac{\texttt{count}(c)^\alpha}{\sum_{c'}\texttt{count}(c')^\alpha}$ (typically, $\alpha=0.75$).
\end{itemize}
\end{remark}
\begin{example}
Consider the term-term matrix:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{cccccc|c}
\toprule
& \texttt{computer} & \texttt{data} & \texttt{result} & \texttt{pie} & \texttt{sugar} & $\texttt{count}(w)$ \\
\midrule
\texttt{cherry} & $2$ & $8$ & $9$ & $442$ & $25$ & $486$ \\
\texttt{strawberry} & $0$ & $0$ & $1$ & $60$ & $19$ & $80$ \\
\texttt{digital} & $1670$ & $1683$ & $85$ & $5$ & $4$ & $3447$ \\
\texttt{information} & $3325$ & $3982$ & $378$ & $5$ & $13$ & $7703$ \\
\midrule
$\texttt{count}(c)$ & $4997$ & $5673$ & $473$ & $512$ & $61$ & $11716$ \\
\bottomrule
\end{tabular}
\end{table}
The PPMI between \texttt{information} and \texttt{data} can be computed as:
\[
\begin{split}
\prob{\texttt{information}, \texttt{data}} &= \frac{3982}{11716} = 0.3399 \\
\prob{\texttt{information}} &= \frac{7703}{11716} = 0.6575 \\
\prob{\texttt{data}} &= \frac{5673}{11716} = 0.4842 \\
\texttt{PPMI}(\texttt{information}, \texttt{data}) &= \max\left\{ \log_2\left( \frac{0.3399}{0.6575 \cdot 0.4872} \right), 0 \right\} = 0.0944
\end{split}
\]
\end{example}
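The following is a minimal Python sketch of the PPMI computation on the term-term matrix above; the $\alpha$-smoothing of the context probabilities from the previous remark is included as an optional parameter (all names are illustrative).
\begin{verbatim}
import numpy as np

words = ["cherry", "strawberry", "digital", "information"]
contexts = ["computer", "data", "result", "pie", "sugar"]
# Term-term co-occurrence counts from the example.
C = np.array([
    [2,    8,    9,   442, 25],
    [0,    0,    1,    60, 19],
    [1670, 1683, 85,    5,  4],
    [3325, 3982, 378,   5, 13],
], dtype=float)

def ppmi(C, alpha=None):
    total = C.sum()
    p_wc = C / total                            # joint probabilities P(w, c)
    p_w = C.sum(axis=1, keepdims=True) / total  # P(w)
    c_counts = C.sum(axis=0, keepdims=True)
    if alpha is None:
        p_c = c_counts / total                  # P(c)
    else:
        # Smoothed P_alpha(c): raises the probability of rare context words.
        p_c = c_counts**alpha / (c_counts**alpha).sum()
    with np.errstate(divide="ignore"):
        pmi = np.log2(p_wc / (p_w * p_c))       # PMI, -inf where the count is 0
    return np.maximum(pmi, 0)                   # clip negative values to 0

M = ppmi(C)                                     # or ppmi(C, alpha=0.75)
print(M[words.index("information"), contexts.index("data")])  # ~0.094
\end{verbatim}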
\end{description}
\begin{remark}
Reweighting loses information about the magnitude of the counts.
\end{remark}
\item[Dimensionality reduction]
Reduce the dimensionality of the embeddings.
@ -255,4 +388,82 @@
\item[Length] Compare the length $|\vec{v}| = \sqrt{\sum_{i=1}^{n} v_i^2}$ of the vectors.
\item[Cosine similarity] $\frac{\vec{w} \cdot \vec{v}}{|\vec{w}| \, |\vec{v}|}$.
\end{description}
\end{description}
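The following is a minimal Python sketch of the two comparisons above, applied to some rows of the term-term count matrix from the PPMI example (names are illustrative).
\begin{verbatim}
import numpy as np

def length(v):
    # Euclidean length |v| = sqrt(sum_i v_i^2).
    return np.sqrt(np.sum(v ** 2))

def cosine(w, v):
    # Cosine similarity: dot product normalized by the vector lengths.
    return np.dot(w, v) / (length(w) * length(v))

# With raw term-term counts, "cherry" is far closer to "strawberry"
# than to "digital" under cosine similarity.
cherry     = np.array([2, 8, 9, 442, 25], dtype=float)
strawberry = np.array([0, 0, 1, 60, 19], dtype=float)
digital    = np.array([1670, 1683, 85, 5, 4], dtype=float)
print(cosine(cherry, strawberry))  # ~0.97
print(cosine(cherry, digital))     # ~0.02
\end{verbatim}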
\subsection{Dense embeddings}
\begin{remark}
Dense embeddings are usually:
\begin{itemize}
\item Easier to process with machine learning algorithms.
\item Able to generalize better than simply counting.
\item Better at handling synonyms.
\end{itemize}
\end{remark}
\begin{description}
\item[Neural language modeling] \marginnote{Neural language modeling}
Use a neural network to predict the next word $w_{n+1}$ given an input sequence $w_{1..n}$. The general flow is the following:
\begin{enumerate}
\item Encode the input words into one-hot vectors ($\mathbb{R}^{|V| \times n}$).
\item Project the input vectors with an embedding matrix $\matr{E} \in \mathbb{R}^{d \times |V|}$ that encodes them into $d$-dimensional vectors.
\item Pass the embedding into the hidden layers.
\item The final layer outputs a probability distribution over the vocabulary ($\mathbb{R}^{|V| \times 1}$).
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.6\linewidth]{./img/_neural_language_model_example.pdf}
\caption{Example of neural language model with a context of $3$ tokens}
\end{figure}
\begin{remark}
The embedding matrix $\matr{E}$ can be used independently to embed words. In fact, by construction, the $i$-th column of $\matr{E}$ represents the embedding of the $i$-th token of the vocabulary.
\end{remark}
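The following is a minimal NumPy sketch of the forward pass described above; the vocabulary size, embedding size, hidden size, non-linearity, and random initialization are arbitrary placeholders.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d, n, h = 1000, 64, 3, 128    # vocabulary, embedding dim, context length, hidden size

E = rng.normal(size=(d, V))      # embedding matrix: column i embeds token i
W = rng.normal(size=(h, n * d))  # hidden layer weights
U = rng.normal(size=(V, h))      # output layer weights

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def forward(token_ids):
    # 1. One-hot encode the n input tokens (|V| x n).
    X = np.zeros((V, n))
    X[token_ids, np.arange(n)] = 1.0
    # 2. Project with E into d-dimensional embeddings and concatenate them.
    emb = (E @ X).reshape(n * d, order="F")
    # 3. Hidden layer.
    hid = np.tanh(W @ emb)
    # 4. Probability distribution over the vocabulary (|V| x 1).
    return softmax(U @ hid)

p_next = forward([12, 7, 345])
print(p_next.shape, p_next.sum())  # (1000,) ~1.0
\end{verbatim}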
\begin{description}
\item[Training]
Given a text corpus, training is done sequentially in a self-supervised manner by sliding a context window over the sequence. At each iteration, the next word is predicted and cross-entropy is used as loss.
\begin{remark}
The initial embedding matrix is usually initialized using statistical methods and not randomly.
\end{remark}
\end{description}
\item[Word2vec] \marginnote{Word2vec}
Based on the idea of using a binary classifier to determine whether a word $c$ is likely to appear near the target word $w$.
Given a context word $c$ and a target word $w$, the problem can be solved using a logistic regressor (i.e., use the dot product to measure vector similarity):
\[
\prob{\texttt{+} | w, c} = \sigma(\vec{c} \cdot \vec{w})
\qquad
\prob{\texttt{-} | w, c} = 1 - \prob{\texttt{+} | w, c}
\]
where $\vec{w} \in \mathbb{R}^{d}$ and $\vec{c} \in \mathbb{R}^{d}$ are the columns of the learned embedding matrix for the words $w$ and $c$, respectively.
Moreover, context words are assumed to be independent; therefore, if the context is a sequence, the probability is computed as follows:
\[ \prob{\texttt{+} | w, c_{1..L}} = \prod_{i=1}^{L} \sigma(\vec{c}_i \cdot \vec{w}) \]
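The following is a minimal Python sketch of this classifier; the embedding vectors are random placeholders and the window length $L = 4$ is arbitrary.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
d = 50
w_vec = rng.normal(size=d)         # target word embedding
c_vecs = rng.normal(size=(4, d))   # embeddings of L = 4 context words

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# P(+ | w, c) for a single context word ...
p_single = sigmoid(c_vecs[0] @ w_vec)
# ... and for the whole window, assuming independent context words.
p_window = np.prod(sigmoid(c_vecs @ w_vec))
print(p_single, p_window)
\end{verbatim}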
\begin{description}
\item[Training]
Given a text corpus, chosen target words and their neighbors are considered positive examples. Negative examples are obtained by randomly sampling other words.
When training, two variants are possible:
\begin{descriptionlist}
\item[Continuous bag-of-words (CBOW)]
Given the context words, predict the target word.
\item[Skip-grams]
Given the target word, predict the (position-independent) context words.
\end{descriptionlist}
\begin{figure}[H]
\centering
\includegraphics[width=0.65\linewidth]{./img/word2vec_alternatives.png}
\end{figure}
\end{description}
\begin{remark}
In practice, Word2vec learns two sets of embeddings $\matr{W} \in \mathbb{R}^{|V| \times d}$ and $\matr{C} \in \mathbb{R}^{|V| \times d}$ for the target and context words, respectively. At the end, they can be averaged, concatenated, or one of the two can be dropped.
\end{remark}
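The following is a minimal Python sketch of one skip-gram training step with negative sampling, consistent with the two embedding matrices of the remark above; the hyper-parameters, the uniform sampling of negatives, and all names are illustrative.
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d, k, lr = 1000, 50, 5, 0.05      # vocabulary, embedding dim, negatives, learning rate
W = 0.01 * rng.normal(size=(V, d))   # target-word embeddings
C = 0.01 * rng.normal(size=(V, d))   # context-word embeddings

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sgns_step(target, context):
    # One positive pair (target, context) plus k randomly sampled negative contexts.
    negatives = rng.integers(0, V, size=k)
    w = W[target]
    # Gradient ascent on log sigma(c_pos . w) + sum_neg log sigma(-c_neg . w).
    g_pos = 1.0 - sigmoid(C[context] @ w)   # pull the positive pair together
    grad_w = g_pos * C[context]
    C[context] += lr * g_pos * w
    for neg in negatives:
        g_neg = -sigmoid(C[neg] @ w)        # push the negative pairs apart
        grad_w += g_neg * C[neg]
        C[neg] += lr * g_neg * w
    W[target] += lr * grad_w

# E.g., one update for target word id 42 observed with context word id 7.
sgns_step(42, 7)
\end{verbatim}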
\end{description}