mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-14 18:51:52 +01:00
Add NLP dense non-contextual embeddings + RNN
New binary files added in this commit:
  src/year2/natural-language-processing/img/_embedding_history.png (142 KiB)
  src/year2/natural-language-processing/img/rnn_lm.png (86 KiB)
  src/year2/natural-language-processing/img/rnn_unrolled.png (60 KiB)
  three further image files (36 KiB, 33 KiB, 102 KiB) whose paths are not shown in the diff
@@ -8,9 +8,10 @@
 \begin{document}

 \makenotesfront
-\input{./sections/_basic_text.tex}
-\input{./sections/_language_models.tex}
-\input{./sections/_classification.tex}
-\input{./sections/_semantics.tex}
+\include{./sections/_basic_text.tex}
+\include{./sections/_language_models.tex}
+\include{./sections/_classification.tex}
+\include{./sections/_semantics.tex}
+\include{./sections/_rnn.tex}

 \end{document}
src/year2/natural-language-processing/sections/_rnn.tex (new file, 71 lines)
@@ -0,0 +1,71 @@
\chapter{Recurrent neural networks}


\section{Architectures}

\subsection{(Elman) recurrent neural network}

\begin{description}
	\item[Recurrent neural network (RNN)] \marginnote{Recurrent neural network (RNN)}
		Neural network that processes a sequential input. At each iteration, an input is fed to the network and the hidden activation is computed from both the current input and the hidden activation of the previous iteration.

		\begin{figure}[H]
			\centering
			\includegraphics[width=0.5\linewidth]{./img/rnn_unrolled.png}
			\caption{RNN unrolled in time}
		\end{figure}

	\item[RNN language model (RNN-LM)] \marginnote{RNN language model (RNN-LM)}
		Given an input word $w^{(t)}$, an RNN-LM does the following (a minimal numerical sketch is given after the list):
		\begin{enumerate}
			\item Compute the embedding $\vec{e}^{(t)}$ of $w^{(t)}$.
			\item Compute the hidden state $\vec{h}^{(t)}$ considering the hidden state $\vec{h}^{(t-1)}$ of the previous step:
				\[ \vec{h}^{(t)} = f(\matr{W}_e \vec{e}^{(t)} + \matr{W}_h \vec{h}^{(t-1)} + b_1) \]
			\item Compute the output vocabulary distribution $\hat{\vec{y}}^{(t)}$:
				\[ \hat{\vec{y}}^{(t)} = \texttt{softmax}(\matr{U}\vec{h}^{(t)} + b_2) \]
			\item Repeat for the next token.
		\end{enumerate}
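		The following is a minimal NumPy sketch of one step, not the reference implementation: the sizes are illustrative and the activation $f$ is assumed to be $\tanh$.
		\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d, h = 10_000, 64, 128              # vocabulary, embedding and hidden sizes (illustrative)

E   = rng.normal(size=(d, V)) * 0.01   # embedding matrix (one column per word)
W_e = rng.normal(size=(h, d)) * 0.01   # input-to-hidden weights
W_h = rng.normal(size=(h, h)) * 0.01   # hidden-to-hidden weights
U   = rng.normal(size=(V, h)) * 0.01   # hidden-to-output weights
b1, b2 = np.zeros(h), np.zeros(V)

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def rnn_lm_step(w_t, h_prev):
    """One RNN-LM step: embed w_t, update the hidden state, predict the next word."""
    e_t = E[:, w_t]                               # 1. embedding of the input word
    h_t = np.tanh(W_e @ e_t + W_h @ h_prev + b1)  # 2. hidden state (f assumed to be tanh)
    y_t = softmax(U @ h_t + b2)                   # 3. distribution over the vocabulary
    return h_t, y_t

h_prev = np.zeros(h)
for w_t in [42, 7, 1093]:                         # toy input word indices
    h_prev, y_t = rnn_lm_step(w_t, h_prev)
\end{verbatim}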

		\begin{figure}[H]
			\centering
			\includegraphics[width=0.4\linewidth]{./img/rnn_lm.png}
		\end{figure}

		\begin{remark}
			RNN-LMs generate the output autoregressively.
		\end{remark}

		\begin{description}
			\item[Training]
				Given the predicted distribution $\hat{\vec{y}}^{(t)}$ and the ground-truth $\vec{y}^{(t)}$ at step $t$, the loss is computed as the cross-entropy:
				\[ \mathcal{L}^{(t)}(\matr{\theta}) = - \sum_{v \in V} \vec{y}_v^{(t)} \log\left( \hat{\vec{y}}_v^{(t)} \right) \]
				As $\vec{y}^{(t)}$ is one-hot, this reduces to the negative log-probability assigned to the correct next word.

				\begin{description}
					\item[Teacher forcing] \marginnote{Teacher forcing}
						During training, as the ground truth is known, the input at each step is the correct token, even if the previous step predicted a wrong one (see the sketch below).

						\begin{remark}
							This allows the model to stay close to the ground truth and avoids completely wrong training steps.
						\end{remark}
				\end{description}
		\end{description}
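		A minimal sketch of a teacher-forced training loop; the model step is a random stand-in (hypothetical names) used only to show where the gold token is fed:
		\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V = 50                                       # toy vocabulary size

def lm_step(h_prev, w_in):
    """Stand-in for an RNN-LM step: returns a new hidden state and a
    next-word distribution (random here, only the training loop matters)."""
    h_t = rng.normal(size=16)
    probs = rng.dirichlet(np.ones(V))
    return h_t, probs

gold = [3, 17, 5, 42, 9]                     # toy training sentence (word indices)

h_t, loss = np.zeros(16), 0.0
for w_in, w_next in zip(gold[:-1], gold[1:]):
    # Teacher forcing: feed the gold token w_in, not the model's previous prediction.
    h_t, probs = lm_step(h_t, w_in)
    loss += -np.log(probs[w_next])           # cross-entropy against the gold next word

# Without teacher forcing, the input would be the model's own previous output,
# e.g. w_in = int(np.argmax(probs)), so early mistakes would compound.
\end{verbatim}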
\end{description}



\section{Applications}

\subsection{Autoregressive generation}

\begin{description}
	\item[Autoregressive generation] \marginnote{Autoregressive generation}
		Repeatedly sample a token and feed it back to the network as the next input.

	\item[Decoding strategy] \marginnote{Decoding strategy}
		Method to select the output token from the output distribution (a sketch of both strategies follows the list). Possible approaches are:
		\begin{descriptionlist}
			\item[Greedy] Select the token with the highest probability.
			\item[Sampling] Randomly sample a token according to the output distribution.
		\end{descriptionlist}
\end{description}
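A minimal sketch of the two decoding strategies on a toy output distribution (names and sizes are illustrative):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
probs = np.array([0.05, 0.60, 0.25, 0.10])    # toy output distribution over 4 tokens

greedy_token  = int(np.argmax(probs))                  # greedy: most probable token
sampled_token = int(rng.choice(len(probs), p=probs))   # sampling: draw according to probs

print(greedy_token, sampled_token)
\end{verbatim}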

@@ -391,7 +391,7 @@
 \end{description}


-\subsection{Dense embeddings}
+\subsection{Dense non-contextual embeddings}

 \begin{remark}
 	Dense embeddings are usually:
@@ -432,38 +432,236 @@
	\end{description}

	\item[Word2vec] \marginnote{Word2vec}
-		Based on the idea of using a binary classifier to determine whether a word $c$ is likely to appear near the target word $w$.
-		Given a context word $c$ and a target word $w$, the problem can be solved using a logistic regressor (i.e., use the dot product to measure vector similarity):
-		\[
-			\prob{\texttt{+} | w, c} = \sigma(\vec{c} \cdot \vec{w})
-			\qquad
-			\prob{\texttt{-} | w, c} = 1 - \prob{\texttt{+} | w, c}
-		\]
-		where $\vec{w} \in \mathbb{R}^{d}$ and $\vec{c} \in \mathbb{R}^{d}$ are the columns of the learned embedding matrix for the words $w$ and $c$, respectively.
-		Two training variants are available in Word2vec:
-		\begin{descriptionlist}
-			\item[Continuous bag-of-words (CBOW)]
-				Given the context words, predict the target word.
-				Moreover, it is assumed that context words are independent, therefore, if the context is a sequence, it is computed as follows:
-				\[ \prob{\texttt{+} | w, c_{1..L}} = \prod_{i=1}^{L} \sigma(\vec{c}_i \cdot \vec{w}) \]
-			\item[Skip-gram]
-				Given the target word, predict the (position independent) context words.
-		\end{descriptionlist}
-		\begin{figure}[H]
-			\centering
-			\includegraphics[width=0.5\linewidth]{./img/word2vec_alternatives.png}
-		\end{figure}
-			\item[Training]
-				Given a text corpus, chosen target words and their neighbors are considered positive examples. Negative examples are obtained by randomly sampling other words.

		Word embedding framework that encodes a target word based on the context words near it.

		\begin{description}
			\item[Skip-gram model] \marginnote{Skip-gram model}
				Given a context word $c$ and a target word $w$, a classifier is trained to determine whether $c$ appears in the context of $w$. After training, the weights of the classifier are used as the word embeddings.

				When training, two variants are possible:
				\begin{descriptionlist}
					\item[Continuous bag-of-words (CBOW)]
						Given the context words, predict the target word.
						\begin{remark}
							In practice, to make optimization easier, the skip-gram model learns two sets of embeddings $\matr{W} \in \mathbb{R}^{|V| \times d}$ and $\matr{C} \in \mathbb{R}^{|V| \times d}$ for the target and context words, respectively. Therefore, it has two sets of parameters $\matr{\theta} = \langle\matr{W}, \matr{C}\rangle$. At the end, they can either be averaged, concatenated, or one can be dropped.
						\end{remark}

					\item[Skip-grams]
						Given the target word, predict the (position independent) context words.
				\end{descriptionlist}

				\begin{figure}[H]
					\centering
					\includegraphics[width=0.65\linewidth]{./img/word2vec_alternatives.png}
				\end{figure}

				\begin{description}
					\item[Training (softmax)]
						Given the target word $w$ and context word $c$, and their embeddings $\vec{w}$ and $\vec{c}$, the skip-gram model computes their similarity as the dot product. The probability that $c$ is in the context of $w$ is then computed through a softmax as:
						\[
							\prob{c | w; \matr{\theta}} = \frac{\exp(\vec{c} \cdot \vec{w})}{\sum_{v \in V} \exp(\vec{v} \cdot \vec{w})}
						\]

						Given a training sequence $w_1, \dots, w_T$ and a context window of size $m$, training iterates over each possible target word $w_t$ and considers the conditional probabilities of its neighbors. The loss is the average negative log-likelihood:
						\[
							\begin{split}
								\mathcal{L}(\matr{\theta}) = -\frac{1}{T} \sum_{t=1}^{T} \sum\limits_{\substack{-m \leq j \leq m\\j \neq 0}} \log\left( \prob{w_{t+j} | w_t; \matr{\theta}} \right)
							\end{split}
						\]

						\begin{remark}
							Due to the normalization factor over the whole vocabulary, using softmax for training is expensive.
						\end{remark}
					\item[Training (negative sampling)] \marginnote{Skip-gram with negative sampling (SGNS)}
						Use a binary logistic regressor as the classifier. The two classes are:
						\begin{itemize}
							\item Context words within the context window (positive label).
							\item Randomly sampled words (negative label).
						\end{itemize}
						The probabilities can be computed as:
						\[
							\prob{\texttt{+} | w, c; \matr{\theta}} = \sigma(\vec{c} \cdot \vec{w})
							\qquad
							\prob{\texttt{-} | w, c; \matr{\theta}} = 1 - \prob{\texttt{+} | w, c; \matr{\theta}}
						\]

						Context words are assumed to be independent, therefore, if the context is a sequence, the probability is computed as follows:
						\[ \prob{\texttt{+} | w, c_{1..L}; \matr{\theta}} = \prod_{i=1}^{L} \sigma(\vec{c}_i \cdot \vec{w}) \]

						At each iteration, the batch is composed of a single positive example and $K$ negative examples randomly sampled according to the weighted unigram probability $\mathcal{P}_\alpha(w) = \frac{\texttt{count}(w)^\alpha}{\sum_{v \in V} \texttt{count}(v)^\alpha}$ ($\alpha$ is used to give rarer words a slightly higher probability).

						Given a batch, the loss is defined as (see the sketch after this list):
						\[
							\begin{split}
								\mathcal{L}(\matr{\theta}) &= -\log\left( \prob{\texttt{+} | w, c^\text{pos}; \matr{\theta}} \prod_{i=1}^{K} \prob{\texttt{-} | w, c^\text{neg}_{i}; \matr{\theta}} \right) \\
								&= - \left( \log\left( \sigma(\vec{c}^\text{pos} \cdot \vec{w}) \right) + \sum_{i=1}^{K} \log\left( \sigma(-\vec{c}^\text{neg}_{i} \cdot \vec{w}) \right) \right)
							\end{split}
						\]
				\end{description}
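				The following is a minimal NumPy sketch (illustrative sizes, random embeddings, $\alpha = 0.75$ as a common choice) contrasting the full-softmax probability with the negative-sampling loss for a single target/positive-context pair:
				\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d, K = 1_000, 50, 5                 # vocabulary size, embedding size, negatives per positive

W = rng.normal(size=(V, d)) * 0.01     # target-word embeddings
C = rng.normal(size=(V, d)) * 0.01     # context-word embeddings

w, c_pos = 3, 17                       # indices of a (target, positive context) pair

# Full softmax: P(c | w) requires normalizing over the whole vocabulary (expensive).
scores = C @ W[w]                      # dot product of w with every candidate context word
p_c_given_w = np.exp(scores - scores.max())
p_c_given_w /= p_c_given_w.sum()
softmax_loss = -np.log(p_c_given_w[c_pos])

# Negative sampling: 1 positive + K negatives drawn from the weighted unigram distribution.
counts = rng.integers(1, 100, size=V)  # toy word counts
p_alpha = counts ** 0.75               # alpha = 0.75 (common choice)
p_alpha = p_alpha / p_alpha.sum()
c_neg = rng.choice(V, size=K, p=p_alpha)

sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
sgns_loss = -(np.log(sigmoid(C[c_pos] @ W[w]))
              + np.sum(np.log(sigmoid(-C[c_neg] @ W[w]))))

print(softmax_loss, sgns_loss)
\end{verbatim}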
		\end{description}


	\item[fastText] \marginnote{fastText}
		Extension of Word2vec based on subwords to deal with out-of-vocabulary words.

		A word is represented both as itself and as a bag of $n$-grams. Both whole words and $n$-grams have an embedding. The overall embedding of a word is the sum of the embeddings of the word itself and of its constituent $n$-grams (a sketch follows the example below).

		\begin{example}
			With $n=3$, the word \texttt{where} is represented both as \texttt{<where>} and \texttt{<wh, whe, her, ere, re>} (\texttt{<} and \texttt{>} are boundary characters).
		\end{example}
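		A minimal sketch of the subword decomposition with a toy in-memory embedding table (the real fastText hashes $n$-grams into a fixed number of buckets; names here are illustrative):
		\begin{verbatim}
import numpy as np

def char_ngrams(word, n=3):
    """Boundary-marked character n-grams plus the whole word, as in the example above."""
    marked = f"<{word}>"
    grams = [marked[i:i + n] for i in range(len(marked) - n + 1)]
    return [marked] + grams

rng = np.random.default_rng(0)
d = 8
# Toy embedding table: one vector per known unit (whole word or n-gram).
table = {g: rng.normal(size=d) for g in char_ngrams("where") + char_ngrams("wherever")}

def embed(word):
    # The embedding of a word is the sum of the embeddings of its known units.
    return sum(table.get(g, np.zeros(d)) for g in char_ngrams(word))

print(embed("where"))
print(embed("whereas"))   # out-of-vocabulary, but shares n-grams with known words
\end{verbatim}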

	\item[GloVe] \marginnote{GloVe}
		Based on the term-term co-occurrence (within a window) probability matrix that indicates, for each word, its probability of co-occurring with the other words.

		Similarly to Word2vec, the objective is to learn two sets of embeddings $\matr{\theta} = \langle\matr{W}, \matr{C}\rangle$ such that their similarity is close to their log-probability of co-occurring. Given the term-term matrix $\matr{X}$, the loss for a target word $w$ and a context word $c$ is defined as (see the sketch below):
		\[ \mathcal{L}(\matr{\theta}) = \left( \vec{c} \cdot \vec{w} - \log( \matr{X}[c, w] ) \right)^2 \]

		\begin{remark}
			In practice, like Word2vec, GloVe learns two sets of embeddings $\matr{W} \in \mathbb{R}^{|V| \times d}$ and $\matr{C} \in \mathbb{R}^{|V| \times d}$ for the target and context words, respectively. At the end, they can either be averaged, concatenated, or one can be dropped. Empirically, for GloVe, it has been observed that the final embedding matrix obtained as $\matr{W} + \matr{C}$ works better.
		\end{remark}
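		A minimal sketch of the per-pair loss above with random embeddings and a made-up co-occurrence probability matrix (the weighting and bias terms of the full GloVe objective are omitted):
		\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
V, d = 100, 16
W = rng.normal(size=(V, d)) * 0.1      # target-word embeddings
C = rng.normal(size=(V, d)) * 0.1      # context-word embeddings

# Toy term-term co-occurrence probability matrix X (each row sums to 1).
X = rng.dirichlet(np.ones(V), size=V)

def glove_pair_loss(c, w):
    """Squared difference between the dot product and the log co-occurrence probability."""
    return (C[c] @ W[w] - np.log(X[c, w])) ** 2

print(glove_pair_loss(3, 17))
\end{verbatim}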

		\begin{example}
			A possible term-term co-occurrence probability table for the words \texttt{ice} and \texttt{steam} is the following:
			\begin{table}[H]
				\centering
				\footnotesize
				\begin{tabular}{ccccc}
					\toprule
					& $k=\texttt{solid}$ & $k=\texttt{gas}$ & $k=\texttt{water}$ & $k=\texttt{fashion}$ \\
					\midrule
					$\prob{k | \texttt{ice}}$ & $1.9 \times 10^{-4}$ & $6.6 \times 10^{-5}$ & $3.0 \times 10^{-3}$ & $1.7 \times 10^{-5}$ \\
					$\prob{k | \texttt{steam}}$ & $2.2 \times 10^{-5}$ & $7.8 \times 10^{-4}$ & $2.2 \times 10^{-3}$ & $1.8 \times 10^{-5}$ \\
					\bottomrule
				\end{tabular}
			\end{table}
			\texttt{ice} is more likely to co-occur with \texttt{solid}, while \texttt{steam} is more likely to co-occur with \texttt{gas}. GloVe uses this information when determining the embeddings.
		\end{example}
\end{description}



\section{Embeddings properties}


\subsection{Embeddings similarity}

\begin{description}
	\item[Context size] \marginnote{Context size}
		The window size used to collect counts or determine context words can result in different embeddings.

		As a general rule, smaller windows tend to capture more syntactic features, while larger windows encode words that are topically related but not necessarily similar.

	\item[Similarity orders] \marginnote{Similarity orders}
		Two words have:
		\begin{descriptionlist}
			\item[First-order co-occurrence]
				If they appear near each other.
			\item[Second-order co-occurrence]
				If they have similar context words.
		\end{descriptionlist}

	\item[Relational similarity] \marginnote{Relational similarity}
		Dense embeddings are able to capture relational meanings.

		\begin{figure}[H]
			\centering
			\includegraphics[width=0.45\linewidth]{./img/embedding_relations.png}
		\end{figure}

		\begin{description}
			\item[Parallelogram model]
				Given the problem ``$a \text{ is to } b \text{ as } a^* \text{ is to } b^*$'' ($a : b :: a^* : b^*$), the parallelogram model solves it as (a sketch follows this block):
				\[ b^* = \arg\min_x \texttt{distance}(x, b-a+a^*) \]

				\begin{example}
					In Word2vec, the following operation between embeddings can be done:
					\[ \texttt{Paris} - \texttt{France} + \texttt{Italy} \approx \texttt{Rome} \]
				\end{example}

				\begin{remark}
					Even if it sometimes works, the parallelogram model is not guaranteed to always produce the expected result.
				\end{remark}
		\end{description}
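		A minimal sketch of the parallelogram model with toy random embeddings (with trained embeddings the query below would be expected to return \texttt{rome}; cosine distance is assumed as the distance measure):
		\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
words = ["paris", "france", "italy", "rome", "cat", "dog"]
emb = {w: rng.normal(size=16) for w in words}   # toy embeddings (random, illustrative only)

def cosine_distance(x, y):
    return 1.0 - (x @ y) / (np.linalg.norm(x) * np.linalg.norm(y))

def analogy(a, b, a_star, exclude=()):
    """Parallelogram model: b* = argmin_x distance(x, b - a + a*)."""
    target = emb[b] - emb[a] + emb[a_star]
    candidates = [w for w in words if w not in exclude]
    return min(candidates, key=lambda w: cosine_distance(emb[w], target))

# a : b :: a* : b*  with  a = france, b = paris, a* = italy
print(analogy("france", "paris", "italy", exclude=("france", "paris", "italy")))
\end{verbatim}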
\end{description}


\subsection{Embeddings analysis}

\begin{description}
	\item[Word history] \marginnote{Word history}
		Trained on corpora from different periods, dense embeddings can show the semantic evolution of a word by analyzing its neighboring embeddings.

		\begin{example}
			\phantom{}
			\begin{figure}[H]
				\centering
				\includegraphics[width=0.7\linewidth]{./img/_embedding_history.png}
				\caption{
					\parbox[t]{0.7\linewidth}{
						Neighboring embeddings of the same words encoded using Word2vec trained on different corpora from different decades
					}
				}
			\end{figure}
		\end{example}

		\begin{example}
			\phantom{}
			\begin{figure}[H]
				\centering
				\includegraphics[width=0.25\linewidth]{./img/embedding_sentiment_history.png}
				\caption{
					\parbox[t]{0.7\linewidth}{
						Sentiment for the word \texttt{terrific} analyzed using the embeddings obtained by training on different corpora
					}
				}
			\end{figure}
		\end{example}

	\item[Cultural bias] \marginnote{Cultural bias}
		Embeddings reflect implicit biases in the training corpus.

		\begin{description}
			\item[Implicit association test]
				Measures how strongly concepts and attributes are associated.

				\begin{example}
					Using the parallelogram model to solve:
					\[ \texttt{father} : \texttt{doctor} :: \texttt{mother} : x \]
					finds as the closest words $x =$ \texttt{homemaker}, \texttt{nurse}, \texttt{receptionist}, \dots
				\end{example}

				\begin{example}
					African-American and Chinese names are closer to unpleasant words compared to European-American names.
				\end{example}

				\begin{example}
					Using the Google News dataset as the training corpus, there is a correlation between the women bias of job embeddings and the relative percentage of women employed in those jobs.

					The women bias for a word $w$ is computed as (see the sketch after this example):
					\[ d_\text{women}(w) - d_\text{men}(w) \]
					where $d_\text{women}(w)$ is the average embedding distance between words representing women (e.g., \texttt{she}, \texttt{female}, \dots) and the word $w$. The same idea is applied to $d_\text{men}(w)$.

					\begin{figure}[H]
						\centering
						\includegraphics[width=0.6\linewidth]{./img/_embedding_women_occupation.pdf}
						\caption{
							\parbox[t]{0.7\linewidth}{
								Relationship between the relative percentage of women in an occupation and the women bias.
							}
						}
					\end{figure}

					\begin{figure}[H]
						\centering
						\includegraphics[width=0.6\linewidth]{./img/embedding_women_occupation_bias.png}
						\caption{
							\parbox[t]{0.7\linewidth}{
								Average women bias vs average women occupation difference over time.
							}
						}
					\end{figure}
				\end{example}
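				A minimal sketch of this bias score with toy random embeddings (cosine distance and the word lists are assumptions for illustration):
				\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
vocab = ["she", "female", "woman", "he", "male", "man", "nurse", "engineer"]
emb = {w: rng.normal(size=16) for w in vocab}   # toy embeddings (random, illustrative only)

women_words = ["she", "female", "woman"]
men_words = ["he", "male", "man"]

def cosine_distance(x, y):
    return 1.0 - (x @ y) / (np.linalg.norm(x) * np.linalg.norm(y))

def avg_distance(word, group):
    return np.mean([cosine_distance(emb[word], emb[g]) for g in group])

def women_bias(word):
    """d_women(word) - d_men(word), as defined above."""
    return avg_distance(word, women_words) - avg_distance(word, men_words)

for job in ["nurse", "engineer"]:
    print(job, women_bias(job))
\end{verbatim}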
		\end{description}
\end{description}