mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Add NLP regex and tokenization
This commit is contained in:
1
src/year2/natural-language-processing/ainotes.cls
Symbolic link
1
src/year2/natural-language-processing/ainotes.cls
Symbolic link
@ -0,0 +1 @@
|
||||
../../ainotes.cls
|
||||
11
src/year2/natural-language-processing/metadata.json
Normal file
11
src/year2/natural-language-processing/metadata.json
Normal file
@ -0,0 +1,11 @@
|
||||
{
|
||||
"name": "Natural Language Processing",
|
||||
"year": 2,
|
||||
"semester": 1,
|
||||
"pdfs": [
|
||||
{
|
||||
"name": null,
|
||||
"path": "nlp.pdf"
|
||||
}
|
||||
]
|
||||
}
|
||||
13
src/year2/natural-language-processing/nlp.tex
Normal file
13
src/year2/natural-language-processing/nlp.tex
Normal file
@ -0,0 +1,13 @@
|
||||
\documentclass[11pt]{ainotes}
|
||||
|
||||
\title{Natural Language Processing}
|
||||
\date{2024 -- 2025}
|
||||
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
|
||||
\def\giturl{{PLACEHOLDER-GIT-URL}}
|
||||
|
||||
\begin{document}
|
||||
|
||||
\makenotesfront
|
||||
\input{./sections/_basic_text.tex}
|
||||
|
||||
\end{document}
|
||||
208
src/year2/natural-language-processing/sections/_basic_text.tex
Normal file
208
src/year2/natural-language-processing/sections/_basic_text.tex
Normal file
@ -0,0 +1,208 @@
|
||||
\chapter{Basic text processing}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[Text normalization]
|
||||
Operations such as:
|
||||
\begin{description}
|
||||
\item[Tokenization] \marginnote{Tokenization}
|
||||
Split a sentence in tokens.
|
||||
|
||||
\begin{remark}
|
||||
Depending on the approach, a token is not always a word.
|
||||
\end{remark}
|
||||
|
||||
\item[Lemmatization/stemming] \marginnote{Lemmatization/stemming}
|
||||
Convert words to their canonical form.
|
||||
|
||||
\begin{example}
|
||||
$\{ \texttt{sang}, \texttt{sung}, \texttt{sings} \} \mapsto \texttt{sing}$
|
||||
\end{example}
|
||||
|
||||
\item[Sentence segmentation] \marginnote{Sentence segmentation}
|
||||
Split a text in sentences.
|
||||
|
||||
\begin{remark}
|
||||
A period does not always signal the end of a sentence.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
\section{Regular expressions}
|
||||
|
||||
\begin{description}
|
||||
\item[Regular expression (regex)] \marginnote{Regular expression (regex)}
|
||||
Formal language to describe string patterns.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Basic operators}
|
||||
|
||||
\begin{description}
|
||||
\item[Disjunction (brackets)]
|
||||
Match a single character between square brackets \texttt{[]}.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/[wW]oodchuck/} matches \texttt{Woodchuck} and \texttt{woodchuck}.
|
||||
\end{example}
|
||||
|
||||
\item[Range]
|
||||
Match a single character from a range of characters or digits.
|
||||
|
||||
\begin{example} \phantom{}\\[-1.5em]
|
||||
\begin{itemize}
|
||||
\item \texttt{/[A-Z]/} matches a single upper case letter.
|
||||
\item \texttt{/[a-z]/} matches a single lower case letter.
|
||||
\item \texttt{/[0-9]/} matches a single digit.
|
||||
\end{itemize}
|
||||
\end{example}
|
||||
|
||||
\item[Negation]
|
||||
Match the negation of a pattern.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/[\textasciicircum A-Z]/} matches a single character that is not an upper case letter.
|
||||
\end{example}
|
||||
|
||||
\item[Disjunction (pipe)]
|
||||
Disjunction of regular expressions separated by \texttt{|}.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/groundhog|woodchuck/} matches \texttt{groundhog} and \texttt{woodchuck}.
|
||||
\end{example}
|
||||
|
||||
|
||||
\item[Wildcards] \phantom{}
|
||||
\begin{description}
|
||||
\item[Optional]
|
||||
A character followed by \texttt{?} can be matched optionally.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/woodchucks?/} matches \texttt{woodchuck} and \texttt{woodchucks}.
|
||||
\end{example}
|
||||
|
||||
\item[Any]
|
||||
\texttt{.} matches any single character (except the newline, in most regex implementations).
|
||||
|
||||
\item[Kleene \texttt{*}]
|
||||
A character followed by \texttt{*} can be matched zero or more times.
|
||||
|
||||
\item[Kleene \texttt{+}]
|
||||
A character followed by \texttt{+} must be matched at least once.
|
||||
|
||||
\item[Counting]
|
||||
A character followed by \texttt{\string{n,m\string}} must be matched from $n$ to $m$ times.
|
||||
|
||||
\begin{example} \phantom{}\\[-1.5em]
|
||||
\begin{itemize}
|
||||
\item \texttt{\string{n\string}} matches exactly $n$ instances of the previous character.
|
||||
|
||||
\item \texttt{\string{n,m\string}} matches from $n$ to $m$ instances of the previous character.
|
||||
|
||||
\item \texttt{\string{n,\string}} matches at least $n$ instances of the previous character.
|
||||
|
||||
\item \texttt{\string{,m\string}} matches at most $m$ instances of the previous character.
|
||||
\end{itemize}
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
\item[Anchors] \phantom{}
|
||||
\begin{description}
|
||||
\item[Start of line]
|
||||
\texttt{\textasciicircum} matches only at the start of line.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/\textasciicircum a/} matches \texttt{\underline{a}} but not \texttt{ba}.
|
||||
\end{example}
|
||||
|
||||
\item[End of line]
|
||||
\texttt{\$} matches only at the end of line.
|
||||
|
||||
\begin{example}
|
||||
\texttt{/a\$/} matches \texttt{\underline{a}} but not \texttt{ab}.
|
||||
\end{example}
|
||||
|
||||
\item[Word boundary]
|
||||
\texttt{\char`\\ b} matches at a word boundary (a zero-width assertion, it does not consume a character).
|
||||
|
||||
\item[Word non-boundary]
|
||||
\texttt{\char`\\ B} matches at a position that is not a word boundary (also a zero-width assertion).
|
||||
\end{description}
|
||||
|
||||
\item[Aliases] \phantom{}
|
||||
\begin{itemize}
|
||||
\item \texttt{\char`\\ d} matches a single digit (same as \texttt{[0-9]}).
|
||||
|
||||
\item \texttt{\char`\\ D} matches a single non-digit (same as \texttt{[\textasciicircum\char`\\ d]}).
|
||||
|
||||
\item \texttt{\char`\\ w} matches a single alphanumeric or underscore character (same as \texttt{[a-zA-Z0-9\_]}).
|
||||
|
||||
\item \texttt{\char`\\ W} matches a single non-alphanumeric and non-underscore character (same as \texttt{[\textasciicircum\char`\\ w]}).
|
||||
|
||||
\item \texttt{\char`\\ s} matches a single whitespace (space or tab).
|
||||
|
||||
\item \texttt{\char`\\ S} matches a single non-whitespace.
|
||||
\end{itemize}
|
||||
|
||||
|
||||
\item[Capture group]
|
||||
Operator to refer to previously matched substrings.
|
||||
|
||||
\begin{example}
|
||||
In the regex \texttt{/the (.*)er they were, the \char`\\ 1er they will be/}, \texttt{\char`\\ 1} should match the same content matched by \texttt{(.*)}.
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Tokenization}
|
||||
|
||||
\begin{description}
|
||||
\item[Lemma] \marginnote{Lemma}
|
||||
Words with the same stem and roughly the same semantic meaning.
|
||||
\begin{example}
|
||||
\texttt{cat} and \texttt{cats} are the same lemma.
|
||||
\end{example}
|
||||
|
||||
\item[Wordform] \marginnote{Wordform}
|
||||
Orthographic appearance of a word.
|
||||
\begin{example}
|
||||
\texttt{cat} and \texttt{cats} do not have the same wordform.
|
||||
\end{example}
|
||||
|
||||
\item[Vocabulary] \marginnote{Vocabulary}
|
||||
Collection of text elements, each indexed by an integer.
|
||||
|
||||
\begin{remark}
|
||||
To reduce the size of a vocabulary, words can be reduced to lemmas.
|
||||
\end{remark}
|
||||
|
||||
\item[Type / Wordtype] \marginnote{Type / Wordtype}
|
||||
Element of a vocabulary (i.e., wordforms in the vocabulary).
|
||||
|
||||
\item[Token] \marginnote{Token}
|
||||
Instance of a type in a text.
|
||||
|
||||
\item[Genre] \marginnote{Genre}
|
||||
Topic of a text corpus (e.g., short social media comments, books, Wikipedia pages, \dots).
|
||||
\end{description}
|
||||
|
||||
\begin{remark}[Herdan's law]
|
||||
Given a corpus with $N$ tokens, a vocabulary $V$ over that corpus roughly has size:
|
||||
\[ \left\vert V \right\vert = kN^\beta \]
|
||||
where the typical values are $10 \leq k \leq 100$ and $0.4 \leq \beta \leq 0.6$.
|
||||
\end{remark}
|
||||
|
||||
\begin{description}
|
||||
\item[Stopwords] \marginnote{Stopwords}
|
||||
Frequent words that can be dropped.
|
||||
|
||||
\begin{remark}
|
||||
If semantics is important, stopwords should be kept. LLMs keep stopwords.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
\begin{remark}
|
||||
For speed, simple tokenizers use regex.
|
||||
\end{remark}
|
||||
Reference in New Issue
Block a user