diff --git a/src/year2/natural-language-processing/img/asp_arch.png b/src/year2/natural-language-processing/img/asp_arch.png
new file mode 100644
index 0000000..194c0b4
Binary files /dev/null and b/src/year2/natural-language-processing/img/asp_arch.png differ
diff --git a/src/year2/natural-language-processing/img/nlu_arch.png b/src/year2/natural-language-processing/img/nlu_arch.png
new file mode 100644
index 0000000..f8f877e
Binary files /dev/null and b/src/year2/natural-language-processing/img/nlu_arch.png differ
diff --git a/src/year2/natural-language-processing/img/spectrogram.png b/src/year2/natural-language-processing/img/spectrogram.png
new file mode 100644
index 0000000..ae1409c
Binary files /dev/null and b/src/year2/natural-language-processing/img/spectrogram.png differ
diff --git a/src/year2/natural-language-processing/img/spoken_dialog_system.png b/src/year2/natural-language-processing/img/spoken_dialog_system.png
new file mode 100644
index 0000000..c9ce604
Binary files /dev/null and b/src/year2/natural-language-processing/img/spoken_dialog_system.png differ
diff --git a/src/year2/natural-language-processing/img/tts_arch.png b/src/year2/natural-language-processing/img/tts_arch.png
new file mode 100644
index 0000000..8a1bc0c
Binary files /dev/null and b/src/year2/natural-language-processing/img/tts_arch.png differ
diff --git a/src/year2/natural-language-processing/img/waveform.png b/src/year2/natural-language-processing/img/waveform.png
new file mode 100644
index 0000000..cff48bf
Binary files /dev/null and b/src/year2/natural-language-processing/img/waveform.png differ
diff --git a/src/year2/natural-language-processing/nlp.tex b/src/year2/natural-language-processing/nlp.tex
index ad82d12..6eb5c98 100644
--- a/src/year2/natural-language-processing/nlp.tex
+++ b/src/year2/natural-language-processing/nlp.tex
@@ -18,4 +18,9 @@
     \include{./sections/_model_efficiency.tex}
     \include{./sections/_llm_usage.tex}
 
+    \appendix
+    \include{./sections/_task_oriented_dialog_system.tex}
+    \include{./sections/_speech.tex}
+    \include{./sections/_italian_llm.tex}
+
 \end{document}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_italian_llm.tex b/src/year2/natural-language-processing/sections/_italian_llm.tex
new file mode 100644
index 0000000..21ee5f3
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_italian_llm.tex
@@ -0,0 +1,51 @@
+\chapter{Italian LLMs}
+
+\begin{remark}
+    The advantages of pre-training from scratch are:
+    \begin{itemize}
+        \item Full control over the training data.
+        \item Better fertility of the tokenizer (i.e., fewer subword tokens per word in the target language).
+    \end{itemize}
+\end{remark}
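+
+Fertility can be measured as the average number of subword tokens produced per word: the lower, the better for the target language. A minimal sketch with a HuggingFace tokenizer (the checkpoint name and the whitespace-based word split are illustrative assumptions, not from the course material):
+\begin{verbatim}
+from transformers import AutoTokenizer
+
+def fertility(tokenizer, texts):
+    # Subword tokens produced per whitespace-separated word.
+    n_tokens = sum(len(tokenizer.tokenize(t)) for t in texts)
+    n_words = sum(len(t.split()) for t in texts)
+    return n_tokens / n_words
+
+# Indicative checkpoint; any Italian-capable tokenizer works.
+tok = AutoTokenizer.from_pretrained("sapienzanlp/Minerva-3B-base-v1.0")
+print(fertility(tok, ["Il gatto dorme sopra il tavolo."]))
+\end{verbatim}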
+
+\begin{description}
+    \item[Minerva] \marginnote{Minerva}
+    Language model pre-trained on the Italian language.
+
+    \begin{remark}
+        Minerva's pre-training corpus is actually composed of both Italian and English datasets.
+
+        Initially, English was included for benchmarking due to the lack of Italian benchmarks. However, it is also useful for tasks that are intrinsically in English (e.g., coding).
+    \end{remark}
+
+    \begin{remark}
+        Some training datasets were automatically translated into Italian. Others were adapted from existing Italian ones (e.g., transforming a question answering dataset into cloze form).
+    \end{remark}
+\end{description}
+
+
+\begin{description}
+    \item[FENICE metric] \marginnote{FENICE metric}
+    Factuality metric for summarization. It works as follows:
+    \begin{enumerate}
+        \item Extract claims from the summary with an ad-hoc LLM.
+        \item Align each claim with the original document, assigning a positive score (if the document supports the claim) or a negative one (if it contradicts it).
+        \item Perform coreference resolution to unify entities across claims.
+    \end{enumerate}
+
+    \item[ALERT benchmark] \marginnote{ALERT benchmark}
+    Benchmark to test the safety of an LLM based on 32 risk categories. The test data are created as follows:
+    \begin{enumerate}
+        \item Filter the ``\textit{Helpfulness \& Harmlessness-RLHF}'' dataset of \textit{Anthropic} by considering, for each example, only the first prompt and only red-team attacks.
+        \item Use templates to automatically generate additional prompts.
+        \item Augment the prompts by formatting them as adversarial attacks. Examples of attacks are:
+        \begin{descriptionlist}
+            \item[Prefix/suffix injection]
+            Prepend or append an adversarial prompt (e.g., \texttt{disregard the instructions above and \dots}).
+            \item[Token manipulation]
+            Alter or invert a small fraction of tokens in the prompt (the idea is to use a prompt that is less likely to have already been seen in the alignment datasets).
+            \item[Jailbreaking]
+            Use more complex strategies (e.g., role playing).
+        \end{descriptionlist}
+    \end{enumerate}
+\end{description}
diff --git a/src/year2/natural-language-processing/sections/_speech.tex b/src/year2/natural-language-processing/sections/_speech.tex
new file mode 100644
index 0000000..9e7343c
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_speech.tex
@@ -0,0 +1,195 @@
+\chapter{Speech processing}
+
+
+\section{Audio representation}
+
+\begin{description}
+    \item[Sound/soundwave] \marginnote{Soundwave}
+    Vibration that travels through a medium. It is modulated by:
+    \begin{descriptionlist}
+        \item[Pitch] Frequency of the vibrations.
+        \item[Loudness] Amplitude of the vibrations.
+    \end{descriptionlist}
+
+    \item[Waveform] \marginnote{Waveform}
+    Representation of a soundwave. It is described by:
+    \begin{description}
+        \item[Frequency]
+        Represents the pitch of the sound.
+
+        \item[Period]
+        Time between two consecutive peaks of the wave (i.e., related to frequency as $f=\frac{1}{T}$).
+
+        \item[Amplitude]
+        Represents the loudness of the sound (i.e., the air pressure).
+
+        \begin{remark}
+            In practice, amplitude is usually converted to decibels, as the human auditory system perceives loudness on a roughly logarithmic scale.
+        \end{remark}
+    \end{description}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/waveform.png}
+    \end{figure}
+
+    \item[Signal] \marginnote{Signal}
+    Representation of information.
+
+    \begin{remark}
+        In sound processing, the waveform itself is the signal.
+    \end{remark}
+
+    \begin{description}
+        \item[Analog signal] \marginnote{Analog signal}
+        Waveform as-is in the real world.
+
+        \item[Digital signal] \marginnote{Digital signal}
+        Sampled (i.e., measured at uniform time steps) and quantized (i.e., with discretized values) version of an analog waveform.
+    \end{description}
+
+    \item[Fourier transform] \marginnote{Fourier transform}
+    Method to decompose a continuous signal into its constituent sine waves.
+
+    Given a continuous signal $x(t)$, its Fourier transform is:
+    \[ X(f) = \int_{-\infty}^{+\infty} x(t) e^{-j2\pi ft} \,dt \]
+    where $X(f)$ indicates how much of the frequency $f$ exists in $x(t)$.
+
+    \begin{description}
+        \item[Discrete Fourier transform (DFT)] \marginnote{Discrete Fourier transform (DFT)}
+        Fourier transform for digital signals.
+
+        Given a discrete signal $x[n]$, its DFT is:
+        \[ X[k] = \sum_{n=0}^{N-1} x[n]e^{-\frac{j2\pi kn}{N}} \]
+        where $k$ is the discrete frequency and $N$ is the number of samples.
+
+        \item[Fast Fourier transform (FFT)] \marginnote{Fast Fourier transform (FFT)}
+        Efficient implementation of the DFT for values of $N$ that are powers of $2$ (see the sketch below).
+
+        \item[Short-time Fourier transform (STFT)] \marginnote{Short-time Fourier transform (STFT)}
+        FFT computed on short time windows of the sound signal.
+
+        \begin{remark}
+            This method preserves time information by using a fixed frame size.
+        \end{remark}
+
+        \begin{description}
+            \item[Spectrogram] \marginnote{Spectrogram}
+            Result of the STFT that shows how the frequencies change over time.
+        \end{description}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/spectrogram.png}
+        \end{figure}
+
+        \item[Inverse STFT (ISTFT)] \marginnote{Inverse STFT (ISTFT)}
+        Converts a time-frequency representation of sound (i.e., a spectrogram) back to its sound signal.
+
+        \begin{remark}
+            This makes it possible to manipulate a signal in its frequency domain (STFT) and then convert it back (ISTFT).
+        \end{remark}
+    \end{description}
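+
+    As a sanity check of the DFT definition above, the following minimal NumPy sketch (not part of the original notes) implements the formula directly and compares it against the library FFT:
+\begin{verbatim}
+import numpy as np
+
+def dft(x):
+    # Naive O(N^2) DFT: X[k] = sum_n x[n] * exp(-j*2*pi*k*n / N)
+    N = len(x)
+    n = np.arange(N)
+    k = n.reshape((N, 1))
+    return np.sum(x * np.exp(-2j * np.pi * k * n / N), axis=1)
+
+x = np.random.randn(8)                      # toy digital signal, N = 8
+assert np.allclose(dft(x), np.fft.fft(x))   # the FFT computes the same values
+\end{verbatim}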
+
+    \item[Mel-scaled spectrogram] \marginnote{Mel-scaled spectrogram}
+    Spectrogram where frequencies are mapped to the mel scale (i.e., lower frequencies are more fine-grained while higher frequencies are more compressed, to match the logarithmic sound perception of humans).
+
+    \item[Audio features] \marginnote{Audio features}
+    Representation of a sound signal extracted from the waveform or spectrogram (see the sketch below).
+\end{description}
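+
+A common choice of audio features is the log mel-spectrogram described above. A minimal sketch with \texttt{librosa} (the file name and the parameter values are illustrative assumptions):
+\begin{verbatim}
+import librosa
+
+# Load and resample the waveform ("speech.wav" is a placeholder).
+y, sr = librosa.load("speech.wav", sr=16000)
+
+mel = librosa.feature.melspectrogram(
+    y=y, sr=sr,
+    n_fft=1024,       # STFT window size
+    hop_length=256,   # stride between frames
+    n_mels=80,        # mel frequency bins
+)
+log_mel = librosa.power_to_db(mel)  # dB scale, matching human perception
+print(log_mel.shape)                # (n_mels, n_frames)
+\end{verbatim}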
+
+
+
+\section{Tasks}
+
+\begin{description}
+    \item[Automatic speech recognition (ASR)]
+    Convert a sound signal into text.
+
+    \begin{example}
+        Use an RNN/transformer encoder-decoder architecture. A sound signal is processed as follows:
+        \begin{enumerate}
+            \item Compute the audio features from the waveform (e.g., mel-spectrogram).
+            \item Pass the computed features through the encoder.
+            \item Use the decoder to generate the output text autoregressively, conditioned on the encoder output.
+        \end{enumerate}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.5\linewidth]{./img/asp_arch.png}
+        \end{figure}
+    \end{example}
+
+    \item[Speech enhancement]
+    Clean the sound signal (e.g., remove noise).
+
+    \item[Speech separation]
+    Separate the different sources in a sound signal (e.g., differentiate speakers).
+
+    \item[Text-to-speech]
+    Convert text into a sound signal.
+
+    \begin{example}
+        Use an encoder-decoder architecture. A text is processed as follows:
+        \begin{enumerate}
+            \item Use the encoder to embed the input text into a representation that encodes linguistic features (e.g., pronunciation, rhythm, \dots).
+            \item Use the decoder to predict a mel-spectrogram.
+            \item Use a neural vocoder to convert the mel-spectrogram into an audio waveform.
+        \end{enumerate}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.9\linewidth]{./img/tts_arch.png}
+        \end{figure}
+    \end{example}
+
+    \item[Speaker diarization]
+    Determine who spoke and when.
+
+    \item[Speech emotion recognition]
+    Recognize emotions from the sound signal.
+
+    \item[Neural network explanation]
+    Use speech to explain another speech model.
+\end{description}
+
+
+
+\section{Speech foundation models}
+
+
+\begin{description}
+    \item[Speech foundation model (SFM)] \marginnote{Speech foundation model (SFM)}
+    Transformer-based model pre-trained on speech. A common architecture is composed of:
+    \begin{descriptionlist}
+        \item[Feature extractor]
+        Converts the waveform into a low-dimensional representation (e.g., by using convolutions).
+        \item[Encoder]
+        Computes contextual embeddings from the sound features.
+    \end{descriptionlist}
+
+    \begin{remark}
+        SFMs take raw waveforms as input and are more robust in dealing with speech variability due to diverse speakers, environments, noise, \dots
+    \end{remark}
+
+    \begin{remark}
+        An SFM can either be fine-tuned for a specific task or used as a feature extractor for other models (see the sketch below).
+    \end{remark}
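+
+    A minimal sketch of the feature-extractor usage with HuggingFace \texttt{transformers} (the checkpoint name and the random input are illustrative assumptions):
+\begin{verbatim}
+import torch
+from transformers import AutoFeatureExtractor, AutoModel
+
+# wav2vec 2.0 is one example of a publicly available SFM.
+name = "facebook/wav2vec2-base"
+extractor = AutoFeatureExtractor.from_pretrained(name)
+model = AutoModel.from_pretrained(name)
+
+waveform = torch.randn(16000)  # placeholder: 1 s of audio at 16 kHz
+inputs = extractor(waveform.numpy(), sampling_rate=16000,
+                   return_tensors="pt")
+with torch.no_grad():
+    features = model(**inputs).last_hidden_state
+print(features.shape)  # (1, n_frames, hidden_size): contextual embeddings
+\end{verbatim}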
+
+    \item[Multimodal model] \marginnote{Multimodal model}
+    Model able to handle multiple modalities (e.g., speech and text).
+
+    The main considerations to take into account when working with multimodal models are:
+    \begin{descriptionlist}
+        \item[Representation]
+        Decide how to encode different modalities into the same embedding space.
+        \item[Fusion]
+        Combine information from different modalities.
+        \item[Alignment]
+        Link corresponding elements (e.g., in time or by meaning) across different modalities.
+        \item[Translation]
+        Map information from one modality to another.
+        \item[Co-learning]
+        Leverage shared information between modalities for training.
+    \end{descriptionlist}
+\end{description}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex b/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex
new file mode 100644
index 0000000..1eec077
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex
@@ -0,0 +1,213 @@
+\chapter{Task-oriented dialog systems}
+
+
+\section{Human dialogs}
+
+\begin{description}
+    \item[Natural language dialog] \marginnote{Natural language dialog}
+    Sequence of utterances (i.e., sentences) between two or more participants, each taking a turn.
+
+    \item[Turn-taking problem] \marginnote{Turn-taking problem}
+    Determine when the turn of another participant has ended.
+
+    \item[Speech/dialog act] \marginnote{Speech/dialog act}
+    Indicates the type of utterance.
+
+    \begin{example}
+        Yes-no question, declarative question, statement, appreciation, yes answer, \dots
+    \end{example}
+
+    \begin{description}
+        \item[Adjacency pairs]
+        Speech acts that commonly appear together.
+
+        \begin{example}
+            Question $\rightarrow$ answer.
+        \end{example}
+
+        \item[Subdialog]
+        Dialog opened and closed within another dialog.
+
+        \begin{example}
+            Correction subdialog, clarification subdialog, \dots
+        \end{example}
+    \end{description}
+
+    \item[Dialog slot] \marginnote{Dialog slot}
+    Relevant entities and properties of an utterance.
+
+    \begin{description}
+        \item[Filler] Value assigned to a slot.
+    \end{description}
+
+    \item[Conversation initiative] \marginnote{Conversation initiative}
+    Who leads the dialog.
+
+    \begin{description}
+        \item[User initiative]
+        The user asks questions and the system responds (e.g., FAQ).
+
+        \item[System initiative]
+        The system asks questions to the user (e.g., form completion).
+
+        \item[Mixed initiative]
+        Both the user and the system can ask questions.
+    \end{description}
+
+    \item[Types of dialog] \marginnote{Types of dialog}
+    \phantom{}
+    \begin{description}
+        \item[Information seeking] To retrieve information.
+        \item[Task-oriented] Dialog to achieve a goal.
+        \item[Argumentative] Argument in support of or against an opinion.
+        \item[Explanatory] Teacher-student type of dialog.
+        \item[Recommendation] Persuasion dialog.
+        \item[Chit-chat] Free conversation.
+    \end{description}
+\end{description}
+
+
+
+\section{Task-oriented dialogs}
+
+
+\subsection{Architectures}
+
+\begin{description}
+    \item[Traditional dialog system] \marginnote{Traditional dialog system}
+    The main components of an artificial dialog system are:
+    \begin{description}
+        \item[Natural language understanding (NLU)] \marginnote{Natural language understanding (NLU)}
+        Extracts the relevant information, such as dialog acts and slot-fillers, from the utterance.
+
+        \begin{remark}
+            This task can be seen as a named entity recognition problem.
+        \end{remark}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.45\linewidth]{./img/nlu_arch.png}
+            \caption{Example of neural architecture for slot filling}
+        \end{figure}
+
+        \item[Dialog state tracker (DST)] \marginnote{Dialog state tracker (DST)}
+        Maintains the history of the dialog. This component should also have access to a knowledge-base.
+
+        \item[Dialog policy manager] \marginnote{Dialog policy manager}
+        Produces the dialog acts that compose the response from the output of the DST.
+
+        \item[Natural language generation (NLG)] \marginnote{Natural language generation (NLG)}
+        Produces a natural language utterance from the dialog acts produced by the dialog manager.
+    \end{description}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/spoken_dialog_system.png}
+        \caption{Example of components for a spoken dialog system}
+    \end{figure}
+
+
+    \item[LLM for dialog system] \marginnote{LLM for dialog system}
+    Use a language model that takes as input the utterance and directly produces a response.
+\end{description}
+
+
+\subsection{Dataset}
+
+\begin{description}
+    \item[MultiWOZ] \marginnote{MultiWOZ}
+    Collection of human-human conversations over multiple domains and topics, annotated with dialog states (i.e., turns), slots, and acts.
+
+    The dataset also defines an ontology for slots and a knowledge-base.
+
+    \begin{remark}
+        Human annotations are determined by agreement between multiple annotators.
+    \end{remark}
+
+    \begin{remark}
+        The type of dialogs in the dataset significantly affects the resulting dialog system.
+
+        \indenttbox
+        \begin{example}
+            The Wizard-of-Oz collection is a part of MultiWOZ that consists of question-answer dialogs between a user and a wizard. Dialogs produced based on these might turn out to be too artificial.
+        \end{example}
+    \end{remark}
+\end{description}
+
+
+
+\section{Research topics}
+
+
+\subsection{LLM domain portability}
+
+\begin{description}
+    \item[Domain portability] \marginnote{Domain portability}
+    Adapt a model to a new domain (i.e., knowledge-base).
+
+    Possible approaches are:
+    \begin{descriptionlist}
+        \item[Fine-tuning]
+        Fine-tune the LLM with the new knowledge-base.
+
+        \begin{remark}
+            This approach is susceptible to the catastrophic forgetting problem.
+        \end{remark}
+
+        \item[Prompting]
+        Embed the new knowledge-base into the prompt of the LLM.
+
+        \begin{remark}
+            This approach risks hallucinations, is constrained by the context length, and is computationally inefficient.
+        \end{remark}
+
+        \item[Function calling]
+        Let the LLM query the knowledge-base when needed (see the sketch after this list).
+
+        \begin{remark}
+            This approach requires more complex prompts and not all LLMs support it.
+        \end{remark}
+    \end{descriptionlist}
+
+    \begin{remark}
+        Experimental results show that function calling works better than embedding the KB in the prompt. It is also more effective as the KB becomes bigger.
+    \end{remark}
+\end{description}
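+
+A minimal sketch of the function-calling approach, assuming an OpenAI-style chat API (the model name, tool schema, and toy KB are illustrative assumptions):
+\begin{verbatim}
+import json
+from openai import OpenAI
+
+def query_kb(entity: str) -> str:
+    # Stand-in for a query against the domain knowledge-base.
+    kb = {"hotel_alpha": "3 rooms available"}
+    return kb.get(entity, "not found")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "query_kb",
+        "description": "Look up an entity in the knowledge-base",
+        "parameters": {
+            "type": "object",
+            "properties": {"entity": {"type": "string"}},
+            "required": ["entity"],
+        },
+    },
+}]
+
+client = OpenAI()
+messages = [{"role": "user",
+             "content": "Does hotel_alpha have rooms available?"}]
+response = client.chat.completions.create(
+    model="gpt-4o-mini", messages=messages, tools=tools)
+call = response.choices[0].message.tool_calls[0]
+result = query_kb(**json.loads(call.function.arguments))
+# The result is then passed back to the LLM to produce the final reply.
+\end{verbatim}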
+
+
+\subsection{LLM pragmatics}
+
+\begin{description}
+    \item[Pragmatics] \marginnote{Pragmatics}
+    Ability to adapt a conversation based on the context.
+
+    \item[Proactivity] \marginnote{Proactivity}
+    Ability to provide useful but not explicitly requested information.
+
+    \begin{remark}
+        An LLM can be made more proactive by prompting or fine-tuning.
+    \end{remark}
+\end{description}
+
+
+\subsection{LLM for dialog generation}
+
+\begin{description}
+    \item[Automatic dialog generation] \marginnote{Automatic dialog generation}
+    Use an LLM to generate and annotate dialogs to create a synthetic dataset. A possible approach is based on the following steps:
+    \begin{descriptionlist}
+        \item[Generation]
+        Use the LLM to generate a dialog. Possible approaches are:
+        \begin{descriptionlist}
+            \item[One-pass] Prompt the LLM to generate a dialog based on a few references.
+            \item[Interactive] Produce a dialog by conversing with the model.
+            \item[Teacher-student] Let two LLMs converse.
+        \end{descriptionlist}
+
+        \item[Annotation]
+        Prompt the LLM to annotate the generated dialog based on some schema.
+
+        \item[Evaluation]
+        Evaluate the generated dialogs based on human judgment.
+    \end{descriptionlist}
+\end{description}
\ No newline at end of file