Add NLP seminars

This commit is contained in:
2024-12-14 21:51:56 +01:00
parent 7f23838b5e
commit 3ff3fa8672
10 changed files with 464 additions and 0 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 76 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 386 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 148 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 97 KiB

View File

@ -18,4 +18,9 @@
\include{./sections/_model_efficiency.tex}
\include{./sections/_llm_usage.tex}
\appendix
\include{./sections/_task_oriented_dialog_system.tex}
\include{./sections/_speech.tex}
\include{./sections/_italian_llm.tex}
\end{document}

View File

@ -0,0 +1,51 @@
\chapter{Italian LLMs}
\begin{remark}
Advantages of pre-training from scratch are:
\begin{itemize}
\item Having full control on the training data.
\item Improve the fertility of the tokenizer.
\end{itemize}
\end{remark}
\begin{description}
\item[Minerva] \marginnote{Minerva}
Language model pre-trained on the Italian language.
\begin{remark}
Minerva's pre-training corpus is actually composed by both Italian and English datasets.
Initially, English was used for benchmarking due to the lack of Italian benchmarks. However, it is also useful for tasks intrinsically in English (e.g., coding).
\end{remark}
\begin{remark}
Some training datasets were automatically translated in Italian. Some others were adapted from existing Italian ones (e.g., transform a question answering dataset into a cloze form).
\end{remark}
\end{description}
\begin{description}
\item[FENICE metric] \marginnote{FENICE metric}
Factuality metric for summarization. It works as follows:
\begin{enumerate}
\item Extract claims from the summary with an ad-hoc LLM.
\item Align each claim with the original document with positive (if in support) and negative (if against) scores.
\item Perform co-reference resolution to unify entities across claims.
\end{enumerate}
\item[ALERT benchmark] \marginnote{ALERT benchmark}
Benchmark to test the safeness of an LLM based on 32 risk categories. The testing data are created as follows:
\begin{enumerate}
\item Filter the ``\textit{Helpfulness \& Harmlessness-RLHF}'' dataset of \textit{Anthropic} by considering for each example the first prompt and red team attacks only.
\item Use templates to automatically generate additional prompts.
\item Augment the prompts by formatting them as adversarial attacks. Examples of attacks are:
\begin{descriptionlist}
\item[Prefix/suffix injection]
Prepend or append an adversarial prompt (e.g., \texttt{disregard the instructions above and \dots}).
\item[Token manipulation]
Alter or invert a small fraction of tokens in the prompt (the idea is to use a prompt that is less likely to have been already seen in the alignment datasets).
\item[Jailbreaking]
Use more complex strategies (e.g., role playing).
\end{descriptionlist}
\end{enumerate}
\end{description}

View File

@ -0,0 +1,195 @@
\chapter{Speech processing}
\section{Audio representation}
\begin{description}
\item[Sound/soundwave] \marginnote{Soundwave}
Vibration that travels though a medium. It is modulated by:
\begin{descriptionlist}
\item[Pitch] Frequency of the vibrations.
\item[Loudness] Amplitude of the vibrations.
\end{descriptionlist}
\item[Waveform] \marginnote{Waveform}
Representation of a soundwave. It is described by:
\begin{description}
\item[Frequency]
Represents the pitch of the sound.
\item[Period]
Distance between two peaks of the sound (i.e., correlated to frequency as $f=\frac{1}{T}$).
\item[Amplitude]
Represents the loudness of the sound (i.e., the air pressure).
\begin{remark}
In practice, amplitude is usually converted in decibels due to the fact that the human auditory system perceives sound closer to a logarithmic scale.
\end{remark}
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/waveform.png}
\end{figure}
\item[Signal] \marginnote{Signal}
Representation of information.
\begin{remark}
In sound processing, the waveform itself is the signal.
\end{remark}
\begin{description}
\item[Analog signal] \marginnote{Analog signal}
Waveform as-is in the real world.
\item[Digital signal] \marginnote{Digital signal}
Sampled (i.e., measure uniform time steps) and quantized (discretize values) version of an analog waveform.
\end{description}
\item[Fourier transform] \marginnote{Fourier transform}
Method to decompose a continuous signal in its constituent sin waves.
Given a continuous signal $x(t)$, its Fourier transform is:
\[ X(f) = \int_{-\infty}^{+\infty} x(t) e^{-j2\pi ft} \,dt \]
where $X(f)$ indicates how much of the frequency $f$ exists in $x(t)$.
\begin{description}
\item[Discrete Fourier transform (DFT)] \marginnote{Discrete Fourier transform (DFT)}
Fourier transform for digital signals.
Given a discrete signal $x[n]$, its DFT is:
\[ X[k] = \sum_{n=0}^{N-1} x[n]e^{-\frac{j2\pi kn}{N}} \]
where $k$ is the discrete frequency and $N$ is the number of samples.
\item[Fast Fourier transform (FFT)] \marginnote{Fast Fourier transform (FFT)}
Efficient implementation of DFT for $N$s that are power of $2$.
\item[Short-time Fourier transform (STFT)] \marginnote{Short-time Fourier transform (STFT)}
FFT computed on short time windows of the sound signal.
\begin{remark}
This method allows preserving time information by using a fixed frame size.
\end{remark}
\begin{description}
\item[Spectrogram] \marginnote{Spectrogram}
Result of STFT that shows how the frequencies change over time.
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/spectrogram.png}
\end{figure}
\item[Inverse STFT (ISTFT)] \marginnote{Inverse STFT (ISTFT)}
Converts a time-frequency representation of sound (i.e., spectrogram) to its sound signal.
\begin{remark}
This allows to manipulate a signal in its frequency domain (STFT) and then convert it back (ISTFT).
\end{remark}
\end{description}
\item[Mel-scaled spectrogram] \marginnote{Mel-scaled spectrogram}
Spectrogram where frequencies are mapped to the mel scale (i.e., lower frequencies are more fine-grained while higher frequencies are more compressed, to match the human logarithmic sound perception).
\item[Audio features] \marginnote{Audio features}
Representation of a sound signal extracted from the waveform or spectrogram.
\end{description}
\section{Tasks}
\begin{description}
\item[Automatic speech recognition (ASP)]
Convert a sound signal into text.
\begin{example}
Use an RNN/transformer encoder-decoder architecture. A sound signal is processed as follows:
\begin{enumerate}
\item Compute the audio features from the waveform (e.g., mel-spectrogram).
\item Pass the computed features through the encoder.
\item Use the decoder to generate the output text autoregressively conditioned on the encoder.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/asp_arch.png}
\end{figure}
\end{example}
\item[Speech enhancement]
Clear the sound signal.
\item[Speech separation]
Separate the different sources in a sound signal (e.g., differentiate speakers).
\item[Text-to-speech]
Convert text into a sound signal.
\begin{example}
Use an encoder-decoder architecture. A text is processed as follows:
\begin{enumerate}
\item Use the encoder to embed the input text into a representation that encodes linguistic features (e.g., pronunciation, rhythm, \dots).
\item Use the decoder to predict a mel-spectrogram.
\item Use a neural vocoder to convert the mel-spectrogram into an audio waveform.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/tts_arch.png}
\end{figure}
\end{example}
\item[Speaker diarization]
Determine the moment and the person who spoke.
\item[Speech emotion recognition]
Recognize emotions from the sound signal.
\item[Neural network explanation]
Use speech to explain another speech.
\end{description}
\section{Speech foundation models}
\begin{description}
\item[Speech foundation model (SFM)] \marginnote{Speech foundation model (SFM)}
Transformer-based model pre-trained on speech. A common architecture is composed by:
\begin{descriptionlist}
\item[Feature extractor]
Converts the waveform into a low-dimensional representation (e.g., by using convolutions).
\item[Encoder]
Computes contextual embeddings from the sound features.
\end{descriptionlist}
\begin{remark}
SFM takes as input raw waveforms and are more robust in dealing with speech variability due to diverse speakers, environment, noise, \dots
\end{remark}
\begin{remark}
A SFM can be either fine-tuned for a specific task or used as a feature extractor for other models.
\end{remark}
\item[Multimodal model] \marginnote{Multimodal model}
Model able to handle multiple modalities (e.g., speech and text).
The main considerations to take into account when working with multimodel models are:
\begin{descriptionlist}
\item[Representation]
Decide how to encode different modalities into the same embedding space.
\item[Fusion]
Combine information from different modalities.
\item[Alignment]
Link corresponding elements (e.g., in time or by meaning) across different modalities.
\item[Translation]
Map information from one modality to another.
\item[Co-learning]
Leverage shared information between modalities for training.
\end{descriptionlist}
\end{description}

View File

@ -0,0 +1,213 @@
\chapter{Task-oriented dialog systems}
\section{Human dialogs}
\begin{description}
\item[Natural language dialog] \marginnote{Natural language dialog}
Sequence of utterances (i.e., sentences) between two or more participants where each takes a turn.
\item[Turn-taking problem] \marginnote{Turn-taking problem}
Determine when the turn of another participant ended.
\item[Speech/dialog act] \marginnote{Speech/dialog act}
Indicates the type of utterance.
\begin{example}
Yes-no question, declarative question, statement, appreciation, yes answer, \dots
\end{example}
\begin{description}
\item[Adjacency pairs]
Speech acts that commonly appear together.
\begin{example}
Question $\rightarrow$ answer.
\end{example}
\item[Subdialog]
Dialogs opened and closed within a dialog.
\begin{example}
Correction subdialog, clarification subdialog, \dots
\end{example}
\end{description}
\item[Dialog slot] \marginnote{Dialog slot}
Relevant entities and properties of an utterance.
\begin{description}
\item[Filler] Values assigned to a slot.
\end{description}
\item[Conversation initiative] \marginnote{Conversation initiative}
Who initiates the dialog.
\begin{description}
\item[User initiative]
The user asks questions and the system responds (e.g., FAQ).
\item[System initiative]
The system asks questions to the user (e.g., form completion).
\item[Mixed initiative]
Both the user and the system can ask questions.
\end{description}
\item[Types of dialog] \marginnote{Types of dialog}
\phantom{}
\begin{description}
\item[Information seeking] To retrieve information.
\item[Task-oriented] Dialog to achieve a goal.
\item[Argumentative] Argument in support or against an opinion.
\item[Explanatory] Teacher-student type of dialog.
\item[Recommendation] Persuasion dialog.
\item[Chit-chat] Free conversation.
\end{description}
\end{description}
\section{Task-oriented dialogs}
\subsection{Architectures}
\begin{description}
\item[Traditional dialog system] \marginnote{Traditional dialog system}
The main components of an artificial dialog system are:
\begin{description}
\item[Natural language understanding (NLU)] \marginnote{Natural language understanding (NLU)}
Extract the relevant information such as dialog acts and slot-fillers from the utterance.
\begin{remark}
This task can be seen as a named entity recognition problem.
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.45\linewidth]{./img/nlu_arch.png}
\caption{Example of neural architecture for slot filling}
\end{figure}
\item[Dialog state tracker (DST)] \marginnote{Dialog state tracker (DST)}
Maintains the history of the dialog. This component should also have access to a knowledge-base.
\item[Dialog policy manager] \marginnote{Dialog policy manager}
Produces the dialog acts that composes the response from the output of the DST.
\item[Natural language generation (NLG)] \marginnote{Natural language generation (NLG)}
Produces a natural language utterance from the dialog acts produced by the dialog manager.
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.85\linewidth]{./img/spoken_dialog_system.png}
\caption{Example of components for a spoken dialog system}
\end{figure}
\item[LLM for dialog system] \marginnote{LLM for dialog system}
Use a language model that takes as input the utterance and directly produces a response.
\end{description}
\subsection{Dataset}
\begin{description}
\item[MultiWOZ] \marginnote{MultiWOZ}
Collection of human-human conversations over multiple domains and topics annotated with dialog states (i.e., turns), slots, and acts.
The dataset also defines an ontology for slots and a knowledge-base.
\begin{remark}
Human annotations are determined by agreement between multiple annotators.
\end{remark}
\begin{remark}
The type of dialogs in the dataset sensibly affects the resulting dialog system.
\indenttbox
\begin{example}
Wizard of Oz collection is a part of MultiWOZ that consists of question-answer dialogs between a user and a wizard. Dialogs produced based on these might result too artificial.
\end{example}
\end{remark}
\end{description}
\section{Research topics}
\subsection{LLM domain portability}
\begin{description}
\item[Domain portability] \marginnote{Domain portability}
Adapt a model to a new domain (i.e., knowledge-base).
Possible approaches are:
\begin{descriptionlist}
\item[Fine-tuning]
Fine-tune the LLM with the new knowledge-base.
\begin{remark}
This approach is susceptible to the catastrophic forgetting problem.
\end{remark}
\item[Prompting]
Embed the new knowledge-base into the prompt of the LLM.
\begin{remark}
This approach risks hallucinations and is constrained to the limits of the context length and computational inefficiency.
\end{remark}
\item[Functional calling]
Let the LLM query the knowledge-base when needed.
\begin{remark}
This approach requires more complex prompts and not all LLMs support it.
\end{remark}
\end{descriptionlist}
\begin{remark}
Experimental results show that functional calling works better than embedding the KB in the prompt. It is also more effective when the KB becomes bigger.
\end{remark}
\end{description}
\subsection{LLM pragmatics}
\begin{description}
\item[Pragmatics] \marginnote{Pragmatics}
Ability to adapt a conversation based on the context.
\item[Proactivity] \marginnote{Proactivity}
Ability of providing useful but not explicitly requested information.
\begin{remark}
An LLM can be made more proactive by prompting or fine-tuning.
\end{remark}
\end{description}
\subsection{LLM for dialog generation}
\begin{description}
\item[Automatic dialog generation] \marginnote{Automatic dialog generation}
Use an LLM to generate and annotate dialogs to create a synthetic dataset. A possible approach is based on the following steps:
\begin{descriptionlist}
\item[Generation]
Use the LLM to generate a dialog. Possible approaches are:
\begin{descriptionlist}
\item[One-pass] Prompt the LLM to generate a dialog based on a few references.
\item[Interactive] Produce a dialog by conversing with the model.
\item[Teacher-student] Let two LLMs converse.
\end{descriptionlist}
\item[Annotation]
Prompt the LLM to annotate the generated dialog based on some schema.
\item[Evaluation]
Evaluate based on human opinion.
\end{descriptionlist}
\end{description}