mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-14 18:51:52 +01:00
Add NLP seminars
This commit is contained in:
BIN
src/year2/natural-language-processing/img/asp_arch.png
Normal file
BIN
src/year2/natural-language-processing/img/asp_arch.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 75 KiB |
BIN
src/year2/natural-language-processing/img/nlu_arch.png
Normal file
BIN
src/year2/natural-language-processing/img/nlu_arch.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 76 KiB |
BIN
src/year2/natural-language-processing/img/spectrogram.png
Normal file
BIN
src/year2/natural-language-processing/img/spectrogram.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 386 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 148 KiB |
BIN
src/year2/natural-language-processing/img/tts_arch.png
Normal file
BIN
src/year2/natural-language-processing/img/tts_arch.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
BIN
src/year2/natural-language-processing/img/waveform.png
Normal file
BIN
src/year2/natural-language-processing/img/waveform.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 97 KiB |
@ -18,4 +18,9 @@
|
||||
\include{./sections/_model_efficiency.tex}
|
||||
\include{./sections/_llm_usage.tex}
|
||||
|
||||
\appendix
|
||||
\include{./sections/_task_oriented_dialog_system.tex}
|
||||
\include{./sections/_speech.tex}
|
||||
\include{./sections/_italian_llm.tex}
|
||||
|
||||
\end{document}
|
||||
@ -0,0 +1,51 @@
|
||||
\chapter{Italian LLMs}
|
||||
|
||||
\begin{remark}
|
||||
Advantages of pre-training from scratch are:
|
||||
\begin{itemize}
|
||||
\item Having full control on the training data.
|
||||
\item Improve the fertility of the tokenizer.
|
||||
\end{itemize}
|
||||
\end{remark}
|
||||
|
||||
\begin{description}
|
||||
\item[Minerva] \marginnote{Minerva}
|
||||
Language model pre-trained on the Italian language.
|
||||
|
||||
\begin{remark}
|
||||
Minerva's pre-training corpus is actually composed by both Italian and English datasets.
|
||||
|
||||
Initially, English was used for benchmarking due to the lack of Italian benchmarks. However, it is also useful for tasks intrinsically in English (e.g., coding).
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
Some training datasets were automatically translated in Italian. Some others were adapted from existing Italian ones (e.g., transform a question answering dataset into a cloze form).
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[FENICE metric] \marginnote{FENICE metric}
|
||||
Factuality metric for summarization. It works as follows:
|
||||
\begin{enumerate}
|
||||
\item Extract claims from the summary with an ad-hoc LLM.
|
||||
\item Align each claim with the original document with positive (if in support) and negative (if against) scores.
|
||||
\item Perform co-reference resolution to unify entities across claims.
|
||||
\end{enumerate}
|
||||
|
||||
\item[ALERT benchmark] \marginnote{ALERT benchmark}
|
||||
Benchmark to test the safeness of an LLM based on 32 risk categories. The testing data are created as follows:
|
||||
\begin{enumerate}
|
||||
\item Filter the ``\textit{Helpfulness \& Harmlessness-RLHF}'' dataset of \textit{Anthropic} by considering for each example the first prompt and red team attacks only.
|
||||
\item Use templates to automatically generate additional prompts.
|
||||
\item Augment the prompts by formatting them as adversarial attacks. Examples of attacks are:
|
||||
\begin{descriptionlist}
|
||||
\item[Prefix/suffix injection]
|
||||
Prepend or append an adversarial prompt (e.g., \texttt{disregard the instructions above and \dots}).
|
||||
\item[Token manipulation]
|
||||
Alter or invert a small fraction of tokens in the prompt (the idea is to use a prompt that is less likely to have been already seen in the alignment datasets).
|
||||
\item[Jailbreaking]
|
||||
Use more complex strategies (e.g., role playing).
|
||||
\end{descriptionlist}
|
||||
\end{enumerate}
|
||||
\end{description}
|
||||
195
src/year2/natural-language-processing/sections/_speech.tex
Normal file
195
src/year2/natural-language-processing/sections/_speech.tex
Normal file
@ -0,0 +1,195 @@
|
||||
\chapter{Speech processing}
|
||||
|
||||
|
||||
\section{Audio representation}
|
||||
|
||||
\begin{description}
|
||||
\item[Sound/soundwave] \marginnote{Soundwave}
|
||||
Vibration that travels though a medium. It is modulated by:
|
||||
\begin{descriptionlist}
|
||||
\item[Pitch] Frequency of the vibrations.
|
||||
\item[Loudness] Amplitude of the vibrations.
|
||||
\end{descriptionlist}
|
||||
|
||||
\item[Waveform] \marginnote{Waveform}
|
||||
Representation of a soundwave. It is described by:
|
||||
\begin{description}
|
||||
\item[Frequency]
|
||||
Represents the pitch of the sound.
|
||||
|
||||
\item[Period]
|
||||
Distance between two peaks of the sound (i.e., correlated to frequency as $f=\frac{1}{T}$).
|
||||
|
||||
\item[Amplitude]
|
||||
Represents the loudness of the sound (i.e., the air pressure).
|
||||
|
||||
\begin{remark}
|
||||
In practice, amplitude is usually converted in decibels due to the fact that the human auditory system perceives sound closer to a logarithmic scale.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/waveform.png}
|
||||
\end{figure}
|
||||
|
||||
\item[Signal] \marginnote{Signal}
|
||||
Representation of information.
|
||||
|
||||
\begin{remark}
|
||||
In sound processing, the waveform itself is the signal.
|
||||
\end{remark}
|
||||
|
||||
\begin{description}
|
||||
\item[Analog signal] \marginnote{Analog signal}
|
||||
Waveform as-is in the real world.
|
||||
|
||||
\item[Digital signal] \marginnote{Digital signal}
|
||||
Sampled (i.e., measure uniform time steps) and quantized (discretize values) version of an analog waveform.
|
||||
\end{description}
|
||||
|
||||
\item[Fourier transform] \marginnote{Fourier transform}
|
||||
Method to decompose a continuous signal in its constituent sin waves.
|
||||
|
||||
Given a continuous signal $x(t)$, its Fourier transform is:
|
||||
\[ X(f) = \int_{-\infty}^{+\infty} x(t) e^{-j2\pi ft} \,dt \]
|
||||
where $X(f)$ indicates how much of the frequency $f$ exists in $x(t)$.
|
||||
|
||||
\begin{description}
|
||||
\item[Discrete Fourier transform (DFT)] \marginnote{Discrete Fourier transform (DFT)}
|
||||
Fourier transform for digital signals.
|
||||
|
||||
Given a discrete signal $x[n]$, its DFT is:
|
||||
\[ X[k] = \sum_{n=0}^{N-1} x[n]e^{-\frac{j2\pi kn}{N}} \]
|
||||
where $k$ is the discrete frequency and $N$ is the number of samples.
|
||||
|
||||
\item[Fast Fourier transform (FFT)] \marginnote{Fast Fourier transform (FFT)}
|
||||
Efficient implementation of DFT for $N$s that are power of $2$.
|
||||
|
||||
\item[Short-time Fourier transform (STFT)] \marginnote{Short-time Fourier transform (STFT)}
|
||||
FFT computed on short time windows of the sound signal.
|
||||
|
||||
\begin{remark}
|
||||
This method allows preserving time information by using a fixed frame size.
|
||||
\end{remark}
|
||||
|
||||
\begin{description}
|
||||
\item[Spectrogram] \marginnote{Spectrogram}
|
||||
Result of STFT that shows how the frequencies change over time.
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{./img/spectrogram.png}
|
||||
\end{figure}
|
||||
|
||||
\item[Inverse STFT (ISTFT)] \marginnote{Inverse STFT (ISTFT)}
|
||||
Converts a time-frequency representation of sound (i.e., spectrogram) to its sound signal.
|
||||
|
||||
\begin{remark}
|
||||
This allows to manipulate a signal in its frequency domain (STFT) and then convert it back (ISTFT).
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
\item[Mel-scaled spectrogram] \marginnote{Mel-scaled spectrogram}
|
||||
Spectrogram where frequencies are mapped to the mel scale (i.e., lower frequencies are more fine-grained while higher frequencies are more compressed, to match the human logarithmic sound perception).
|
||||
|
||||
\item[Audio features] \marginnote{Audio features}
|
||||
Representation of a sound signal extracted from the waveform or spectrogram.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Tasks}
|
||||
|
||||
\begin{description}
|
||||
\item[Automatic speech recognition (ASP)]
|
||||
Convert a sound signal into text.
|
||||
|
||||
\begin{example}
|
||||
Use an RNN/transformer encoder-decoder architecture. A sound signal is processed as follows:
|
||||
\begin{enumerate}
|
||||
\item Compute the audio features from the waveform (e.g., mel-spectrogram).
|
||||
\item Pass the computed features through the encoder.
|
||||
\item Use the decoder to generate the output text autoregressively conditioned on the encoder.
|
||||
\end{enumerate}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/asp_arch.png}
|
||||
\end{figure}
|
||||
\end{example}
|
||||
|
||||
\item[Speech enhancement]
|
||||
Clear the sound signal.
|
||||
|
||||
\item[Speech separation]
|
||||
Separate the different sources in a sound signal (e.g., differentiate speakers).
|
||||
|
||||
\item[Text-to-speech]
|
||||
Convert text into a sound signal.
|
||||
|
||||
\begin{example}
|
||||
Use an encoder-decoder architecture. A text is processed as follows:
|
||||
\begin{enumerate}
|
||||
\item Use the encoder to embed the input text into a representation that encodes linguistic features (e.g., pronunciation, rhythm, \dots).
|
||||
\item Use the decoder to predict a mel-spectrogram.
|
||||
\item Use a neural vocoder to convert the mel-spectrogram into an audio waveform.
|
||||
\end{enumerate}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{./img/tts_arch.png}
|
||||
\end{figure}
|
||||
\end{example}
|
||||
|
||||
\item[Speaker diarization]
|
||||
Determine the moment and the person who spoke.
|
||||
|
||||
\item[Speech emotion recognition]
|
||||
Recognize emotions from the sound signal.
|
||||
|
||||
\item[Neural network explanation]
|
||||
Use speech to explain another speech.
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Speech foundation models}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[Speech foundation model (SFM)] \marginnote{Speech foundation model (SFM)}
|
||||
Transformer-based model pre-trained on speech. A common architecture is composed by:
|
||||
\begin{descriptionlist}
|
||||
\item[Feature extractor]
|
||||
Converts the waveform into a low-dimensional representation (e.g., by using convolutions).
|
||||
\item[Encoder]
|
||||
Computes contextual embeddings from the sound features.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{remark}
|
||||
SFM takes as input raw waveforms and are more robust in dealing with speech variability due to diverse speakers, environment, noise, \dots
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
A SFM can be either fine-tuned for a specific task or used as a feature extractor for other models.
|
||||
\end{remark}
|
||||
|
||||
\item[Multimodal model] \marginnote{Multimodal model}
|
||||
Model able to handle multiple modalities (e.g., speech and text).
|
||||
|
||||
The main considerations to take into account when working with multimodel models are:
|
||||
\begin{descriptionlist}
|
||||
\item[Representation]
|
||||
Decide how to encode different modalities into the same embedding space.
|
||||
\item[Fusion]
|
||||
Combine information from different modalities.
|
||||
\item[Alignment]
|
||||
Link corresponding elements (e.g., in time or by meaning) across different modalities.
|
||||
\item[Translation]
|
||||
Map information from one modality to another.
|
||||
\item[Co-learning]
|
||||
Leverage shared information between modalities for training.
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
@ -0,0 +1,213 @@
|
||||
\chapter{Task-oriented dialog systems}
|
||||
|
||||
|
||||
\section{Human dialogs}
|
||||
|
||||
\begin{description}
|
||||
\item[Natural language dialog] \marginnote{Natural language dialog}
|
||||
Sequence of utterances (i.e., sentences) between two or more participants where each takes a turn.
|
||||
|
||||
\item[Turn-taking problem] \marginnote{Turn-taking problem}
|
||||
Determine when the turn of another participant ended.
|
||||
|
||||
\item[Speech/dialog act] \marginnote{Speech/dialog act}
|
||||
Indicates the type of utterance.
|
||||
|
||||
\begin{example}
|
||||
Yes-no question, declarative question, statement, appreciation, yes answer, \dots
|
||||
\end{example}
|
||||
|
||||
\begin{description}
|
||||
\item[Adjacency pairs]
|
||||
Speech acts that commonly appear together.
|
||||
|
||||
\begin{example}
|
||||
Question $\rightarrow$ answer.
|
||||
\end{example}
|
||||
|
||||
\item[Subdialog]
|
||||
Dialogs opened and closed within a dialog.
|
||||
|
||||
\begin{example}
|
||||
Correction subdialog, clarification subdialog, \dots
|
||||
\end{example}
|
||||
\end{description}
|
||||
|
||||
\item[Dialog slot] \marginnote{Dialog slot}
|
||||
Relevant entities and properties of an utterance.
|
||||
|
||||
\begin{description}
|
||||
\item[Filler] Values assigned to a slot.
|
||||
\end{description}
|
||||
|
||||
\item[Conversation initiative] \marginnote{Conversation initiative}
|
||||
Who initiates the dialog.
|
||||
|
||||
\begin{description}
|
||||
\item[User initiative]
|
||||
The user asks questions and the system responds (e.g., FAQ).
|
||||
|
||||
\item[System initiative]
|
||||
The system asks questions to the user (e.g., form completion).
|
||||
|
||||
\item[Mixed initiative]
|
||||
Both the user and the system can ask questions.
|
||||
\end{description}
|
||||
|
||||
\item[Types of dialog] \marginnote{Types of dialog}
|
||||
\phantom{}
|
||||
\begin{description}
|
||||
\item[Information seeking] To retrieve information.
|
||||
\item[Task-oriented] Dialog to achieve a goal.
|
||||
\item[Argumentative] Argument in support or against an opinion.
|
||||
\item[Explanatory] Teacher-student type of dialog.
|
||||
\item[Recommendation] Persuasion dialog.
|
||||
\item[Chit-chat] Free conversation.
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Task-oriented dialogs}
|
||||
|
||||
|
||||
\subsection{Architectures}
|
||||
|
||||
\begin{description}
|
||||
\item[Traditional dialog system] \marginnote{Traditional dialog system}
|
||||
The main components of an artificial dialog system are:
|
||||
\begin{description}
|
||||
\item[Natural language understanding (NLU)] \marginnote{Natural language understanding (NLU)}
|
||||
Extract the relevant information such as dialog acts and slot-fillers from the utterance.
|
||||
|
||||
\begin{remark}
|
||||
This task can be seen as a named entity recognition problem.
|
||||
\end{remark}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.45\linewidth]{./img/nlu_arch.png}
|
||||
\caption{Example of neural architecture for slot filling}
|
||||
\end{figure}
|
||||
|
||||
\item[Dialog state tracker (DST)] \marginnote{Dialog state tracker (DST)}
|
||||
Maintains the history of the dialog. This component should also have access to a knowledge-base.
|
||||
|
||||
\item[Dialog policy manager] \marginnote{Dialog policy manager}
|
||||
Produces the dialog acts that composes the response from the output of the DST.
|
||||
|
||||
\item[Natural language generation (NLG)] \marginnote{Natural language generation (NLG)}
|
||||
Produces a natural language utterance from the dialog acts produced by the dialog manager.
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.85\linewidth]{./img/spoken_dialog_system.png}
|
||||
\caption{Example of components for a spoken dialog system}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\item[LLM for dialog system] \marginnote{LLM for dialog system}
|
||||
Use a language model that takes as input the utterance and directly produces a response.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Dataset}
|
||||
|
||||
\begin{description}
|
||||
\item[MultiWOZ] \marginnote{MultiWOZ}
|
||||
Collection of human-human conversations over multiple domains and topics annotated with dialog states (i.e., turns), slots, and acts.
|
||||
|
||||
The dataset also defines an ontology for slots and a knowledge-base.
|
||||
|
||||
\begin{remark}
|
||||
Human annotations are determined by agreement between multiple annotators.
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
The type of dialogs in the dataset sensibly affects the resulting dialog system.
|
||||
|
||||
\indenttbox
|
||||
\begin{example}
|
||||
Wizard of Oz collection is a part of MultiWOZ that consists of question-answer dialogs between a user and a wizard. Dialogs produced based on these might result too artificial.
|
||||
\end{example}
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Research topics}
|
||||
|
||||
|
||||
\subsection{LLM domain portability}
|
||||
|
||||
\begin{description}
|
||||
\item[Domain portability] \marginnote{Domain portability}
|
||||
Adapt a model to a new domain (i.e., knowledge-base).
|
||||
|
||||
Possible approaches are:
|
||||
\begin{descriptionlist}
|
||||
\item[Fine-tuning]
|
||||
Fine-tune the LLM with the new knowledge-base.
|
||||
|
||||
\begin{remark}
|
||||
This approach is susceptible to the catastrophic forgetting problem.
|
||||
\end{remark}
|
||||
|
||||
\item[Prompting]
|
||||
Embed the new knowledge-base into the prompt of the LLM.
|
||||
|
||||
\begin{remark}
|
||||
This approach risks hallucinations and is constrained to the limits of the context length and computational inefficiency.
|
||||
\end{remark}
|
||||
|
||||
\item[Functional calling]
|
||||
Let the LLM query the knowledge-base when needed.
|
||||
|
||||
\begin{remark}
|
||||
This approach requires more complex prompts and not all LLMs support it.
|
||||
\end{remark}
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{remark}
|
||||
Experimental results show that functional calling works better than embedding the KB in the prompt. It is also more effective when the KB becomes bigger.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{LLM pragmatics}
|
||||
|
||||
\begin{description}
|
||||
\item[Pragmatics] \marginnote{Pragmatics}
|
||||
Ability to adapt a conversation based on the context.
|
||||
|
||||
\item[Proactivity] \marginnote{Proactivity}
|
||||
Ability of providing useful but not explicitly requested information.
|
||||
|
||||
\begin{remark}
|
||||
An LLM can be made more proactive by prompting or fine-tuning.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{LLM for dialog generation}
|
||||
|
||||
\begin{description}
|
||||
\item[Automatic dialog generation] \marginnote{Automatic dialog generation}
|
||||
Use an LLM to generate and annotate dialogs to create a synthetic dataset. A possible approach is based on the following steps:
|
||||
\begin{descriptionlist}
|
||||
\item[Generation]
|
||||
Use the LLM to generate a dialog. Possible approaches are:
|
||||
\begin{descriptionlist}
|
||||
\item[One-pass] Prompt the LLM to generate a dialog based on a few references.
|
||||
\item[Interactive] Produce a dialog by conversing with the model.
|
||||
\item[Teacher-student] Let two LLMs converse.
|
||||
\end{descriptionlist}
|
||||
|
||||
\item[Annotation]
|
||||
Prompt the LLM to annotate the generated dialog based on some schema.
|
||||
|
||||
\item[Evaluation]
|
||||
Evaluate based on human opinion.
|
||||
\end{descriptionlist}
|
||||
\end{description}
|
||||
Reference in New Issue
Block a user