diff --git a/src/year2/natural-language-processing/img/asp_arch.png b/src/year2/natural-language-processing/img/asp_arch.png
new file mode 100644
index 0000000..194c0b4
Binary files /dev/null and b/src/year2/natural-language-processing/img/asp_arch.png differ
diff --git a/src/year2/natural-language-processing/img/nlu_arch.png b/src/year2/natural-language-processing/img/nlu_arch.png
new file mode 100644
index 0000000..f8f877e
Binary files /dev/null and b/src/year2/natural-language-processing/img/nlu_arch.png differ
diff --git a/src/year2/natural-language-processing/img/spectrogram.png b/src/year2/natural-language-processing/img/spectrogram.png
new file mode 100644
index 0000000..ae1409c
Binary files /dev/null and b/src/year2/natural-language-processing/img/spectrogram.png differ
diff --git a/src/year2/natural-language-processing/img/spoken_dialog_system.png b/src/year2/natural-language-processing/img/spoken_dialog_system.png
new file mode 100644
index 0000000..c9ce604
Binary files /dev/null and b/src/year2/natural-language-processing/img/spoken_dialog_system.png differ
diff --git a/src/year2/natural-language-processing/img/tts_arch.png b/src/year2/natural-language-processing/img/tts_arch.png
new file mode 100644
index 0000000..8a1bc0c
Binary files /dev/null and b/src/year2/natural-language-processing/img/tts_arch.png differ
diff --git a/src/year2/natural-language-processing/img/waveform.png b/src/year2/natural-language-processing/img/waveform.png
new file mode 100644
index 0000000..cff48bf
Binary files /dev/null and b/src/year2/natural-language-processing/img/waveform.png differ
diff --git a/src/year2/natural-language-processing/nlp.tex b/src/year2/natural-language-processing/nlp.tex
index ad82d12..6eb5c98 100644
--- a/src/year2/natural-language-processing/nlp.tex
+++ b/src/year2/natural-language-processing/nlp.tex
@@ -18,4 +18,9 @@
     \include{./sections/_model_efficiency.tex}
     \include{./sections/_llm_usage.tex}
 
+    \appendix
+    \include{./sections/_task_oriented_dialog_system.tex}
+    \include{./sections/_speech.tex}
+    \include{./sections/_italian_llm.tex}
+
 \end{document}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_italian_llm.tex b/src/year2/natural-language-processing/sections/_italian_llm.tex
new file mode 100644
index 0000000..21ee5f3
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_italian_llm.tex
@@ -0,0 +1,51 @@
+\chapter{Italian LLMs}
+
+\begin{remark}
+    The advantages of pre-training from scratch are:
+    \begin{itemize}
+        \item Full control over the training data.
+        \item Better fertility of the tokenizer (i.e., fewer subword tokens per word in the target language).
+    \end{itemize}
+\end{remark}
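+
+Fertility can be measured as the average number of subword tokens produced per word: the lower, the better for the target language. A minimal sketch with a HuggingFace tokenizer (the checkpoint name and the whitespace-based word split are illustrative assumptions, not from the course material):
+\begin{verbatim}
+from transformers import AutoTokenizer
+
+def fertility(tokenizer, texts):
+    # Subword tokens produced per whitespace-separated word.
+    n_tokens = sum(len(tokenizer.tokenize(t)) for t in texts)
+    n_words = sum(len(t.split()) for t in texts)
+    return n_tokens / n_words
+
+# Indicative checkpoint; any Italian-capable tokenizer works.
+tok = AutoTokenizer.from_pretrained("sapienzanlp/Minerva-3B-base-v1.0")
+print(fertility(tok, ["Il gatto dorme sopra il tavolo."]))
+\end{verbatim}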
+
+\begin{description}
+    \item[Minerva] \marginnote{Minerva}
+    Language model pre-trained on the Italian language.
+
+    \begin{remark}
+        Minerva's pre-training corpus is actually composed of both Italian and English datasets.
+
+        Initially, English was included for benchmarking due to the lack of Italian benchmarks. However, it is also useful for tasks that are intrinsically in English (e.g., coding).
+    \end{remark}
+
+    \begin{remark}
+        Some training datasets were automatically translated into Italian. Others were adapted from existing Italian ones (e.g., transforming a question answering dataset into cloze form).
+    \end{remark}
+\end{description}
+
+
+\begin{description}
+    \item[FENICE metric] \marginnote{FENICE metric}
+    Factuality metric for summarization. It works as follows:
+    \begin{enumerate}
+        \item Extract claims from the summary with an ad-hoc LLM.
+        \item Align each claim with the original document, assigning a positive score (if the document supports the claim) or a negative one (if it contradicts it).
+        \item Perform coreference resolution to unify entities across claims.
+    \end{enumerate}
+
+    \item[ALERT benchmark] \marginnote{ALERT benchmark}
+    Benchmark to test the safety of an LLM based on 32 risk categories. The test data are created as follows:
+    \begin{enumerate}
+        \item Filter the ``\textit{Helpfulness \& Harmlessness-RLHF}'' dataset of \textit{Anthropic} by considering, for each example, only the first prompt and only red-team attacks.
+        \item Use templates to automatically generate additional prompts.
+        \item Augment the prompts by formatting them as adversarial attacks. Examples of attacks are:
+        \begin{descriptionlist}
+            \item[Prefix/suffix injection]
+            Prepend or append an adversarial prompt (e.g., \texttt{disregard the instructions above and \dots}).
+            \item[Token manipulation]
+            Alter or invert a small fraction of tokens in the prompt (the idea is to use a prompt that is less likely to have already been seen in the alignment datasets).
+            \item[Jailbreaking]
+            Use more complex strategies (e.g., role playing).
+        \end{descriptionlist}
+    \end{enumerate}
+\end{description}
diff --git a/src/year2/natural-language-processing/sections/_speech.tex b/src/year2/natural-language-processing/sections/_speech.tex
new file mode 100644
index 0000000..9e7343c
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_speech.tex
@@ -0,0 +1,195 @@
+\chapter{Speech processing}
+
+
+\section{Audio representation}
+
+\begin{description}
+    \item[Sound/soundwave] \marginnote{Soundwave}
+    Vibration that travels through a medium. It is modulated by:
+    \begin{descriptionlist}
+        \item[Pitch] Frequency of the vibrations.
+        \item[Loudness] Amplitude of the vibrations.
+    \end{descriptionlist}
+
+    \item[Waveform] \marginnote{Waveform}
+    Representation of a soundwave. It is described by:
+    \begin{description}
+        \item[Frequency]
+        Represents the pitch of the sound.
+
+        \item[Period]
+        Time between two consecutive peaks of the wave (i.e., related to frequency as $f=\frac{1}{T}$).
+
+        \item[Amplitude]
+        Represents the loudness of the sound (i.e., the air pressure).
+
+        \begin{remark}
+            In practice, amplitude is usually converted to decibels, as the human auditory system perceives loudness on a roughly logarithmic scale.
+        \end{remark}
+    \end{description}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/waveform.png}
+    \end{figure}
+
+    \item[Signal] \marginnote{Signal}
+    Representation of information.
+
+    \begin{remark}
+        In sound processing, the waveform itself is the signal.
+    \end{remark}
+
+    \begin{description}
+        \item[Analog signal] \marginnote{Analog signal}
+        Waveform as-is in the real world.
+
+        \item[Digital signal] \marginnote{Digital signal}
+        Sampled (i.e., measured at uniform time steps) and quantized (i.e., with discretized values) version of an analog waveform.
+    \end{description}
+
+    \item[Fourier transform] \marginnote{Fourier transform}
+    Method to decompose a continuous signal into its constituent sine waves.
+
+    Given a continuous signal $x(t)$, its Fourier transform is:
+    \[ X(f) = \int_{-\infty}^{+\infty} x(t) e^{-j2\pi ft} \,dt \]
+    where $X(f)$ indicates how much of the frequency $f$ exists in $x(t)$.
+
+    \begin{description}
+        \item[Discrete Fourier transform (DFT)] \marginnote{Discrete Fourier transform (DFT)}
+        Fourier transform for digital signals.
+
+        Given a discrete signal $x[n]$, its DFT is:
+        \[ X[k] = \sum_{n=0}^{N-1} x[n]e^{-\frac{j2\pi kn}{N}} \]
+        where $k$ is the discrete frequency and $N$ is the number of samples.
+
+        \item[Fast Fourier transform (FFT)] \marginnote{Fast Fourier transform (FFT)}
+        Efficient implementation of the DFT for values of $N$ that are powers of $2$ (see the sketch below).
+
+        \item[Short-time Fourier transform (STFT)] \marginnote{Short-time Fourier transform (STFT)}
+        FFT computed on short time windows of the sound signal.
+
+        \begin{remark}
+            This method preserves time information by using a fixed frame size.
+        \end{remark}
+
+        \begin{description}
+            \item[Spectrogram] \marginnote{Spectrogram}
+            Result of the STFT that shows how the frequencies change over time.
+        \end{description}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/spectrogram.png}
+        \end{figure}
+
+        \item[Inverse STFT (ISTFT)] \marginnote{Inverse STFT (ISTFT)}
+        Converts a time-frequency representation of sound (i.e., a spectrogram) back to its sound signal.
+
+        \begin{remark}
+            This makes it possible to manipulate a signal in its frequency domain (STFT) and then convert it back (ISTFT).
+        \end{remark}
+    \end{description}
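+
+    As a sanity check of the DFT definition above, the following minimal NumPy sketch (not part of the original notes) implements the formula directly and compares it against the library FFT:
+\begin{verbatim}
+import numpy as np
+
+def dft(x):
+    # Naive O(N^2) DFT: X[k] = sum_n x[n] * exp(-j*2*pi*k*n / N)
+    N = len(x)
+    n = np.arange(N)
+    k = n.reshape((N, 1))
+    return np.sum(x * np.exp(-2j * np.pi * k * n / N), axis=1)
+
+x = np.random.randn(8)                      # toy digital signal, N = 8
+assert np.allclose(dft(x), np.fft.fft(x))   # the FFT computes the same values
+\end{verbatim}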
+
+    \item[Mel-scaled spectrogram] \marginnote{Mel-scaled spectrogram}
+    Spectrogram where frequencies are mapped to the mel scale (i.e., lower frequencies are more fine-grained while higher frequencies are more compressed, to match the logarithmic sound perception of humans).
+
+    \item[Audio features] \marginnote{Audio features}
+    Representation of a sound signal extracted from the waveform or spectrogram (see the sketch below).
+\end{description}
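+
+A common choice of audio features is the log mel-spectrogram described above. A minimal sketch with \texttt{librosa} (the file name and the parameter values are illustrative assumptions):
+\begin{verbatim}
+import librosa
+
+# Load and resample the waveform ("speech.wav" is a placeholder).
+y, sr = librosa.load("speech.wav", sr=16000)
+
+mel = librosa.feature.melspectrogram(
+    y=y, sr=sr,
+    n_fft=1024,       # STFT window size
+    hop_length=256,   # stride between frames
+    n_mels=80,        # mel frequency bins
+)
+log_mel = librosa.power_to_db(mel)  # dB scale, matching human perception
+print(log_mel.shape)                # (n_mels, n_frames)
+\end{verbatim}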
+
+
+
+\section{Tasks}
+
+\begin{description}
+    \item[Automatic speech recognition (ASR)]
+    Convert a sound signal into text.
+
+    \begin{example}
+        Use an RNN/transformer encoder-decoder architecture. A sound signal is processed as follows:
+        \begin{enumerate}
+            \item Compute the audio features from the waveform (e.g., mel-spectrogram).
+            \item Pass the computed features through the encoder.
+            \item Use the decoder to generate the output text autoregressively, conditioned on the encoder output.
+        \end{enumerate}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.5\linewidth]{./img/asp_arch.png}
+        \end{figure}
+    \end{example}
+
+    \item[Speech enhancement]
+    Clean the sound signal (e.g., remove noise).
+
+    \item[Speech separation]
+    Separate the different sources in a sound signal (e.g., differentiate speakers).
+
+    \item[Text-to-speech]
+    Convert text into a sound signal.
+
+    \begin{example}
+        Use an encoder-decoder architecture. A text is processed as follows:
+        \begin{enumerate}
+            \item Use the encoder to embed the input text into a representation that encodes linguistic features (e.g., pronunciation, rhythm, \dots).
+            \item Use the decoder to predict a mel-spectrogram.
+            \item Use a neural vocoder to convert the mel-spectrogram into an audio waveform.
+        \end{enumerate}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.9\linewidth]{./img/tts_arch.png}
+        \end{figure}
+    \end{example}
+
+    \item[Speaker diarization]
+    Determine who spoke and when.
+
+    \item[Speech emotion recognition]
+    Recognize emotions from the sound signal.
+
+    \item[Neural network explanation]
+    Use speech to explain another speech model.
+\end{description}
+
+
+
+\section{Speech foundation models}
+
+
+\begin{description}
+    \item[Speech foundation model (SFM)] \marginnote{Speech foundation model (SFM)}
+    Transformer-based model pre-trained on speech. A common architecture is composed of:
+    \begin{descriptionlist}
+        \item[Feature extractor]
+        Converts the waveform into a low-dimensional representation (e.g., by using convolutions).
+        \item[Encoder]
+        Computes contextual embeddings from the sound features.
+    \end{descriptionlist}
+
+    \begin{remark}
+        SFMs take raw waveforms as input and are more robust in dealing with speech variability due to diverse speakers, environments, noise, \dots
+    \end{remark}
+
+    \begin{remark}
+        An SFM can either be fine-tuned for a specific task or used as a feature extractor for other models (see the sketch below).
+    \end{remark}
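+
+    A minimal sketch of the feature-extractor usage with HuggingFace \texttt{transformers} (the checkpoint name and the random input are illustrative assumptions):
+\begin{verbatim}
+import torch
+from transformers import AutoFeatureExtractor, AutoModel
+
+# wav2vec 2.0 is one example of a publicly available SFM.
+name = "facebook/wav2vec2-base"
+extractor = AutoFeatureExtractor.from_pretrained(name)
+model = AutoModel.from_pretrained(name)
+
+waveform = torch.randn(16000)  # placeholder: 1 s of audio at 16 kHz
+inputs = extractor(waveform.numpy(), sampling_rate=16000,
+                   return_tensors="pt")
+with torch.no_grad():
+    features = model(**inputs).last_hidden_state
+print(features.shape)  # (1, n_frames, hidden_size): contextual embeddings
+\end{verbatim}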
+
+    \item[Multimodal model] \marginnote{Multimodal model}
+    Model able to handle multiple modalities (e.g., speech and text).
+
+    The main considerations to take into account when working with multimodal models are:
+    \begin{descriptionlist}
+        \item[Representation]
+        Decide how to encode different modalities into the same embedding space.
+        \item[Fusion]
+        Combine information from different modalities.
+        \item[Alignment]
+        Link corresponding elements (e.g., in time or by meaning) across different modalities.
+        \item[Translation]
+        Map information from one modality to another.
+        \item[Co-learning]
+        Leverage shared information between modalities for training.
+    \end{descriptionlist}
+\end{description}
\ No newline at end of file
diff --git a/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex b/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex
new file mode 100644
index 0000000..1eec077
--- /dev/null
+++ b/src/year2/natural-language-processing/sections/_task_oriented_dialog_system.tex
@@ -0,0 +1,213 @@
+\chapter{Task-oriented dialog systems}
+
+
+\section{Human dialogs}
+
+\begin{description}
+    \item[Natural language dialog] \marginnote{Natural language dialog}
+    Sequence of utterances (i.e., sentences) between two or more participants, each taking a turn.
+
+    \item[Turn-taking problem] \marginnote{Turn-taking problem}
+    Determine when the turn of another participant has ended.
+
+    \item[Speech/dialog act] \marginnote{Speech/dialog act}
+    Indicates the type of utterance.
+
+    \begin{example}
+        Yes-no question, declarative question, statement, appreciation, yes answer, \dots
+    \end{example}
+
+    \begin{description}
+        \item[Adjacency pairs]
+        Speech acts that commonly appear together.
+
+        \begin{example}
+            Question $\rightarrow$ answer.
+        \end{example}
+
+        \item[Subdialog]
+        Dialog opened and closed within another dialog.
+
+        \begin{example}
+            Correction subdialog, clarification subdialog, \dots
+        \end{example}
+    \end{description}
+
+    \item[Dialog slot] \marginnote{Dialog slot}
+    Relevant entities and properties of an utterance.
+
+    \begin{description}
+        \item[Filler] Value assigned to a slot.
+    \end{description}
+
+    \item[Conversation initiative] \marginnote{Conversation initiative}
+    Who leads the dialog.
+
+    \begin{description}
+        \item[User initiative]
+        The user asks questions and the system responds (e.g., FAQ).
+
+        \item[System initiative]
+        The system asks questions to the user (e.g., form completion).
+
+        \item[Mixed initiative]
+        Both the user and the system can ask questions.
+    \end{description}
+
+    \item[Types of dialog] \marginnote{Types of dialog}
+    \phantom{}
+    \begin{description}
+        \item[Information seeking] To retrieve information.
+        \item[Task-oriented] Dialog to achieve a goal.
+        \item[Argumentative] Argument in support of or against an opinion.
+        \item[Explanatory] Teacher-student type of dialog.
+        \item[Recommendation] Persuasion dialog.
+        \item[Chit-chat] Free conversation.
+    \end{description}
+\end{description}
+
+
+
+\section{Task-oriented dialogs}
+
+
+\subsection{Architectures}
+
+\begin{description}
+    \item[Traditional dialog system] \marginnote{Traditional dialog system}
+    The main components of an artificial dialog system are:
+    \begin{description}
+        \item[Natural language understanding (NLU)] \marginnote{Natural language understanding (NLU)}
+        Extracts the relevant information, such as dialog acts and slot-fillers, from the utterance.
+
+        \begin{remark}
+            This task can be seen as a named entity recognition problem.
+        \end{remark}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.45\linewidth]{./img/nlu_arch.png}
+            \caption{Example of neural architecture for slot filling}
+        \end{figure}
+
+        \item[Dialog state tracker (DST)] \marginnote{Dialog state tracker (DST)}
+        Maintains the history of the dialog. This component should also have access to a knowledge-base.
+
+        \item[Dialog policy manager] \marginnote{Dialog policy manager}
+        Produces the dialog acts that compose the response from the output of the DST.
+
+        \item[Natural language generation (NLG)] \marginnote{Natural language generation (NLG)}
+        Produces a natural language utterance from the dialog acts produced by the dialog manager.
+    \end{description}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.85\linewidth]{./img/spoken_dialog_system.png}
+        \caption{Example of components for a spoken dialog system}
+    \end{figure}
+
+
+    \item[LLM for dialog system] \marginnote{LLM for dialog system}
+    Use a language model that takes as input the utterance and directly produces a response.
+\end{description}
+
+
+\subsection{Dataset}
+
+\begin{description}
+    \item[MultiWOZ] \marginnote{MultiWOZ}
+    Collection of human-human conversations over multiple domains and topics, annotated with dialog states (i.e., turns), slots, and acts.
+
+    The dataset also defines an ontology for slots and a knowledge-base.
+
+    \begin{remark}
+        Human annotations are determined by agreement between multiple annotators.
+    \end{remark}
+
+    \begin{remark}
+        The type of dialogs in the dataset significantly affects the resulting dialog system.
+
+        \indenttbox
+        \begin{example}
+            The Wizard-of-Oz collection is a part of MultiWOZ that consists of question-answer dialogs between a user and a wizard. Dialogs produced based on these might turn out to be too artificial.
+        \end{example}
+    \end{remark}
+\end{description}
+
+
+
+\section{Research topics}
+
+
+\subsection{LLM domain portability}
+
+\begin{description}
+    \item[Domain portability] \marginnote{Domain portability}
+    Adapt a model to a new domain (i.e., knowledge-base).
+
+    Possible approaches are:
+    \begin{descriptionlist}
+        \item[Fine-tuning]
+        Fine-tune the LLM with the new knowledge-base.
+
+        \begin{remark}
+            This approach is susceptible to the catastrophic forgetting problem.
+        \end{remark}
+
+        \item[Prompting]
+        Embed the new knowledge-base into the prompt of the LLM.
+
+        \begin{remark}
+            This approach risks hallucinations, is constrained by the context length, and is computationally inefficient.
+        \end{remark}
+
+        \item[Function calling]
+        Let the LLM query the knowledge-base when needed (see the sketch after this list).
+
+        \begin{remark}
+            This approach requires more complex prompts and not all LLMs support it.
+        \end{remark}
+    \end{descriptionlist}
+
+    \begin{remark}
+        Experimental results show that function calling works better than embedding the KB in the prompt. It is also more effective as the KB becomes bigger.
+    \end{remark}
+\end{description}
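+
+A minimal sketch of the function-calling approach, assuming an OpenAI-style chat API (the model name, tool schema, and toy KB are illustrative assumptions):
+\begin{verbatim}
+import json
+from openai import OpenAI
+
+def query_kb(entity: str) -> str:
+    # Stand-in for a query against the domain knowledge-base.
+    kb = {"hotel_alpha": "3 rooms available"}
+    return kb.get(entity, "not found")
+
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "query_kb",
+        "description": "Look up an entity in the knowledge-base",
+        "parameters": {
+            "type": "object",
+            "properties": {"entity": {"type": "string"}},
+            "required": ["entity"],
+        },
+    },
+}]
+
+client = OpenAI()
+messages = [{"role": "user",
+             "content": "Does hotel_alpha have rooms available?"}]
+response = client.chat.completions.create(
+    model="gpt-4o-mini", messages=messages, tools=tools)
+call = response.choices[0].message.tool_calls[0]
+result = query_kb(**json.loads(call.function.arguments))
+# The result is then passed back to the LLM to produce the final reply.
+\end{verbatim}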
+
+
+\subsection{LLM pragmatics}
+
+\begin{description}
+    \item[Pragmatics] \marginnote{Pragmatics}
+    Ability to adapt a conversation based on the context.
+
+    \item[Proactivity] \marginnote{Proactivity}
+    Ability to provide useful but not explicitly requested information.
+
+    \begin{remark}
+        An LLM can be made more proactive by prompting or fine-tuning.
+    \end{remark}
+\end{description}
+
+
+\subsection{LLM for dialog generation}
+
+\begin{description}
+    \item[Automatic dialog generation] \marginnote{Automatic dialog generation}
+    Use an LLM to generate and annotate dialogs to create a synthetic dataset. A possible approach is based on the following steps:
+    \begin{descriptionlist}
+        \item[Generation]
+        Use the LLM to generate a dialog. Possible approaches are:
+        \begin{descriptionlist}
+            \item[One-pass] Prompt the LLM to generate a dialog based on a few references.
+            \item[Interactive] Produce a dialog by conversing with the model.
+            \item[Teacher-student] Let two LLMs converse.
+        \end{descriptionlist}
+
+        \item[Annotation]
+        Prompt the LLM to annotate the generated dialog based on some schema.
+
+        \item[Evaluation]
+        Evaluate the generated dialogs based on human judgment.
+    \end{descriptionlist}
+\end{description}
\ No newline at end of file