\chapter{Speech processing}
\section{Audio representation}
\begin{description}
\item[Sound/soundwave] \marginnote{Soundwave}
Vibration that travels through a medium. It is characterized by:
\begin{descriptionlist}
\item[Pitch] Frequency of the vibrations.
\item[Loudness] Amplitude of the vibrations.
\end{descriptionlist}
\item[Waveform] \marginnote{Waveform}
Representation of a soundwave. It is described by:
\begin{description}
\item[Frequency]
Represents the pitch of the sound.
\item[Period]
Time between two consecutive peaks of the sound (i.e., related to frequency as $f=\frac{1}{T}$; e.g., a $440$ Hz tone has period $T = \frac{1}{440} \approx 2.3$ ms).
\item[Amplitude]
Represents the loudness of the sound (i.e., the air pressure).
\begin{remark}
In practice, amplitude is usually converted to decibels because the human auditory system perceives loudness approximately on a logarithmic scale.
\end{remark}
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/waveform.png}
\end{figure}
\item[Signal] \marginnote{Signal}
Representation of information.
\begin{remark}
In sound processing, the waveform itself is the signal.
\end{remark}
\begin{description}
\item[Analog signal] \marginnote{Analog signal}
Waveform as it exists in the real world (continuous in both time and amplitude).
\item[Digital signal] \marginnote{Digital signal}
Sampled (i.e., measured at uniform time steps) and quantized (i.e., values discretized to a finite set of levels) version of an analog waveform (see the sketch after this list).
\end{description}
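A minimal Python sketch of sampling and quantization, assuming NumPy (the sampling rate, frequency, and bit depth below are arbitrary illustrative choices):
\begin{verbatim}
import numpy as np

sample_rate = 16_000   # samples per second (arbitrary choice)
duration = 0.01        # seconds
freq = 1_000           # sine frequency in Hz

# Sampling: evaluate the "analog" signal at uniform time steps.
t = np.arange(0, duration, 1 / sample_rate)
analog = np.sin(2 * np.pi * freq * t)

# Quantization: discretize amplitudes to 2^8 = 256 levels in [-1, 1].
levels = 2 ** 8
digital = np.round((analog + 1) / 2 * (levels - 1)) / (levels - 1) * 2 - 1
\end{verbatim}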
\item[Fourier transform] \marginnote{Fourier transform}
Method to decompose a continuous signal into its constituent sine waves.
Given a continuous signal $x(t)$, its Fourier transform is:
\[ X(f) = \int_{-\infty}^{+\infty} x(t) e^{-j2\pi ft} \,dt \]
where $X(f)$ indicates how much of the frequency $f$ exists in $x(t)$.
\begin{description}
\item[Discrete Fourier transform (DFT)] \marginnote{Discrete Fourier transform (DFT)}
Fourier transform for digital signals.
Given a discrete signal $x[n]$, its DFT is:
\[ X[k] = \sum_{n=0}^{N-1} x[n]e^{-\frac{j2\pi kn}{N}} \]
where $k$ is the discrete frequency and $N$ is the number of samples.
\item[Fast Fourier transform (FFT)] \marginnote{Fast Fourier transform (FFT)}
Fast algorithm to compute the DFT; it is most effective when $N$ is a power of $2$.
\item[Short-time Fourier transform (STFT)] \marginnote{Short-time Fourier transform (STFT)}
FFT computed on short time windows of the sound signal.
\begin{remark}
This method preserves time information by analyzing the signal in fixed-size frames.
\end{remark}
\begin{description}
\item[Spectrogram] \marginnote{Spectrogram}
Result of STFT that shows how the frequencies change over time.
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/spectrogram.png}
\end{figure}
\item[Inverse STFT (ISTFT)] \marginnote{Inverse STFT (ISTFT)}
Converts a time-frequency representation of sound (i.e., a spectrogram) back into a time-domain sound signal.
\begin{remark}
This allows manipulating a signal in its frequency domain (via STFT) and then converting it back to the time domain (via ISTFT), as in the sketch after this list.
\end{remark}
\end{description}
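A minimal Python sketch of these transforms, assuming NumPy and librosa (the test signal, frame size, and hop size are arbitrary illustrative choices; \texttt{librosa.stft}/\texttt{librosa.istft} are just one possible implementation):
\begin{verbatim}
import numpy as np
import librosa

# Digital signal: 1 second of a 440 Hz sine sampled at 16 kHz.
sr = 16_000
t = np.arange(sr) / sr
x = np.sin(2 * np.pi * 440 * t).astype(np.float32)

# DFT (computed via the FFT): X[k] tells how much of frequency k is in x.
X = np.fft.fft(x)

# STFT: FFT on short overlapping windows -> time-frequency representation.
stft = librosa.stft(x, n_fft=512, hop_length=256)
spectrogram = np.abs(stft)   # magnitudes, shape (freq_bins, frames)

# ISTFT: back from the time-frequency domain to a waveform.
x_rec = librosa.istft(stft, hop_length=256)
\end{verbatim}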
\item[Mel-scaled spectrogram] \marginnote{Mel-scaled spectrogram}
Spectrogram whose frequency axis is mapped to the mel scale (i.e., lower frequencies are represented with finer resolution while higher frequencies are compressed, matching the roughly logarithmic frequency perception of the human auditory system).
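A common variant of the mel mapping is $m = 2595 \log_{10}\left(1 + \frac{f}{700}\right)$. A minimal Python sketch of computing a mel-scaled spectrogram, assuming librosa (the number of mel bands and the FFT/hop sizes below are arbitrary illustrative choices):
\begin{verbatim}
import numpy as np
import librosa

# Dummy signal: 1 second of a 440 Hz sine at 16 kHz.
sr = 16_000
x = np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)

# Mel-scaled spectrogram: STFT magnitudes mapped onto 80 mel bands.
mel = librosa.feature.melspectrogram(y=x, sr=sr, n_fft=512,
                                     hop_length=256, n_mels=80)

# Convert power to decibels (log scale), as commonly done before
# plotting or feeding the features to a model.
mel_db = librosa.power_to_db(mel)
\end{verbatim}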
\item[Audio features] \marginnote{Audio features}
Representation of a sound signal extracted from the waveform or spectrogram.
\end{description}
\section{Tasks}
\begin{description}
\item[Automatic speech recognition (ASR)]
Convert a sound signal into text.
\begin{example}
Use an RNN/transformer encoder-decoder architecture. A sound signal is processed as follows:
\begin{enumerate}
\item Compute the audio features from the waveform (e.g., mel-spectrogram).
\item Pass the computed features through the encoder.
\item Use the decoder to generate the output text autoregressively, conditioned on the encoder output.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/asp_arch.png}
\end{figure}
\end{example}
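A hedged sketch of this pipeline using a pre-trained encoder-decoder ASR model through the Hugging Face \texttt{transformers} pipeline (the checkpoint name and the audio path are placeholder choices):
\begin{verbatim}
from transformers import pipeline

# Whisper is an encoder-decoder model: it computes a log-mel
# spectrogram, encodes it, and decodes the text autoregressively.
asr = pipeline("automatic-speech-recognition",
               model="openai/whisper-small")

result = asr("speech.wav")   # path to an audio file (placeholder)
print(result["text"])
\end{verbatim}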
\item[Speech enhancement]
Clean up a sound signal (e.g., by removing background noise).
\item[Speech separation]
Separate the different sources in a sound signal (e.g., differentiate speakers).
\item[Text-to-speech]
Convert text into a sound signal.
\begin{example}
Use an encoder-decoder architecture. A text is processed as follows:
\begin{enumerate}
\item Use the encoder to embed the input text into a representation that encodes linguistic features (e.g., pronunciation, rhythm, \dots).
\item Use the decoder to predict a mel-spectrogram.
\item Use a neural vocoder to convert the mel-spectrogram into an audio waveform.
\end{enumerate}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/tts_arch.png}
\end{figure}
\end{example}
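A hedged sketch of this pipeline, assuming the SpeechT5 checkpoints available through Hugging Face \texttt{transformers}; the all-zeros speaker embedding is a dummy placeholder (in practice it would come from a speaker-embedding model):
\begin{verbatim}
import torch
from transformers import (SpeechT5Processor, SpeechT5ForTextToSpeech,
                          SpeechT5HifiGan)

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello world", return_tensors="pt")
speaker_emb = torch.zeros((1, 512))  # dummy speaker embedding

# Encoder + decoder predict a mel-spectrogram; the neural vocoder
# converts it into an audio waveform.
speech = model.generate_speech(inputs["input_ids"], speaker_emb,
                               vocoder=vocoder)
\end{verbatim}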
\item[Speaker diarization]
Determine who spoke and when in a sound signal.
\item[Speech emotion recognition]
Recognize emotions from the sound signal.
\item[Neural network explanation]
Use a speech signal to explain another speech signal.
\end{description}
\section{Speech foundation models}
\begin{description}
\item[Speech foundation model (SFM)] \marginnote{Speech foundation model (SFM)}
Transformer-based model pre-trained on speech. A common architecture is composed of:
\begin{descriptionlist}
\item[Feature extractor]
Converts the waveform into a low-dimensional representation (e.g., by using convolutions).
\item[Encoder]
Computes contextual embeddings from the sound features.
\end{descriptionlist}
\begin{remark}
SFMs take raw waveforms as input and are more robust in dealing with speech variability due to diverse speakers, environments, noise, \dots
\end{remark}
\begin{remark}
An SFM can either be fine-tuned for a specific task or used as a feature extractor for other models.
\end{remark}
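A minimal sketch of using an SFM as a feature extractor, assuming the Wav2Vec2 checkpoints available through Hugging Face \texttt{transformers} (the checkpoint name and the dummy waveform are illustrative choices):
\begin{verbatim}
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

name = "facebook/wav2vec2-base-960h"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(name)
model = Wav2Vec2Model.from_pretrained(name)

# Dummy raw waveform: 1 second of noise at 16 kHz.
waveform = torch.randn(16_000)

inputs = feature_extractor(waveform.numpy(), sampling_rate=16_000,
                           return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Contextual embeddings from the encoder: (batch, frames, hidden_size).
embeddings = outputs.last_hidden_state
\end{verbatim}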
\item[Multimodal model] \marginnote{Multimodal model}
Model able to handle multiple modalities (e.g., speech and text).
The main considerations to take into account when working with multimodal models are:
\begin{descriptionlist}
\item[Representation]
Decide how to encode different modalities into the same embedding space.
\item[Fusion]
Combine information from different modalities.
\item[Alignment]
Link corresponding elements (e.g., in time or by meaning) across different modalities.
\item[Translation]
Map information from one modality to another.
\item[Co-learning]
Leverage shared information between modalities for training.
\end{descriptionlist}
\end{description}