Add A3I autoencoder + missing data
@@ -14,5 +14,6 @@
\input{./sections/_preliminaries.tex}
\input{./sections/_anomaly_detection_low_dim.tex}
\input{./sections/_anomaly_detection_high_dim.tex}
\input{./sections/_missing_data.tex}

\end{document}
@@ -198,7 +198,7 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.

\item[Autoencoder for anomaly detection]
By evaluating the quality of the reconstruction, an autoencoder can be used for anomaly detection. An input $x$ is flagged as anomalous when its reconstruction error exceeds a threshold $\varepsilon$:
\[ \Vert x - d(e(x, \theta_e), \theta_d) \Vert_2^2 \geq \varepsilon \]
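
A minimal sketch of this check in Python (assuming \texttt{autoencoder} is an already trained model exposing a \texttt{predict} method, e.g., a Keras model, and that \texttt{X} contains the normalized inputs; the names are illustrative):
\begin{verbatim}
import numpy as np

def detect_anomalies(autoencoder, X, epsilon):
    """Flag the rows of X whose reconstruction error exceeds epsilon."""
    # Reconstruct the inputs with the trained autoencoder.
    X_rec = autoencoder.predict(X)
    # Squared L2 reconstruction error of each example.
    errors = np.sum((X - X_rec) ** 2, axis=1)
    # An example is flagged as anomalous if its error is at least epsilon.
    return errors >= epsilon
\end{verbatim}
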
The advantages of this approach are the following:
\begin{itemize}
\item The size of the neural network does not scale with the training data.
@@ -211,4 +211,85 @@

\begin{remark}
It is always a good idea to normalize the input of a neural network to make gradient descent more stable. Moreover, with normalized data, common weight initialization techniques make the output approximately normalized too.
\end{remark}
\end{description}

\begin{remark}
Counterintuitively, neural networks have a higher bias than traditional machine learning techniques, mainly for two reasons:
\begin{itemize}
\item A change in a single parameter affects the whole network.
\item Training with SGD inherently limits overfitting (it acts as an implicit regularizer).
\end{itemize}
\end{remark}


\begin{theorem}
Under the following assumptions:
\begin{itemize}
\item Normally distributed noise (i.e., the difference between prediction and ground truth),
\item Independent noise across the output components (i.e., columns),
\item Same variance (i.e., homoscedasticity) of the noise in each output component,
\end{itemize}
autoencoders are trained as density estimators.

\begin{remark}
These are exactly the assumptions that are implicitly made when minimizing MSE.
\end{remark}

\begin{proof}
When training an autoencoder $h$ using MSE on $m$ examples with $n$ features, the following problem is solved:
\[
\begin{split}
\arg\min_{\theta} \sum_{i=1}^{m} \Vert h(\vec{x}_i, \theta) - \vec{x}_i \Vert_2^2
&= \arg\min_{\theta} \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \\
&= \arg\min_{\theta} \log\exp \left( \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( (h(\vec{x}_i, \theta) - \vec{x}_{i})^T \matr{I} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right)
\end{split}
\]
The following adjustments can be done without altering the solution of the problem (monotonic transformations and positive constant factors do not change the optimum):
\begin{itemize}
\item Negate the argument of $\exp$ and solve a maximization problem instead,
\item Multiply the argument of $\exp$ by $\frac{1}{2\sigma^2}$, for some constant $\sigma$,
\item Multiply $\exp$ by the constant $\frac{1}{(\sqrt{2\pi}\sigma)^n}$.
\end{itemize}
The problem becomes:
\[ \arg\max_{\theta} \log \prod_{i=1}^{m} \frac{1}{(\sqrt{2\pi}\sigma)^n} \exp \left( -\frac{1}{2} (h(\vec{x}_i, \theta) - \vec{x}_{i})^T (\sigma^2\matr{I})^{-1} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right) \]
Each factor of the product is the PDF of a multivariate normal distribution $f(\vec{x}_i; h(\vec{x}_i, \theta), \sigma^2\matr{I})$ with a diagonal covariance matrix. More specifically, this is a distribution:
\begin{itemize}
\item Centered on $h(\vec{x}_i, \theta)$,
\item With independent normal components,
\item With components sharing the same variance $\sigma^2$.
\end{itemize}

Therefore, when using MSE as the loss, training is a likelihood maximization problem.
\end{proof}
\end{theorem}


\begin{description}
\item[Threshold optimization]
The threshold can be determined in the same way as in \Cref{sec:ad_taxi_kde_uni}.
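
As an illustration only (not necessarily the exact procedure of \Cref{sec:ad_taxi_kde_uni}), a simple cost-based threshold search on a labeled validation set could be sketched as follows, where \texttt{errors} and \texttt{labels} are hypothetical validation reconstruction errors and anomaly labels:
\begin{verbatim}
import numpy as np

def optimize_threshold(errors, labels, fp_cost=1.0, fn_cost=1.0):
    """Pick the threshold minimizing the total cost on a validation set.

    errors: reconstruction error of each validation example.
    labels: 1 for anomalies, 0 for normal examples.
    """
    best_eps, best_cost = None, np.inf
    for eps in np.unique(errors):
        predicted = errors >= eps
        false_positives = np.sum(predicted & (labels == 0))
        false_negatives = np.sum(~predicted & (labels == 1))
        cost = fp_cost * false_positives + fn_cost * false_negatives
        if cost < best_cost:
            best_eps, best_cost = eps, cost
    return best_eps
\end{verbatim}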

\item[Multiple signal analysis]
With autoencoders, it is possible to compare the reconstruction error of the individual components (i.e., features).
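
A minimal sketch of this per-feature comparison (assuming \texttt{X} and its reconstruction \texttt{X\_rec} are NumPy arrays and \texttt{feature\_names} is a list of column names; the names are illustrative):
\begin{verbatim}
import numpy as np

# Per-feature squared reconstruction error, averaged over the examples.
feature_errors = np.mean((X - X_rec) ** 2, axis=0)

# Rank the features by decreasing error to see where it concentrates.
ranking = np.argsort(feature_errors)[::-1]
top_20 = [(feature_names[i], feature_errors[i]) for i in ranking[:20]]
\end{verbatim}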

\begin{remark}
Reconstruction errors are often concentrated on a few features.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal.pdf}
\caption{Reconstruction error of each feature}
\end{figure}
\end{remark}

\begin{remark}
It is possible to rank the features by reconstruction error (e.g., reporting the top ones) to provide more insight.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal_rank.pdf}
\caption{Top-20 features with the largest error}
\end{figure}
\end{remark}
\end{description}
@@ -0,0 +1,100 @@
\chapter{Missing data: Traffic data}


\section{Data}

Time series of traffic anomalies with missing values.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_data.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
Plot of the data. Straight lines are artifacts of missing values. Red dots below represent the actual data points.
}
}
\end{figure}



\section{Preliminaries}

The dataset has a sparse index (i.e., the indexes are non-contiguous) and missing values are represented by gaps. It is necessary to use a dense index where missing values are explicitly marked as \texttt{NaN}.

\subsection{Resampling / Binning}

\begin{description}
\item[Resampling / binning] \marginnote{Resampling / binning}
Resample the indexes of the dataset so that they have a regular step (e.g., 5 minutes).

\begin{remark}
Values that end up in the same bin need to be aggregated (e.g., mean).
\end{remark}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_resampled.pdf}
\caption{
Plot of the resampled data without artifacts
}
\end{figure}
\end{description}
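
A minimal sketch of this step with pandas (assuming \texttt{df} is a DataFrame indexed by timestamps; the 5-minute step is just an example):
\begin{verbatim}
# Resample to a regular 5-minute step: values falling in the same bin
# are aggregated with the mean, empty bins become NaN (dense index).
df_resampled = df.resample("5min").mean()
\end{verbatim}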



\section{Approaches}

\begin{description}
\item[Benchmark dataset]
A portion of known data in which some values are artificially removed can be used to evaluate a filling method. RMSE can be used as the accuracy metric.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_eval_data.pdf}
\caption{Benchmark dataset. Marked points have been artificially removed.}
\end{figure}
\end{description}
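
A minimal sketch of this evaluation (assuming \texttt{series} is a pandas Series with no gaps in the selected portion and \texttt{fill} is a hypothetical filling method under evaluation):
\begin{verbatim}
import numpy as np

# Artificially remove a fraction of the known values.
rng = np.random.default_rng(seed=0)
mask = rng.random(len(series)) < 0.2       # hide 20% of the points
corrupted = series.astype(float)           # copy that can hold NaN
corrupted[mask] = np.nan

# Fill the gaps and measure RMSE only on the removed points.
filled = fill(corrupted)
rmse = np.sqrt(np.mean((filled[mask] - series[mask]) ** 2))
\end{verbatim}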


\subsection{Forward/Backward filling}

\begin{description}
\item[Forward filling]
Set the missing value to the last valid observation.

\item[Backward filling]
Set the missing value to the next valid observation.
\end{description}
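
A minimal sketch with pandas (assuming \texttt{series} is a Series with \texttt{NaN} gaps on a dense index):
\begin{verbatim}
# Forward filling: propagate the last valid observation forward.
filled_ffill = series.ffill()

# Backward filling: propagate the next valid observation backward.
filled_bfill = series.bfill()
\end{verbatim}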

\begin{remark}
The idea behind this approach is that time series usually have a strong local correlation (i.e., some sort of inertia).
\end{remark}

\begin{remark}
Forward/backward filling tends to work well on low-variance portions of the data.
\end{remark}


\subsection{Geometric interpolation}

Interpolate a function through the known points to determine the missing ones. Possible methods are:
\begin{itemize}
\item Linear,
\item Nearest value,
\item Polynomial,
\item Spline.
\end{itemize}
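
A minimal sketch with pandas (assuming \texttt{series} has \texttt{NaN} gaps; the polynomial and spline orders are just examples):
\begin{verbatim}
# Interpolation methods supported by pandas
# (nearest, polynomial, and spline require SciPy).
filled_linear  = series.interpolate(method="linear")
filled_nearest = series.interpolate(method="nearest")
filled_poly    = series.interpolate(method="polynomial", order=3)
filled_spline  = series.interpolate(method="spline", order=3)
\end{verbatim}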


\begin{remark}
(R)MSE implicitly assumes that the errors are normally distributed, independent, and with the same variance at all points (homoscedasticity). This is usually not true for time series.
\end{remark}


\begin{remark}
We would like to build an estimator that is:
\begin{itemize}
\item At least as powerful as interpolation.
\item Able to detect the expected variability.
\end{itemize}
\end{remark}