Add A3I autoencoder + missing data

2024-10-01 19:50:12 +02:00
parent 2b24d40828
commit 709c6c9c76
8 changed files with 184 additions and 2 deletions

View File

@ -14,5 +14,6 @@
\input{./sections/_preliminaries.tex}
\input{./sections/_anomaly_detection_low_dim.tex}
\input{./sections/_anomaly_detection_high_dim.tex}
\input{./sections/_missing_data.tex}
\end{document}

View File

@ -198,7 +198,7 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.
\item[Autoencoder for anomaly detection]
By evaluating the quality of the reconstruction, an autoencoder can be used for anomaly detection:
\[ \Vert x - d(e(x, \theta_e), \theta_d) \Vert_2^2 \geq \varepsilon \]
The advantages of this approach are the following:
\begin{itemize}
\item The size of the neural network does not scale with the training data.
@ -211,4 +211,85 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.
\begin{remark}
It is always a good idea to normalize the input of a neural network to have a more stable gradient descent. Moreover, with normalized data, common weight initialization techniques make the output approximately normalized too.
\end{remark}
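A minimal sketch of this preprocessing step (scikit-learn is assumed; the data and array names are synthetic and only illustrative):
\begin{verbatim}
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X_train = rng.normal(loc=5.0, scale=3.0, size=(1000, 8))  # toy data, arbitrary scale
X_test = rng.normal(loc=5.0, scale=3.0, size=(200, 8))

# Fit the scaler on the training data only, then reuse it on unseen data.
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)  # zero mean, unit variance per feature
X_test_norm = scaler.transform(X_test)        # same transformation, no refitting
\end{verbatim}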
\begin{remark}
Counterintuitively, neural networks have a higher bias than traditional machine learning techniques, mainly for two reasons:
\begin{itemize}
\item A change in a single parameter affects the whole network.
\item Training with SGD has an implicit regularization effect that limits overfitting.
\end{itemize}
\end{remark}
\begin{theorem}
Under the following assumptions:
\begin{itemize}
\item Normally distributed noise (i.e., the residual between prediction and ground truth),
\item Noise that is independent across output components (i.e., columns),
\item Same variance (i.e., homoscedasticity) in the noise of each output component,
\end{itemize}
an autoencoder trained with MSE is implicitly trained as a (maximum likelihood) density estimator.
\begin{remark}
Minimizing the MSE implicitly encodes exactly these assumptions.
\end{remark}
\begin{proof}
When training an autoencoder $h$ with MSE on $m$ examples with $n$ features, the following problem is solved (the constant $\frac{1}{mn}$ factor of the MSE is dropped, as it does not affect the minimizer):
\[
\begin{split}
\arg\min_{\theta} \sum_{i=1}^{m} \Vert h(\vec{x}_i, \theta) - \vec{x}_i \Vert_2^2
&= \arg\min_{\theta} \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \\
&= \arg\min_{\theta} \log\exp \left( \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( (h(\vec{x}_i, \theta) - \vec{x}_{i})^T \matr{I} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right)
\end{split}
\]
The following adjustments can be made without altering the minimizer:
\begin{itemize}
\item Negate the argument of $\exp$ and solve a maximization problem,
\item Multiply the argument of $\exp$ by $\frac{1}{2\sigma^2}$, for some constant $\sigma > 0$,
\item Multiply each factor of the product by the constant $\frac{1}{(2\pi\sigma^2)^{n/2}}$.
\end{itemize}
The problem becomes:
\[ \arg\max_{\theta} \log \prod_{i=1}^{m} \frac{1}{(2\pi\sigma^2)^{n/2}} \exp \left( -\frac{1}{2} (h(\vec{x}_i, \theta) - \vec{x}_{i})^T (\sigma^2\matr{I})^{-1} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right) \]
where each factor is the PDF of a multivariate normal distribution $f(\vec{x}_i; h(\vec{x}_i, \theta), \sigma^2\matr{I})$ with a diagonal covariance matrix, evaluated at $\vec{x}_i$. More specifically, this is a distribution:
\begin{itemize}
\item Centered on $h(\vec{x}_i)$,
\item With independent normal components,
\item With components with uniform variance.
\end{itemize}
Therefore, when using MSE as loss, training is a likelihood maximization problem.
\end{proof}
\end{theorem}
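As a sanity check of the proof, the following sketch (NumPy and SciPy assumed; the ``reconstructions'' are synthetic stand-ins for autoencoder outputs) verifies numerically that the sum of squared errors, rescaled by $\frac{1}{2\sigma^2}$, differs from the Gaussian negative log-likelihood only by an additive constant, so both objectives share the same minimizer:
\begin{verbatim}
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
m, n, sigma = 50, 4, 1.3                      # examples, features, assumed noise std
x = rng.normal(size=(m, n))                   # ground truth
h = x + rng.normal(scale=sigma, size=(m, n))  # stand-in for autoencoder outputs

sse = np.sum((h - x) ** 2)                          # sum of squared errors
nll = -np.sum(norm.logpdf(x, loc=h, scale=sigma))   # Gaussian negative log-likelihood

# The two objectives differ only by an additive constant (for a fixed sigma),
# hence they are minimized by the same parameters.
const = m * n * np.log(np.sqrt(2 * np.pi) * sigma)
print(np.isclose(sse / (2 * sigma**2) + const, nll))  # True
\end{verbatim}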
\begin{description}
\item[Threshold optimization]
The threshold can be determined in the same way as in \Cref{sec:ad_taxi_kde_uni}.
\item[Multiple signal analysis]
With autoencoders, it is possible to compare the reconstruction errors of the individual components (features).
\begin{remark}
Reconstruction errors are often concentrated on a few features.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal.pdf}
\caption{Reconstruction error of each feature}
\end{figure}
\end{remark}
\begin{remark}
It is possible to rank the features by reconstruction error to provide more insight (a minimal sketch is given right after this list).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal_rank.pdf}
\caption{Top-20 features with the largest error}
\end{figure}
\end{remark}
\end{description}
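A minimal sketch of this ranking (NumPy only; the autoencoder output is replaced by a synthetic reconstruction and the feature names are hypothetical):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)
feature_names = [f"sensor_{j}" for j in range(30)]  # hypothetical feature names
X = rng.normal(size=(1, 30))                        # one (anomalous) input window
X_rec = X + rng.normal(scale=0.05, size=X.shape)    # stand-in for the autoencoder output
X_rec[0, [3, 17]] += 2.0                            # error concentrated on a few features

# Per-feature squared reconstruction error of this window.
per_feature_error = (X_rec - X)[0] ** 2

# Rank the features by reconstruction error to see which signals drive the score.
top = np.argsort(per_feature_error)[::-1][:20]
for j in top:
    print(f"{feature_names[j]:>10s}  {per_feature_error[j]:.4f}")
\end{verbatim}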

View File

@ -0,0 +1,100 @@
\chapter{Missing data: Traffic data}
\section{Data}
A time series of traffic anomalies containing missing values.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_data.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
Plot of the data. Straight lines are artifacts of missing values. Red dots below represent the actual data points.
}
}
\end{figure}
\section{Preliminaries}
The dataset has a sparse index (i.e., the indexes are non-contiguous) and missing values appear only as gaps. It is necessary to convert it to a dense index where missing values are explicitly marked as \texttt{NaN}.
\subsection{Resampling / Binning}
\begin{description}
\item[Resampling / binning] \marginnote{Resampling / binning}
Resample the index of the dataset so that it has a regular step (e.g., 5 minutes); a pandas sketch is given after this list.
\begin{remark}
Values that end up in the same bin need to be aggregated (e.g., mean).
\end{remark}
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_resampled.pdf}
\caption{
Plot of the resampled data without artifacts
}
\end{figure}
\end{description}
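A minimal pandas sketch of this step (the timestamps and values are toy data): values falling into the same 5-minute bin are averaged, and empty bins become explicit \texttt{NaN}, producing a dense, regular index.
\begin{verbatim}
import pandas as pd

# Toy sparse, irregularly sampled series (stand-in for the traffic data).
idx = pd.to_datetime(["2024-01-01 00:01", "2024-01-01 00:03",
                      "2024-01-01 00:04", "2024-01-01 00:27"])
series = pd.Series([10.0, 12.0, 11.0, 9.0], index=idx)

# Resample to a regular 5-minute step, aggregating each bin with the mean.
resampled = series.resample("5min").mean()
print(resampled)   # bins without observations are explicit NaN
\end{verbatim}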
\section{Approaches}
\begin{description}
\item[Benchmark dataset]
A portion of known data from which some values are artificially removed can be used to evaluate a filling method. RMSE can be used as the accuracy metric (a sketch is given after this list).
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_eval_data.pdf}
\caption{Benchmark dataset. Marked points have been artificially removed.}
\end{figure}
\end{description}
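A minimal sketch of this evaluation (synthetic data; forward filling, introduced in the next subsection, is used as the example filling method):
\begin{verbatim}
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)

# Toy fully observed series standing in for a known portion of the data.
idx = pd.date_range("2024-01-01", periods=200, freq="5min")
truth = pd.Series(np.sin(np.linspace(0, 12, 200)) + rng.normal(scale=0.1, size=200),
                  index=idx)

# Artificially remove about 20% of the values to build the benchmark.
mask = rng.random(len(truth)) < 0.2
benchmark = truth.copy()
benchmark[mask] = np.nan

# Evaluate a filling method on the removed points only.
filled = benchmark.ffill().bfill()   # backward fill handles a possible leading gap
rmse = np.sqrt(np.mean((filled[mask] - truth[mask]) ** 2))
print(f"RMSE: {rmse:.4f}")
\end{verbatim}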
\subsection{Forward/Backward filling}
\begin{description}
\item[Forward filling]
Set the missing value to the last valid observation.
\item[Backward filling]
Set the missing value to the next valid observation.
\end{description}
\begin{remark}
The idea of this approach is that time series usually have strong local correlation (i.e., some sort of inertia).
\end{remark}
\begin{remark}
Forward/backward filling tends to work well on low-variance portions of the data.
\end{remark}
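The two strategies in a minimal pandas sketch (toy series):
\begin{verbatim}
import numpy as np
import pandas as pd

s = pd.Series([1.0, np.nan, np.nan, 4.0, np.nan, 6.0])

print(s.ffill())  # forward:  each NaN takes the last valid value  -> 1, 1, 1, 4, 4, 6
print(s.bfill())  # backward: each NaN takes the next valid value  -> 1, 4, 4, 4, 6, 6
\end{verbatim}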
\subsection{Geometric interpolation}
Interpolate a function through the known points to estimate the missing ones. Possible methods, sketched after this list, are:
\begin{itemize}
\item Linear,
\item Nearest value,
\item Polynomial,
\item Spline.
\end{itemize}
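A minimal pandas sketch of these methods (toy series; the nearest, polynomial, and spline methods require SciPy):
\begin{verbatim}
import numpy as np
import pandas as pd

idx = pd.date_range("2024-01-01", periods=8, freq="5min")
s = pd.Series([1.0, np.nan, 3.0, 4.0, np.nan, 9.0, np.nan, 16.0], index=idx)

# The index is regular here; with irregular timestamps, method="time" weights
# linear interpolation by the actual time gaps.
print(s.interpolate(method="linear"))               # straight line between valid points
print(s.interpolate(method="nearest"))              # copy the closest valid value
print(s.interpolate(method="polynomial", order=2))  # low-order polynomial fit
print(s.interpolate(method="spline", order=3))      # smooth spline through valid points
\end{verbatim}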
\begin{remark}
(R)MSE implicitly assumes that the errors are normally distributed, independent, and have the same variance at all points. This is usually not true for time series.
\end{remark}
\begin{remark}
We would like to build an estimator that is:
\begin{itemize}
\item At least as powerful as interpolation.
\item Able to capture the expected variability of the data.
\end{itemize}
\end{remark}