Add A3I autoencoder + missing data
@@ -14,5 +14,6 @@
\input{./sections/_preliminaries.tex}
\input{./sections/_anomaly_detection_low_dim.tex}
\input{./sections/_anomaly_detection_high_dim.tex}
\input{./sections/_missing_data.tex}

\end{document}
@@ -198,7 +198,7 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.

\item[Autoencoder for anomaly detection]
By evaluating the quality of the reconstruction, an autoencoder can be used for anomaly detection. An input $x$ is flagged as anomalous when its reconstruction error exceeds a threshold $\varepsilon$:
\[ \Vert x - d(e(x, \theta_e), \theta_d) \Vert_2^2 \geq \varepsilon \]
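
A minimal sketch of this check in Python (assuming \texttt{autoencoder} is an already trained model exposing a \texttt{predict} method, e.g., a Keras model, and that \texttt{X} contains the normalized inputs; the names are illustrative):
\begin{verbatim}
import numpy as np

def detect_anomalies(autoencoder, X, epsilon):
    """Flag the rows of X whose reconstruction error exceeds epsilon."""
    # Reconstruct the inputs with the trained autoencoder.
    X_rec = autoencoder.predict(X)
    # Squared L2 reconstruction error of each example.
    errors = np.sum((X - X_rec) ** 2, axis=1)
    # An example is flagged as anomalous if its error is at least epsilon.
    return errors >= epsilon
\end{verbatim}
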
The advantages of this approach are the following:
\begin{itemize}
\item The size of the neural network does not scale with the training data.
@@ -211,4 +211,85 @@

\begin{remark}
It is always a good idea to normalize the input of a neural network to make gradient descent more stable. Moreover, with normalized data, common weight initialization techniques make the output approximately normalized too.
\end{remark}
\end{description}

\begin{remark}
Counterintuitively, neural networks have a higher bias than traditional machine learning techniques, mainly for two reasons:
\begin{itemize}
\item A change in a single parameter affects the whole network.
\item Training with SGD inherently limits overfitting (it acts as an implicit regularizer).
\end{itemize}
\end{remark}


\begin{theorem}
Under the following assumptions:
\begin{itemize}
\item Normally distributed noise (i.e., the difference between prediction and ground truth),
\item Independent noise across the output components (i.e., columns),
\item Same variance (i.e., homoscedasticity) of the noise in each output component,
\end{itemize}
autoencoders are trained as density estimators.

\begin{remark}
These are exactly the assumptions that are implicitly made when minimizing MSE.
\end{remark}

\begin{proof}
When training an autoencoder $h$ using MSE on $m$ examples with $n$ features, the following problem is solved:
\[
\begin{split}
\arg\min_{\theta} \sum_{i=1}^{m} \Vert h(\vec{x}_i, \theta) - \vec{x}_i \Vert_2^2
&= \arg\min_{\theta} \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \\
&= \arg\min_{\theta} \log\exp \left( \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
&= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( (h(\vec{x}_i, \theta) - \vec{x}_{i})^T \matr{I} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right)
\end{split}
\]
The following adjustments can be done without altering the solution of the problem (monotonic transformations and positive constant factors do not change the optimum):
\begin{itemize}
\item Negate the argument of $\exp$ and solve a maximization problem instead,
\item Multiply the argument of $\exp$ by $\frac{1}{2\sigma^2}$, for some constant $\sigma$,
\item Multiply $\exp$ by the constant $\frac{1}{(\sqrt{2\pi}\sigma)^n}$.
\end{itemize}
The problem becomes:
\[ \arg\max_{\theta} \log \prod_{i=1}^{m} \frac{1}{(\sqrt{2\pi}\sigma)^n} \exp \left( -\frac{1}{2} (h(\vec{x}_i, \theta) - \vec{x}_{i})^T (\sigma^2\matr{I})^{-1} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right) \]
Each factor of the product is the PDF of a multivariate normal distribution $f(\vec{x}_i; h(\vec{x}_i, \theta), \sigma^2\matr{I})$ with a diagonal covariance matrix. More specifically, this is a distribution:
\begin{itemize}
\item Centered on $h(\vec{x}_i, \theta)$,
\item With independent normal components,
\item With components sharing the same variance $\sigma^2$.
\end{itemize}

Therefore, when using MSE as the loss, training is a likelihood maximization problem.
\end{proof}
\end{theorem}


\begin{description}
\item[Threshold optimization]
The threshold can be determined in the same way as in \Cref{sec:ad_taxi_kde_uni}.
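
As an illustration only (not necessarily the exact procedure of \Cref{sec:ad_taxi_kde_uni}), a simple cost-based threshold search on a labeled validation set could be sketched as follows, where \texttt{errors} and \texttt{labels} are hypothetical validation reconstruction errors and anomaly labels:
\begin{verbatim}
import numpy as np

def optimize_threshold(errors, labels, fp_cost=1.0, fn_cost=1.0):
    """Pick the threshold minimizing the total cost on a validation set.

    errors: reconstruction error of each validation example.
    labels: 1 for anomalies, 0 for normal examples.
    """
    best_eps, best_cost = None, np.inf
    for eps in np.unique(errors):
        predicted = errors >= eps
        false_positives = np.sum(predicted & (labels == 0))
        false_negatives = np.sum(~predicted & (labels == 1))
        cost = fp_cost * false_positives + fn_cost * false_negatives
        if cost < best_cost:
            best_eps, best_cost = eps, cost
    return best_eps
\end{verbatim}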

\item[Multiple signal analysis]
With autoencoders, it is possible to compare the reconstruction error of the individual components (i.e., features).
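
A minimal sketch of this per-feature comparison (assuming \texttt{X} and its reconstruction \texttt{X\_rec} are NumPy arrays and \texttt{feature\_names} is a list of column names; the names are illustrative):
\begin{verbatim}
import numpy as np

# Per-feature squared reconstruction error, averaged over the examples.
feature_errors = np.mean((X - X_rec) ** 2, axis=0)

# Rank the features by decreasing error to see where it concentrates.
ranking = np.argsort(feature_errors)[::-1]
top_20 = [(feature_names[i], feature_errors[i]) for i in ranking[:20]]
\end{verbatim}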

\begin{remark}
Reconstruction errors are often concentrated on a few features.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal.pdf}
\caption{Reconstruction error of each feature}
\end{figure}
\end{remark}

\begin{remark}
It is possible to rank the features by reconstruction error (e.g., reporting the top ones) to provide more insight.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal_rank.pdf}
\caption{Top-20 features with the largest error}
\end{figure}
\end{remark}
\end{description}
@@ -0,0 +1,100 @@
\chapter{Missing data: Traffic data}


\section{Data}

Time series of traffic anomalies with missing values.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_data.pdf}
\caption{
\parbox[t]{0.7\linewidth}{
Plot of the data. Straight lines are artifacts of missing values. Red dots below represent the actual data points.
}
}
\end{figure}



\section{Preliminaries}

The dataset has a sparse index (i.e., the indexes are non-contiguous) and missing values are represented by gaps. It is necessary to use a dense index where missing values are explicitly marked as \texttt{NaN}.

\subsection{Resampling / Binning}

\begin{description}
\item[Resampling / binning] \marginnote{Resampling / binning}
Resample the indexes of the dataset so that they have a regular step (e.g., 5 minutes).

\begin{remark}
Values that end up in the same bin need to be aggregated (e.g., mean).
\end{remark}

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_resampled.pdf}
\caption{
Plot of the resampled data without artifacts
}
\end{figure}
\end{description}
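
A minimal sketch of this step with pandas (assuming \texttt{df} is a DataFrame indexed by timestamps; the 5-minute step is just an example):
\begin{verbatim}
# Resample to a regular 5-minute step: values falling in the same bin
# are aggregated with the mean, empty bins become NaN (dense index).
df_resampled = df.resample("5min").mean()
\end{verbatim}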



\section{Approaches}

\begin{description}
\item[Benchmark dataset]
A portion of known data in which some values are artificially removed can be used to evaluate a filling method. RMSE can be used as the accuracy metric.

\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_md_traffic_eval_data.pdf}
\caption{Benchmark dataset. Marked points have been artificially removed.}
\end{figure}
\end{description}
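
A minimal sketch of this evaluation (assuming \texttt{series} is a pandas Series with no gaps in the selected portion and \texttt{fill} is a hypothetical filling method under evaluation):
\begin{verbatim}
import numpy as np

# Artificially remove a fraction of the known values.
rng = np.random.default_rng(seed=0)
mask = rng.random(len(series)) < 0.2       # hide 20% of the points
corrupted = series.astype(float)           # copy that can hold NaN
corrupted[mask] = np.nan

# Fill the gaps and measure RMSE only on the removed points.
filled = fill(corrupted)
rmse = np.sqrt(np.mean((filled[mask] - series[mask]) ** 2))
\end{verbatim}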


\subsection{Forward/Backward filling}

\begin{description}
\item[Forward filling]
Set the missing value to the last valid observation.

\item[Backward filling]
Set the missing value to the next valid observation.
\end{description}
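
A minimal sketch with pandas (assuming \texttt{series} is a Series with \texttt{NaN} gaps on a dense index):
\begin{verbatim}
# Forward filling: propagate the last valid observation forward.
filled_ffill = series.ffill()

# Backward filling: propagate the next valid observation backward.
filled_bfill = series.bfill()
\end{verbatim}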

\begin{remark}
The idea behind this approach is that time series usually have a strong local correlation (i.e., some sort of inertia).
\end{remark}

\begin{remark}
Forward/backward filling tends to work well on low-variance portions of the data.
\end{remark}


\subsection{Geometric interpolation}

Interpolate a function through the known points to determine the missing ones. Possible methods are:
\begin{itemize}
\item Linear,
\item Nearest value,
\item Polynomial,
\item Spline.
\end{itemize}
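
A minimal sketch with pandas (assuming \texttt{series} has \texttt{NaN} gaps; the polynomial and spline orders are just examples):
\begin{verbatim}
# Interpolation methods supported by pandas
# (nearest, polynomial, and spline require SciPy).
filled_linear  = series.interpolate(method="linear")
filled_nearest = series.interpolate(method="nearest")
filled_poly    = series.interpolate(method="polynomial", order=3)
filled_spline  = series.interpolate(method="spline", order=3)
\end{verbatim}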


\begin{remark}
(R)MSE implicitly assumes that the errors are normally distributed, independent, and with the same variance at all points (homoscedasticity). This is usually not true for time series.
\end{remark}


\begin{remark}
We would like to build an estimator that is:
\begin{itemize}
\item At least as powerful as interpolation.
\item Able to detect the expected variability.
\end{itemize}
\end{remark}