diff --git a/src/year2/artificial-intelligence-in-industry/a3i.tex b/src/year2/artificial-intelligence-in-industry/a3i.tex
index 38d7190..c99f200 100644
--- a/src/year2/artificial-intelligence-in-industry/a3i.tex
+++ b/src/year2/artificial-intelligence-in-industry/a3i.tex
@@ -14,5 +14,6 @@
 
     \input{./sections/_preliminaries.tex}
     \input{./sections/_anomaly_detection_low_dim.tex}
     \input{./sections/_anomaly_detection_high_dim.tex}
+    \input{./sections/_missing_data.tex}
 \end{document}
\ No newline at end of file
diff --git a/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal.pdf b/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal.pdf
new file mode 100644
index 0000000..f8a8a19
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal_rank.pdf b/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal_rank.pdf
new file mode 100644
index 0000000..bf507fc
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_ad_hpc_multi_signal_rank.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_md_traffic_data.pdf b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_data.pdf
new file mode 100644
index 0000000..516b362
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_data.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_md_traffic_eval_data.pdf b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_eval_data.pdf
new file mode 100644
index 0000000..26ed39f
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_eval_data.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_md_traffic_resampled.pdf b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_resampled.pdf
new file mode 100644
index 0000000..cf52d64
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_md_traffic_resampled.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/sections/_anomaly_detection_high_dim.tex b/src/year2/artificial-intelligence-in-industry/sections/_anomaly_detection_high_dim.tex
index abc655b..749545c 100644
--- a/src/year2/artificial-intelligence-in-industry/sections/_anomaly_detection_high_dim.tex
+++ b/src/year2/artificial-intelligence-in-industry/sections/_anomaly_detection_high_dim.tex
@@ -198,7 +198,7 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.
     \item[Autoencoder for anomaly detection]
         By evaluating the quality of the reconstruction, an autoencoder can be used for anomaly detection:
         \[ \Vert x - d(e(x, \theta_e), \theta_d) \Vert_2^2 \geq \varepsilon \]
-
+
         The advantages of this approach are the following:
         \begin{itemize}
             \item The size of the neural network does not scale with the training data.
@@ -211,4 +211,85 @@ The KDE model, bandwidth, and threshold are fitted as in \Cref{ch:ad_low}.
 
 \begin{remark}
     It is always a good idea to normalize the input of a neural network to have a more stable gradient descent. Moreover, with normalized data, common weight initialization techniques make the output approximately normalized too.
-\end{remark}
\ No newline at end of file
+\end{remark}
+
+\begin{remark}
+    Counterintuitively, neural networks have a higher bias than traditional machine learning techniques, mainly for two reasons:
+    \begin{itemize}
+        \item A change in a single parameter affects the whole network.
+        \item Training with SGD has an implicit regularization effect that discourages overfitting.
+    \end{itemize}
+\end{remark}
+
+
+\begin{theorem}
+    Under the following assumptions:
+    \begin{itemize}
+        \item Normally distributed noise (i.e., distance between prediction and ground truth),
+        \item Independent noise among the output components (i.e., columns),
+        \item Same variance (i.e., homoscedasticity) in the noise of each output component,
+    \end{itemize}
+    autoencoders are trained as density estimators.
+
+    \begin{remark}
+        In other words, minimizing MSE is equivalent to maximum likelihood estimation under this noise model.
+    \end{remark}
+
+    \begin{proof}
+        When training an autoencoder $h$ using MSE on $m$ examples with $n$ features, the following problem is solved:
+        \[
+            \begin{split}
+                \arg\min_{\theta} \sum_{i=1}^{m} \Vert h(\vec{x}_i, \theta) - \vec{x}_i \Vert_2^2
+                &= \arg\min_{\theta} \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \\
+                &= \arg\min_{\theta} \log\exp \left( \sum_{i=1}^{m} \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
+                &= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( \sum_{j=1}^{n} (h_j(\vec{x}_i, \theta) - \vec{x}_{i,j})^2 \right) \\
+                &= \arg\min_{\theta} \log \prod_{i=1}^{m} \exp \left( (h(\vec{x}_i, \theta) - \vec{x}_{i})^T \matr{I} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right)
+            \end{split}
+        \]
+        The following adjustments can be done without altering the optimal solutions:
+        \begin{itemize}
+            \item Negate the argument of $\exp$ and solve a maximization problem,
+            \item Multiply the argument of $\exp$ by $\frac{1}{2\sigma^2}$, for some constant $\sigma > 0$,
+            \item Multiply $\exp$ by $\frac{1}{(2\pi\sigma^2)^{n/2}}$.
+        \end{itemize}
+        The problem becomes:
+        \[ \arg\max_{\theta} \log \prod_{i=1}^{m} \frac{1}{(2\pi\sigma^2)^{n/2}} \exp \left( -\frac{1}{2} (h(\vec{x}_i, \theta) - \vec{x}_{i})^T (\sigma^2\matr{I})^{-1} (h(\vec{x}_i, \theta) - \vec{x}_{i}) \right) \]
+        where each factor is the PDF of a multivariate normal distribution $f(\vec{x}_i; h(\vec{x}_i, \theta), \sigma^2\matr{I})$ with a diagonal covariance matrix. More specifically, this is a distribution:
+        \begin{itemize}
+            \item Centered on $h(\vec{x}_i, \theta)$,
+            \item With independent normal components,
+            \item With the same variance $\sigma^2$ in each component.
+        \end{itemize}
+
+        Therefore, when using MSE as loss, training is a likelihood maximization problem.
+    \end{proof}
+\end{theorem}
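+
+A minimal sketch of this approach (assuming Keras; the architecture, the training setup, and the quantile used for the threshold $\varepsilon$ are illustrative, not prescribed by the course):
+\begin{verbatim}
+import numpy as np
+from tensorflow import keras
+
+n_features = 16                          # illustrative input size
+
+# Autoencoder h(x, theta) = d(e(x, theta_e), theta_d); sizes are illustrative.
+model = keras.Sequential([
+    keras.Input(shape=(n_features,)),
+    keras.layers.Dense(8, activation="relu"),             # encoder
+    keras.layers.Dense(4, activation="relu"),             # bottleneck
+    keras.layers.Dense(8, activation="relu"),             # decoder
+    keras.layers.Dense(n_features, activation="linear"),
+])
+
+# MSE loss: by the theorem above, this performs maximum likelihood
+# estimation under Gaussian, independent, homoscedastic noise.
+model.compile(optimizer="adam", loss="mse")
+
+x_train = np.random.rand(1000, n_features)  # placeholder for (normalized) data
+model.fit(x_train, x_train, epochs=20, batch_size=32, verbose=0)
+
+def anomaly_score(x):
+    # Squared L2 reconstruction error of each example.
+    return np.sum((model.predict(x, verbose=0) - x) ** 2, axis=1)
+
+# Threshold: e.g., a high quantile of the training scores.
+eps = np.quantile(anomaly_score(x_train), 0.99)
+flags = anomaly_score(x_train) >= eps
+\end{verbatim}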
+
+
+\begin{description}
+    \item[Threshold optimization]
+        The threshold can be determined in the same way as in \Cref{sec:ad_taxi_kde_uni}.
+
+    \item[Multiple signal analysis]
+        With autoencoders, it is possible to compare the reconstruction errors of the individual components.
+
+        \begin{remark}
+            Reconstruction errors are often concentrated on a few features.
+
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal.pdf}
+                \caption{Reconstruction error of each feature}
+            \end{figure}
+        \end{remark}
+
+        \begin{remark}
+            It is possible to rank the features by reconstruction error to provide more insight.
+
+            \begin{figure}[H]
+                \centering
+                \includegraphics[width=0.8\linewidth]{./img/_ad_hpc_multi_signal_rank.pdf}
+                \caption{Top-20 features with the largest error}
+            \end{figure}
+        \end{remark}
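+
+        A minimal sketch of this per-feature analysis (assuming NumPy and pandas; \texttt{x\_hat} stands for the model reconstruction, e.g., \texttt{model.predict(x)}):
+\begin{verbatim}
+import numpy as np
+import pandas as pd
+
+# Rank features by their average squared reconstruction error.
+# x, x_hat: (m, n) arrays; feature_names: list of n column names.
+def top_error_features(x, x_hat, feature_names, k=20):
+    per_feature = np.mean((x_hat - x) ** 2, axis=0)  # one value per column
+    ranking = pd.Series(per_feature, index=feature_names)
+    return ranking.sort_values(ascending=False).head(k)
+\end{verbatim}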
+\end{description}
\ No newline at end of file
diff --git a/src/year2/artificial-intelligence-in-industry/sections/_missing_data.tex b/src/year2/artificial-intelligence-in-industry/sections/_missing_data.tex
new file mode 100644
index 0000000..b2acb8a
--- /dev/null
+++ b/src/year2/artificial-intelligence-in-industry/sections/_missing_data.tex
@@ -0,0 +1,100 @@
+\chapter{Missing data: Traffic data}
+
+
+\section{Data}
+
+Time series of traffic data with missing values.
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.8\linewidth]{./img/_md_traffic_data.pdf}
+    \caption{
+        \parbox[t]{0.7\linewidth}{
+            Plot of the data. The straight lines are artifacts caused by missing values. The red dots below represent the actual data points.
+        }
+    }
+\end{figure}
+
+
+
+\section{Preliminaries}
+
+The dataset has sparse indexes (i.e., indexes are non-contiguous) and missing values are represented by gaps. It is necessary to use dense indexes where missing values are explicitly marked as \texttt{NaN}.
+
+\subsection{Resampling / Binning}
+
+\begin{description}
+    \item[Resampling / binning] \marginnote{Resampling / binning}
+        Resample the indexes of the dataset so that they have a regular step (e.g., 5 minutes).
+
+        \begin{remark}
+            Values that end up in the same bin need to be aggregated (e.g., with the mean).
+        \end{remark}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/_md_traffic_resampled.pdf}
+            \caption{
+                Plot of the resampled data without artifacts
+            }
+        \end{figure}
+\end{description}
+
+
+
+\section{Approaches}
+
+\begin{description}
+    \item[Benchmark dataset]
+        A portion of known data where some values are artificially removed can be used to evaluate a filling method. RMSE can be used as the accuracy metric.
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.8\linewidth]{./img/_md_traffic_eval_data.pdf}
+            \caption{Benchmark dataset. Marked points have been artificially removed.}
+        \end{figure}
+\end{description}
+
+
+\subsection{Forward/Backward filling}
+
+\begin{description}
+    \item[Forward filling]
+        Set the missing value to the last valid observation.
+
+    \item[Backward filling]
+        Set the missing value to the next valid observation.
+\end{description}
+
+\begin{remark}
+    The idea behind this approach is that time series usually have a strong local correlation (i.e., some sort of inertia).
+\end{remark}
+
+\begin{remark}
+    Forward/backward filling tends to work well on low-variance portions of the data.
+\end{remark}
+
+
+\subsection{Geometric interpolation}
+
+Interpolate a function to determine the missing points. Possible methods are:
+\begin{itemize}
+    \item Linear,
+    \item Nearest value,
+    \item Polynomial,
+    \item Spline.
+\end{itemize}
+
+
+
+\begin{remark}
+    (R)MSE assumes that the errors are normally distributed, independent, and with the same variability at all points. This is usually not true for time series.
+\end{remark}
+
+\begin{remark}
+    We would like to build an estimator that is:
+    \begin{itemize}
+        \item At least as powerful as interpolation,
+        \item Able to capture the expected variability of the data.
+    \end{itemize}
+\end{remark}
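+
+The resampling, benchmarking, and filling steps of this chapter can be sketched as follows (a minimal sketch, assuming pandas; the synthetic series, bin size, and removal fraction are illustrative):
+\begin{verbatim}
+import numpy as np
+import pandas as pd
+
+# Synthetic stand-in for the traffic series: irregular timestamps with gaps.
+rng = np.random.default_rng(0)
+t = pd.date_range("2024-01-01", periods=2000, freq="1min")
+keep = np.sort(rng.choice(len(t), size=1200, replace=False))
+series = pd.Series(np.sin(np.arange(2000) / 50)[keep]
+                   + 0.1 * rng.standard_normal(1200), index=t[keep])
+
+# Resampling / binning: dense regular 5-minute index; values in the same
+# bin are aggregated with the mean, empty bins become explicit NaNs.
+dense = series.resample("5min").mean()
+
+# Benchmark dataset: artificially remove some known values
+# (endpoints excluded so that forward/backward filling is always defined).
+known = dense.dropna()
+removed = known.iloc[1:-1].sample(frac=0.1, random_state=42).index
+holed = dense.copy()
+holed[removed] = np.nan
+
+# Candidate filling methods ("polynomial"/"spline" would require SciPy).
+filled = {
+    "ffill": holed.ffill(),                       # forward filling
+    "bfill": holed.bfill(),                       # backward filling
+    "linear": holed.interpolate(method="linear"),
+}
+
+# RMSE on the artificially removed points only.
+for name, f in filled.items():
+    rmse = np.sqrt(np.mean((f.loc[removed] - known.loc[removed]) ** 2))
+    print(f"{name}: RMSE = {rmse:.3f}")
+\end{verbatim}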