diff --git a/src/year2/artificial-intelligence-in-industry/a3i.tex b/src/year2/artificial-intelligence-in-industry/a3i.tex
index c4c03b0..34d41a3 100644
--- a/src/year2/artificial-intelligence-in-industry/a3i.tex
+++ b/src/year2/artificial-intelligence-in-industry/a3i.tex
@@ -11,10 +11,12 @@
 \begin{document}
     \makenotesfront
-    \input{./sections/_preliminaries.tex}
-    \input{./sections/_anomaly_detection_low_dim.tex}
-    \input{./sections/_anomaly_detection_high_dim.tex}
-    \input{./sections/_missing_data.tex}
-    \input{./sections/_remaining_useful_life.tex}
+    \include{./sections/_preliminaries}
+    \include{./sections/_anomaly_detection_low_dim}
+    \include{./sections/_anomaly_detection_high_dim}
+    \include{./sections/_missing_data}
+    \include{./sections/_remaining_useful_life}
+    \include{./sections/_wear_anomalies}
+    \include{./sections/_arrivals_predicition}
 \end{document}
\ No newline at end of file
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_autoencoder.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_autoencoder.pdf
new file mode 100644
index 0000000..b93827f
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_autoencoder.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_constant.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_constant.pdf
new file mode 100644
index 0000000..e469bd3
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_constant.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_deviation.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_deviation.pdf
new file mode 100644
index 0000000..2a7dabb
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_deviation.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_distribution.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_distribution.pdf
new file mode 100644
index 0000000..775db2a
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_distribution.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_heatmap.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_heatmap.pdf
new file mode 100644
index 0000000..18dafa7
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_heatmap.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_importance.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_importance.pdf
new file mode 100644
index 0000000..7fe4562
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_importance.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_interarrival.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_interarrival.pdf
new file mode 100644
index 0000000..86425d5
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_interarrival.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_peaks.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_peaks.pdf
new file mode 100644
index 0000000..cecaf50
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_peaks.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_poisson.pdf b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_poisson.pdf
new file mode 100644
index 0000000..9250b20
Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_skinwrapper_poisson.pdf differ
diff --git a/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex b/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex
new file mode 100644
index 0000000..7ffc601
--- /dev/null
+++ b/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex
@@ -0,0 +1,136 @@
+\chapter{Arrivals prediction: Hospital emergency room}
+
+
+\section{Data}
+
+The dataset contains accesses to the emergency room of the Maggiore Hospital in Bologna.
+
+Each row of the dataset represents a patient and the features are:
+\begin{descriptionlist}
+    \item[\texttt{Triage}] Time of arrival.
+    \item[\texttt{TKCharge}] Time of first visit.
+    \item[\texttt{Code}] Priority (\texttt{white}, \texttt{green}, \texttt{yellow}, and \texttt{red}).
+    \item[\texttt{Outcome}] Indicates whether the patient was admitted or left.
+\end{descriptionlist}
+
+\begin{description}
+    \item[Binning]
+    As the problem is to predict the total number of arrivals in fixed intervals, binning can be used to obtain a dataset with an hourly granularity.
+\end{description}
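+
+\begin{remark}
+    As a minimal sketch (assuming a \texttt{pandas} data frame \texttt{df} with the columns above; names and details are illustrative, not the official pipeline), hourly binning reduces to a resampled count:
+\begin{verbatim}
+import pandas as pd
+
+# One row per patient; "Triage" is the arrival timestamp.
+df["Triage"] = pd.to_datetime(df["Triage"])
+
+# Count arrivals in non-overlapping 1-hour bins.
+arrivals = df.set_index("Triage").resample("1h").size()
+\end{verbatim}
+\end{remark}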
+
+
+\section{Approaches}
+
+\begin{remark}
+    Using MSE as a loss assumes that the conditional distribution of the target given the input follows a normal distribution.
+\end{remark}
+
+\begin{remark}
+    Arrivals usually follow a Poisson distribution as:
+    \begin{itemize}
+        \item The distribution is skewed, with a tail towards larger values.
+        \item All values are non-negative counts.
+        \item Events are independent.
+    \end{itemize}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_poisson.pdf}
+        \caption{Arrivals distribution at 6 a.m.}
+    \end{figure}
+
+    \begin{theorem}
+        If the inter-arrival times are exponentially distributed with a fixed rate, the number of arrivals in a fixed interval always follows a Poisson distribution.
+    \end{theorem}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_interarrival.pdf}
+        \caption{Dataset inter-arrival counts (the first bar is due to binning artifacts)}
+    \end{figure}
+\end{remark}
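+
+\begin{remark}
+    The theorem can be checked empirically with a short simulation (an illustrative sketch, not part of the dataset analysis): exponential inter-arrival times are sampled, and the resulting counts per unit interval behave like Poisson counts.
+\begin{verbatim}
+import numpy as np
+
+rng = np.random.default_rng(0)
+rate = 4.0  # expected arrivals per unit of time
+
+# Cumulative sums of exponential inter-arrival times
+# give the arrival instants.
+arrivals = np.cumsum(rng.exponential(1 / rate, size=100_000))
+
+# Count arrivals in unit-length intervals
+# (the last, truncated interval is dropped).
+counts = np.bincount(arrivals.astype(int))[:-1]
+
+print(counts.mean(), counts.var())  # both close to `rate`
+\end{verbatim}
+\end{remark}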
+
+\subsection{Neuro-probabilistic model}
+
+\begin{description}
+    \item[Poisson distribution] \marginnote{Poisson distribution}
+    Distribution with discrete support whose probability mass function is defined as:
+    \[ p(k, \lambda) = \frac{\lambda^k e^{-\lambda}}{k!} \]
+    where $\lambda$ is the occurrence rate of the events.
+
+    \begin{remark}
+        Both mean and variance of a Poisson distribution are $\lambda$.
+    \end{remark}
+
+    \begin{remark}
+        The skewness of a Poisson distribution is $\lambda^{-\frac{1}{2}}$. For smaller values of $\lambda$, the distribution has a stronger positive skew (i.e., the tail is on the right). For larger values of $\lambda$, the skew becomes less observable.
+    \end{remark}
+
+    \begin{remark}
+        For this problem, $\lambda$ is the average number of arrivals in each bin.
+    \end{remark}
+
+    \item[Neuro-probabilistic model] \marginnote{Neuro-probabilistic model}
+    Model that combines statistics and machine learning.
+
+    \begin{remark}
+        This class of models is known in the statistics literature as generalized linear models. With neural networks (i.e., with non-linearities), they do not have an official name. ``Neuro-probabilistic model'' is an unofficial name.
+    \end{remark}
+
+    This problem can be formulated through the following probabilistic model:
+    \[ y \sim \texttt{Poisson}(\lambda(x)) \]
+    where $y$ is the number of arrivals and $\lambda(x)$ is the rate parametrized on the temporal information $x$.
+
+    The rate can be approximated using an estimator as:
+    \[ y \sim \texttt{Poisson}(\lambda(x, \theta)) \]
+    where $\lambda(x, \theta)$ is a regression model.
+
+    \begin{description}
+        \item[Loss]
+        Training is done by maximum likelihood estimation using the Poisson distribution:
+        \[
+            \begin{split}
+                &\arg\min_\theta - \sum_{i=1}^{m} \log\left( f(y_i, \lambda(x_i, \theta)) \right) \\
+                &= \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{\lambda(x_i, \theta)^{y_i} e^{-\lambda(x_i, \theta)}}{y_i!} \right)
+            \end{split}
+        \]
+
+        \item[Architecture]
+        Use an MLP to predict $\hat{\lambda}$ and then, instead of outputting the prediction directly, output a Poisson distribution object with rate $\hat{\lambda}$:
+        \[
+            \begin{split}
+                \hat{\lambda} &= \texttt{MLP}(x) \\
+                \texttt{out} &= \texttt{Poisson}(\cdot, \hat{\lambda})
+            \end{split}
+        \]
+
+        Some considerations must be made (a code sketch is given at the end of this subsection):
+        \begin{descriptionlist}
+            \item[Only positive rates]
+            As $\hat{\lambda}$ must be positive, the network can be interpreted as predicting $\log(\hat{\lambda})$, which is then exponentiated to recover a rate that is positive by construction:
+            \[
+                \begin{split}
+                    \log(\hat{\lambda}) &= \texttt{MLP}(x) \\
+                    \texttt{out} &= \texttt{Poisson}\left( \cdot, \exp\left(\log(\hat{\lambda})\right) \right) = \texttt{Poisson}(\cdot, \hat{\lambda})
+                \end{split}
+            \]
+
+            \item[Standardization]
+            The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete.
+
+            However, if the input is standardized (for training stability), the output of the network with the usual zero-mean weight initialization will initially have mean $0$. Therefore, at the beginning, the network will predict $\hat{\lambda} \approx 1$ (i.e., the MLP predicts $\log(\hat{\lambda}) \approx 0$ and then $\hat{\lambda} = \exp\left(\log(\hat{\lambda})\right) \approx 1$), which might not be a reasonable starting point.
+
+            It is possible to provide an initial guess $\bar{\lambda}$ for $\hat{\lambda}$. This value can be used as a multiplicative factor, so that the first predictions will be close to $\bar{\lambda} \cdot 1$:
+            \[
+                \begin{split}
+                    \log(\hat{\lambda}) &= \texttt{MLP}(x) \\
+                    \texttt{out} &= \texttt{Poisson}\left( \cdot, \bar{\lambda}\exp\left(\log(\hat{\lambda})\right) \right) = \texttt{Poisson}(\cdot, \bar{\lambda}\hat{\lambda})
+                \end{split}
+            \]
+
+            \begin{remark}
+                For this problem, $\bar{\lambda}$ can be the average number of arrivals in each bin.
+            \end{remark}
+        \end{descriptionlist}
+    \end{description}
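+
+    \begin{remark}
+        A minimal PyTorch sketch of this architecture (illustrative, not the official implementation; the input size, layer width, and $\bar{\lambda}$ value are assumptions):
+\begin{verbatim}
+import torch
+from torch import nn
+
+# MLP that outputs log(lambda); adding log(lambda_bar) to the
+# output is equivalent to the multiplicative factor lambda_bar.
+class PoissonRegressor(nn.Module):
+    def __init__(self, in_features, lambda_bar):
+        super().__init__()
+        self.mlp = nn.Sequential(
+            nn.Linear(in_features, 32), nn.ReLU(),
+            nn.Linear(32, 1),
+        )
+        self.log_lambda_bar = torch.log(torch.tensor(lambda_bar))
+
+    def forward(self, x):
+        return self.mlp(x) + self.log_lambda_bar  # log-rate
+
+model = PoissonRegressor(in_features=4, lambda_bar=8.0)
+# log_input=True: the loss exponentiates the log-rate internally,
+# i.e., it minimizes exp(log_rate) - y * log_rate, the Poisson
+# negative log-likelihood up to the constant log(y!) term.
+loss_fn = nn.PoissonNLLLoss(log_input=True)
+
+x = torch.randn(16, 4)                    # standardized inputs
+y = torch.poisson(torch.full((16, 1), 8.0))  # dummy counts
+loss = loss_fn(model(x), y)
+loss.backward()
+\end{verbatim}
+    \end{remark}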
+\end{description}
\ No newline at end of file
diff --git a/src/year2/artificial-intelligence-in-industry/sections/_wear_anomalies.tex b/src/year2/artificial-intelligence-in-industry/sections/_wear_anomalies.tex
new file mode 100644
index 0000000..bf8660f
--- /dev/null
+++ b/src/year2/artificial-intelligence-in-industry/sections/_wear_anomalies.tex
@@ -0,0 +1,194 @@
+\chapter{Component wear anomalies: Skin wrapper machines}
+
+
+\section{Data}
+
+The dataset represents a single run-to-failure experiment of a skin wrapper machine with a 4 ms sampling period and different segments (i.e., operating modes).
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_heatmap.pdf}
+    \caption{Dataset heatmap}
+\end{figure}
+
+The following observations can be made:
+\begin{itemize}
+    \item Some features (i.e., rows 0 and 8 of the heatmap) are fixed for long periods of time (i.e., piecewise constant). They are most likely controlled parameters.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_constant.pdf}
+    \end{figure}
+
+    \item A feature (i.e., row 2 of the heatmap) seems constant, but in reality it has many short-lived peaks.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_peaks.pdf}
+    \end{figure}
+
+    \begin{remark}
+        This type of signal might be useful to determine a period in the series.
+    \end{remark}
+
+    \item Some features (i.e., rows 3 and 5 of the heatmap) contain sudden localized deviations. These are most likely due to external interventions.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_deviation.pdf}
+    \end{figure}
+\end{itemize}
+
+
+\subsection{Binning}
+
+\begin{description}
+    \item[Binning] \marginnote{Binning}
+    Reduce the granularity of the data using a non-overlapping sliding window and aggregation functions.
+
+    \begin{remark}
+        With high-frequency data, some approaches (e.g., a neural network) might not be able to keep up if real-time predictions are required.
+    \end{remark}
+
+    \begin{remark}
+        After binning, the number of samples is reduced, but the number of features might increase (e.g., one per aggregation function).
+    \end{remark}
+\end{description}
+
+
+
+\section{Approaches}
+
+
+\subsection{Autoencoder}
+
+\begin{description}
+    \item[Autoencoder]
+    Train an autoencoder on earlier non-anomalous data and use the reconstruction error to detect component wear.
+
+    Results show that there is a high reconstruction error for the operating mode feature (i.e., first row), which is a controlled parameter. This hints at the fact that some operating modes are too infrequent, so the model is unable to reconstruct them.
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_autoencoder.pdf}
+    \end{figure}
+
+\end{description}
+
+\begin{remark}
+    There is a distribution drift (i.e., the future does not behave like the past) in the dataset.
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.65\linewidth]{./img/_skinwrapper_distribution.pdf}
+    \end{figure}
+\end{remark}
+
+\begin{remark}
+    To account for distribution drift, splitting the training and test sets chronologically is a better approach than random sampling (which ignores the drift).
+
+    The validation set can instead be created through random sampling: if it were also created chronologically, it would introduce a gap between the training and test data. The downside is that this approach ignores the drift when validating.
+\end{remark}
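+
+\begin{remark}
+    A minimal sketch of this setup (illustrative assumptions: a standardized feature matrix \texttt{X} in chronological order, 9 features, and an arbitrary bottleneck width):
+\begin{verbatim}
+import torch
+from torch import nn
+
+X = torch.randn(1000, 9)         # binned, standardized features
+
+# Chronological split: train on earlier data only.
+cut = int(0.6 * len(X))
+X_train, X_test = X[:cut], X[cut:]
+
+autoencoder = nn.Sequential(
+    nn.Linear(9, 4), nn.ReLU(),  # encoder
+    nn.Linear(4, 9),             # decoder
+)
+opt = torch.optim.Adam(autoencoder.parameters())
+for _ in range(100):
+    opt.zero_grad()
+    loss = nn.functional.mse_loss(autoencoder(X_train), X_train)
+    loss.backward()
+    opt.step()
+
+# Per-feature reconstruction error on later data.
+with torch.no_grad():
+    err = (autoencoder(X_test) - X_test).pow(2).mean(dim=0)
+\end{verbatim}
+\end{remark}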
+
+
+\subsection{Class rebalancing}
+
+
+\begin{remark}
+    Consider two training samples $\{ (x_1, y_1), (x_2, y_2) \}$. The optimization problem over an objective function $f_\theta$ would be:
+    \[
+        \arg\max_\theta f_\theta(y_1 | x_1) f_\theta(y_2 | x_2)
+    \]
+    If the second sample $(x_2, y_2)$ appears twice, the problem is:
+    \[
+        \begin{split}
+            &\arg\max_\theta f_\theta(y_1 | x_1) f_\theta(y_2 | x_2)^2 \\
+            &= \arg\max_\theta f_\theta(y_1 | x_1)^{\frac{1}{3}} f_\theta(y_2 | x_2)^{\frac{2}{3}}
+        \end{split}
+    \]
+    where the equality holds as raising the objective to the power $\frac{1}{3}$ does not change the maximizer. In other words, duplicating a sample is equivalent to re-weighting the likelihoods.
+\end{remark}
+
+\begin{description}
+    \item[Importance sampling] \marginnote{Importance sampling}
+    Given an empirical risk minimization problem:
+    \[ \arg\min_\theta - \prod_{i=1}^{m} f_\theta(y_i | x_i) \]
+    each training sample can be associated with a different weight $w_i$:
+    \[
+        \begin{split}
+            &\arg\min_\theta - \prod_{i=1}^{m} f_\theta(y_i | x_i)^{w_i} \\
+            &= \arg\min_\theta - \sum_{i=1}^{m} w_i \log\left( f_\theta(y_i | x_i) \right)
+        \end{split}
+    \]
+
+    The weights can be seen as a ratio:
+    \[ w_i = \frac{p_i^*}{p_i} \]
+    where:
+    \begin{itemize}
+        \item $p_i$ is the sampling bias that has to be canceled out.
+        \item $p_i^*$ is the target distribution to emulate.
+    \end{itemize}
+\end{description}
+
+In this problem, we can define the weights as follows (a code sketch is given after the figure below):
+\[
+    \begin{split}
+        p_i &= \frac{1}{n} | \text{samples with the same operating mode as $x_i$} | \\
+        p_i^* &= \frac{1}{n}
+    \end{split}
+\]
+
+\begin{remark}
+    As a uniform target distribution is assumed, the detector will not be sensitive to the operating mode.
+\end{remark}
+
+\begin{figure}[H]
+    \centering
+    \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_importance.pdf}
+    \caption{
+        \parbox[t]{0.7\linewidth}{
+            Autoencoder feature reconstruction error with importance sampling. Note that the operating mode (row 0) now has fewer errors.
+        }
+    }
+\end{figure}
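+
+\begin{remark}
+    A minimal sketch of these weights (assuming a \texttt{pandas} series \texttt{mode} with the operating mode of each training sample, aligned with \texttt{X\_train} and \texttt{autoencoder} from the earlier sketch; names are illustrative):
+\begin{verbatim}
+import torch
+
+n = len(mode)
+# p_i: empirical frequency of each sample's operating mode.
+p = mode.map(mode.value_counts(normalize=True)).to_numpy()
+# w_i = p*_i / p_i, which simplifies to 1 / |same mode as x_i|.
+w = (1 / n) / p
+
+# Weighted reconstruction loss (per-sample squared error).
+weights = torch.as_tensor(w, dtype=torch.float32)
+per_sample = (autoencoder(X_train) - X_train).pow(2).mean(dim=1)
+loss = (weights * per_sample).sum() / weights.sum()
+\end{verbatim}
+\end{remark}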
+
+\subsubsection{Importance sampling applications}
+
+\begin{description}
+    \item[Class rebalancing]
+    Oversample or undersample training samples based on the class imbalance.
+
+    \begin{remark}
+        When evaluating a model trained on a rebalanced training set, accuracy might not be the ideal metric. A cost model or confusion matrix can be used.
+    \end{remark}
+
+    \begin{remark}
+        Rebalancing should not be used when the data is imbalanced but representative of the real distribution.
+    \end{remark}
+
+    \item[Remove bias from continuous attributes]
+    $p_i$ and $p_i^*$ can be probability densities. Therefore, with a continuous feature, it is sufficient to estimate $p_i$ using a density estimator.
+
+    \begin{remark}
+        It can be useful to clip $p_i$ for numerical stability:
+        \[ p_i = \max(l, \min(u, f(x_i, y_i))) \]
+    \end{remark}
+
+    \item[Remove bias from external attributes]
+    If the dataset is generated from biased external sources, it is possible to attempt to debias it by determining the probability that a sample belongs to the dataset and using it as $p_i$.
+
+    \begin{example}
+        A dataset for organ transplants already contains only patients for whom doctors think the surgery is more likely to be successful.
+
+        $p_i$ can be the probability that the patient has been selected to be an organ receiver.
+    \end{example}
+
+    \item[Sample-specific variance]
+    Importance sampling applied to MSE makes it possible to remove its homoscedasticity assumption (i.e., to allow a non-constant variance).
+
+    Consider MSE formulated as maximum likelihood estimation under a unit-variance normal distribution:
+    \[ \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{1}{2}(y_i - h_\theta(x_i))^2 \right) \right) \]
+    By adding the weights $\frac{1}{\hat{\sigma}_i^2}$, the problem becomes (up to constant terms that do not affect the minimizer):
+    \[
+        \begin{split}
+            &\arg\min_\theta - \sum_{i=1}^{m} \frac{1}{\hat{\sigma}_i^2} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{1}{2}(y_i - h_\theta(x_i))^2 \right) \right) \\
+            &= \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{(y_i - h_\theta(x_i))^2}{2\hat{\sigma}_i^2} \right) \right) \\
+        \end{split}
+    \]
+    In other words, a weight of the form $\frac{1}{\hat{\sigma}_i^2}$ represents the inverse sample variance. Therefore, it is possible to specify a different variance for each sample (a sketch follows).
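+
+    \begin{remark}
+        A minimal sketch of this weighting (illustrative: \texttt{sigma2} holds an assumed per-sample variance estimate $\hat{\sigma}_i^2$, and the targets and predictions are dummies):
+\begin{verbatim}
+import torch
+
+y = torch.randn(100, 1)            # targets
+y_hat = torch.randn(100, 1)        # model predictions h(x_i)
+sigma2 = torch.rand(100, 1) + 0.1  # per-sample variance estimates
+
+# Weighted MSE: each squared residual is scaled by 1 / sigma_i^2,
+# i.e., a Gaussian likelihood with sample-specific variance.
+loss = ((y - y_hat).pow(2) / sigma2).mean()
+\end{verbatim}
+    \end{remark}
+\end{description}
\ No newline at end of file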