\chapter{Arrivals prediction: Hospital emergency room}


\section{Data}
The dataset contains accesses to the emergency room of the Maggiore Hospital in Bologna.

Each row of the dataset represents a patient and the features are:
\begin{descriptionlist}
    \item[\texttt{Triage}] Time of arrival.
    \item[\texttt{TKCharge}] Time of first visit.
    \item[\texttt{Code}] Priority (\texttt{white}, \texttt{green}, \texttt{yellow}, and \texttt{red}).
    \item[\texttt{Outcome}] Indicates whether the patient was admitted or left.
\end{descriptionlist}

\begin{description}
    \item[Binning]
    As the problem is to predict the total number of arrivals at fixed intervals, binning can be used to obtain a dataset with hourly granularity (a sketch follows below).
\end{description}
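A minimal pandas sketch of this binning step. The data is a made-up stand-in; only the column names follow the features described above.
\begin{verbatim}
import pandas as pd

# Made-up stand-in for the dataset: one row per patient,
# "Triage" holds the arrival timestamp (as in the features above).
df = pd.DataFrame({
    "Triage": pd.to_datetime([
        "2020-01-01 06:05", "2020-01-01 06:40",
        "2020-01-01 07:10", "2020-01-01 09:55",
    ]),
    "Code": ["green", "yellow", "white", "green"],
})

# Non-overlapping one-hour bins: the target becomes
# the number of arrivals in each hour.
arrivals = df.set_index("Triage").resample("1h").size()
\end{verbatim}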
\section{Approaches}

\begin{remark}
    MSE assumes that the conditional distribution of the predictions follows a normal distribution.
\end{remark}

\begin{remark}
    Arrivals usually follow a Poisson distribution as:
    \begin{itemize}
        \item The distribution is skewed, with a longer tail towards larger counts.
        \item All values are positive.
        \item Events are independent.
    \end{itemize}

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_poisson.pdf}
        \caption{Arrivals distribution at 6 a.m.}
    \end{figure}

    \begin{theorem}
        If the inter-arrival times are exponentially distributed with a fixed rate, the arrival counts will always follow a Poisson distribution.
    \end{theorem}

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_interarrival.pdf}
        \caption{Dataset inter-arrival counts (the first bar is due to binning artifacts)}
    \end{figure}
\end{remark}
\subsection{Neuro-probabilistic model}

\begin{description}
    \item[Poisson distribution] \marginnote{Poisson distribution}
    Distribution with discrete support whose probability mass function is defined as:
    \[ p(k, \lambda) = \frac{\lambda^k e^{-\lambda}}{k!} \]
    where $\lambda$ is the occurrence rate of the events.

    \begin{remark}
        Both the mean and the variance of a Poisson distribution are $\lambda$ (the standard deviation is $\sqrt{\lambda}$).
    \end{remark}

    \begin{remark}
        The skewness of the distribution is $\lambda^{-\frac{1}{2}}$. For smaller values of $\lambda$, the distribution has a more pronounced positive (right) skew. For larger values of $\lambda$, the skew becomes less observable.
    \end{remark}

    \begin{remark}
        For this problem, $\lambda$ is the average number of arrivals in each bin.
    \end{remark}
    \item[Neuro-probabilistic model] \marginnote{Neuro-probabilistic model}
    Model that combines statistics and machine learning.

    \begin{remark}
        This class of models is known in the statistics literature as generalized linear models. With neural networks (i.e., non-linearity), they do not have an official name; ``neuro-probabilistic model'' is an unofficial one.
    \end{remark}

    This problem can be formulated through the following probabilistic model:
    \[ y \sim \texttt{Poisson}(\lambda(x)) \]
    where $y$ is the number of arrivals and $\lambda(x)$ is the rate parametrized on the temporal information $x$.

    The rate can be approximated using an estimator:
    \[ y \sim \texttt{Poisson}(\lambda(x, \theta)) \]
    where $\lambda(x, \theta)$ is a regression model.
    \begin{description}
        \item[Loss]
        Training is done through maximum likelihood estimation using the Poisson distribution:
        \[
            \begin{split}
                &\arg\min_\theta - \sum_{i=1}^{m} \log\left( f(y_i, \lambda(x_i, \theta)) \right) \\
                &= \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{\lambda(x_i, \theta)^{y_i} e^{-\lambda(x_i, \theta)}}{y_i!} \right)
            \end{split}
        \]
        \item[Architecture]
        Use an MLP to predict $\hat{\lambda}$ and then, instead of outputting the prediction directly, output a Poisson distribution object with rate $\hat{\lambda}$:
        \[
            \begin{split}
                \hat{\lambda} &= \texttt{MLP}(x) \\
                \texttt{out} &= \texttt{Poisson}(\cdot, \hat{\lambda})
            \end{split}
        \]

        Some considerations must be made (a code sketch is given after this list):
        \begin{descriptionlist}
            \item[Only positive rates]
            As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm and an exponentiation to achieve this:
            \[
                \begin{split}
                    \log(\hat{\lambda}) &= \texttt{MLP}(x) \\
                    \texttt{out} &= \texttt{Poisson}\left( \cdot, \exp\left(\log(\hat{\lambda})\right) \right) = \texttt{Poisson}(\cdot, \hat{\lambda})
                \end{split}
            \]

            \item[Standardization]
            The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete.

            However, if the input is standardized (for training stability), the output of the network with standard weight initialization will have mean $0$. Therefore, at the beginning, the network will predict $\hat{\lambda} \approx 1$ (i.e., the MLP predicts $\log(\hat{\lambda}) \approx 0$ and then $\hat{\lambda} = \exp\left(\log(\hat{\lambda})\right) \approx 1$), which might not be a reasonable starting point.

            It is possible to provide an initial guess $\bar{\lambda}$ for $\hat{\lambda}$. This value can be used as a multiplicative factor, so that the first predictions will be close to $\bar{\lambda} \cdot 1$:
            \[
                \begin{split}
                    \log(\hat{\lambda}) &= \texttt{MLP}(x) \\
                    \texttt{out} &= \texttt{Poisson}\left( \cdot, \bar{\lambda}\exp\left(\log(\hat{\lambda})\right) \right) = \texttt{Poisson}(\cdot, \bar{\lambda}\hat{\lambda})
                \end{split}
            \]

            \begin{remark}
                For this problem, $\bar{\lambda}$ can be the average number of arrivals in each bin.
            \end{remark}
        \end{descriptionlist}
    \end{description}
\end{description}
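A minimal PyTorch sketch of the model above. The layer sizes, the toy data, and the training loop details are assumptions, not part of the original formulation.
\begin{verbatim}
import torch
import torch.nn as nn

class PoissonRegressor(nn.Module):
    # The MLP outputs log(lambda_hat); the rate is bar_lambda * exp(log lambda_hat).
    def __init__(self, n_features, bar_lambda):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(n_features, 32), nn.ReLU(),
            nn.Linear(32, 1),
        )
        self.bar_lambda = bar_lambda  # initial guess (mean arrivals per bin)

    def forward(self, x):
        log_rate = self.mlp(x).squeeze(-1)             # unconstrained
        rate = self.bar_lambda * torch.exp(log_rate)   # strictly positive
        return torch.distributions.Poisson(rate)

# Toy data: standardized temporal features x and arrival counts y.
x = torch.randn(256, 4)
y = torch.poisson(torch.full((256,), 5.0))

model = PoissonRegressor(n_features=4, bar_lambda=float(y.mean()))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

for _ in range(200):
    optimizer.zero_grad()
    loss = -model(x).log_prob(y).mean()  # negative Poisson log-likelihood
    loss.backward()
    optimizer.step()
\end{verbatim}
For the loss, \texttt{torch.nn.PoissonNLLLoss} with \texttt{log\_input=True} is an equivalent built-in alternative, up to the constant $\log(y_i!)$ term.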
\chapter{Component wear anomalies: Skin wrapper machines}


\section{Data}
The dataset represents a single run-to-failure experiment of a skin wrapper machine, with a 4 ms sampling period and different segments (i.e., operating modes).

\begin{figure}[H]
    \centering
    \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_heatmap.pdf}
    \caption{Dataset heatmap}
\end{figure}
The following observations can be made:
\begin{itemize}
    \item Some features (i.e., rows 0 and 8 of the heatmap) are fixed for long periods of time (i.e., piece-wise constant). They are most likely controlled parameters.
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_constant.pdf}
        \end{figure}

    \item A feature (i.e., row 2 of the heatmap) seems constant, but in reality it has many short-lived peaks.
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_peaks.pdf}
        \end{figure}

        \begin{remark}
            This type of signal might be useful to determine a period in the series.
        \end{remark}

    \item Some features (i.e., rows 3 and 5 of the heatmap) contain sudden localized deviations. This is most likely due to external interventions.
        \begin{figure}[H]
            \centering
            \includegraphics[width=0.8\linewidth]{./img/_skinwrapper_deviation.pdf}
        \end{figure}
\end{itemize}
\subsection{Binning}

\begin{description}
    \item[Binning] \marginnote{Binning}
    Reduce the granularity of the data using a non-overlapping sliding window and aggregation functions (see the sketch below).

    \begin{remark}
        With high-frequency data, some approaches (e.g., a neural network) might not be able to keep up if real-time predictions are required.
    \end{remark}

    \begin{remark}
        After binning, the number of samples is reduced, but the number of features might increase (one feature per aggregation function).
    \end{remark}
\end{description}
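A minimal pandas sketch of this binning step. The signal names, the window length, and the aggregation functions are assumptions.
\begin{verbatim}
import numpy as np
import pandas as pd

# Made-up high-frequency signals sampled every 4 ms.
n = 10_000
idx = pd.date_range("2024-01-01", periods=n, freq="4ms")
df = pd.DataFrame({"speed": np.random.randn(n),
                   "torque": np.random.randn(n)}, index=idx)

# Non-overlapping 1-second windows with several aggregations:
# fewer rows, but each signal yields several binned features.
binned = df.resample("1s").agg(["mean", "std", "min", "max"])
binned.columns = ["_".join(c) for c in binned.columns]
\end{verbatim}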
\section{Approaches}

\subsection{Autoencoder}

\begin{description}
    \item[Autoencoder]
    Train an autoencoder on earlier non-anomalous data and use the reconstruction error to detect component wear (a code sketch follows this block).

    Results show that there is a high reconstruction error for the operating mode feature (i.e., the first row), which is a controlled parameter. This hints that some operating modes are too infrequent for the model to learn to reconstruct them.
    \begin{figure}[H]
        \centering
        \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_autoencoder.pdf}
    \end{figure}
\end{description}
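A minimal sketch of this approach. The architecture, the sizes, and the choice of the ``early'' portion are assumptions.
\begin{verbatim}
import torch
import torch.nn as nn

# Made-up binned dataset: rows in chronological order, binned features as columns.
X = torch.randn(5000, 16)
X_train = X[:2000]  # earliest part of the run, assumed non-anomalous

autoencoder = nn.Sequential(
    nn.Linear(16, 8), nn.ReLU(),   # encoder
    nn.Linear(8, 16),              # decoder
)
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

for _ in range(100):
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(autoencoder(X_train), X_train)
    loss.backward()
    optimizer.step()

# Reconstruction error over the whole run: a rising error hints at wear.
with torch.no_grad():
    error = ((autoencoder(X) - X) ** 2).mean(dim=1)
\end{verbatim}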
\begin{remark}
    There is a distribution drift (i.e., the future does not behave like the past) in the dataset.

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.65\linewidth]{./img/_skinwrapper_distribution.pdf}
    \end{figure}
\end{remark}

\begin{remark}
    To account for distribution drift, splitting the training and test sets chronologically is a better approach than random sampling (which ignores the drift).

    The validation set can instead be created through random sampling: if it were created chronologically, it would leave a gap between training and test data. However, this approach ignores drift when validating.
\end{remark}
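A short sketch of this splitting strategy; the split fractions are assumptions.
\begin{verbatim}
import numpy as np

n = 5_000                        # samples, assumed in chronological order
split = int(0.8 * n)
test_idx = np.arange(split, n)   # latest data: test set (respects the drift)

# Validation set sampled at random from the earlier part, so that no gap
# is left between training and test data (drift is ignored when validating).
rng = np.random.default_rng(0)
val_idx = rng.choice(split, size=int(0.1 * n), replace=False)
train_idx = np.setdiff1d(np.arange(split), val_idx)
\end{verbatim}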
\subsection{Class rebalancing}

\begin{remark}
    Consider two training samples $\{ (x_1, y_1), (x_2, y_2) \}$. The maximum likelihood optimization problem over an objective function $f_\theta$ would be:
    \[
        \arg\max_\theta f_\theta(y_1 | x_1) f_\theta(y_2 | x_2)
    \]
    If the second sample $(x_2, y_2)$ appears twice, the problem becomes:
    \[
        \begin{split}
            &\arg\max_\theta f_\theta(y_1 | x_1) f_\theta(y_2 | x_2)^2 \\
            &= \arg\max_\theta f_\theta(y_1 | x_1)^{\frac{1}{3}} f_\theta(y_2 | x_2)^{\frac{2}{3}}
        \end{split}
    \]
    (the exponents can be normalized since taking the cube root does not change the maximizer). In other words, duplicating a sample is equivalent to weighting its likelihood.
\end{remark}
\begin{description}
    \item[Importance sampling] \marginnote{Importance sampling}
    Given an empirical risk minimization problem:
    \[ \arg\min_\theta - \prod_{i=1}^{m} f_\theta(y_i | x_i) \]
    each training sample can be associated with a different weight $w_i$:
    \[
        \begin{split}
            &\arg\min_\theta - \prod_{i=1}^{m} f_\theta(y_i | x_i)^{w_i} \\
            &= \arg\min_\theta - \sum_{i=1}^{m} w_i \log\left( f_\theta(y_i | x_i) \right)
        \end{split}
    \]

    The weights can be seen as a ratio:
    \[ w_i = \frac{p_i^*}{p_i} \]
    where:
    \begin{itemize}
        \item $p_i$ is the sampling bias that has to be canceled out.
        \item $p_i^*$ is the target distribution to emulate.
    \end{itemize}
\end{description}
In this problem, we can define the weights as follows:
\[
    \begin{split}
        p_i &= \frac{1}{n} | \text{samples with the same operating mode as $x_i$} | \\
        p_i^* &= \frac{1}{n}
    \end{split}
\]

\begin{remark}
    As a uniform target distribution is assumed, the detector will not be sensitive to the operating mode.
\end{remark}
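A minimal sketch of how such weights could be computed and used. The mode labels and the per-sample losses are made up.
\begin{verbatim}
import numpy as np
import torch

# Made-up operating-mode label for each training sample (mode 2 is rare).
modes = np.random.choice([0, 1, 2], size=2000, p=[0.7, 0.2, 0.1])

# w_i = p_i^* / p_i with p_i^* = 1/n and p_i = count(mode of x_i) / n:
# samples from rarer modes receive proportionally larger weights.
_, counts = np.unique(modes, return_counts=True)
weights = torch.tensor(1.0 / counts[modes], dtype=torch.float32)

# Per-sample losses (e.g. autoencoder reconstruction errors) are then
# combined using the weights instead of a plain mean.
per_sample_loss = torch.rand(2000)   # placeholder for the real losses
loss = (weights * per_sample_loss).mean()
\end{verbatim}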
\begin{figure}[H]
    \centering
    \includegraphics[width=0.9\linewidth]{./img/_skinwrapper_importance.pdf}
    \caption{
        \parbox[t]{0.7\linewidth}{
            Autoencoder feature reconstruction error with importance sampling. Note that the operating mode (row 0) now has fewer errors.
        }
    }
\end{figure}


\subsubsection{Importance sampling applications}
\begin{description}
    \item[Class rebalancing]
    Oversample or undersample training samples based on the class imbalance.

    \begin{remark}
        When evaluating a model trained on a rebalanced training set, accuracy might not be the ideal metric. A cost model or a confusion matrix can be used instead.
    \end{remark}

    \begin{remark}
        Rebalancing should not be used when the data is imbalanced but representative of the real distribution.
    \end{remark}

    \item[Remove bias from continuous attributes]
    $p_i$ and $p_i^*$ can be probability densities. Therefore, with a continuous feature, it is sufficient to estimate $p_i$ using a density estimator.

    \begin{remark}
        It can be useful to clip $p_i$ for numerical stability:
        \[ p_i = \max(l, \min(u, f(x_i, y_i))) \]
    \end{remark}
    \item[Remove bias from external attributes]
    If the dataset is generated from biased external sources, it is possible to attempt to debias it by determining the probability that a sample belongs to the dataset and using it as $p_i$.

    \begin{example}
        A dataset for organ transplants already contains only patients for whom doctors think the surgery is more likely to be successful.

        $p_i$ can be the probability that the patient has been selected to be an organ receiver.
    \end{example}
    \item[Sample-specific variance]
    Importance sampling applied to MSE makes it possible to remove its homoscedasticity assumption (i.e., to allow a non-constant variance); see the sketch after this list.

    Consider MSE formulated as follows:
    \[ \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{1}{2}(y_i - h_\theta(x_i))^2 \right) \right) \]
    By adding $\frac{1}{\hat{\sigma}_i^2}$ as weights, the problem becomes:
    \[
        \begin{split}
            &\arg\min_\theta - \sum_{i=1}^{m} \frac{1}{\hat{\sigma}_i^2} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{1}{2}(y_i - h_\theta(x_i))^2 \right) \right) \\
            &= \arg\min_\theta - \sum_{i=1}^{m} \log\left( \frac{1}{\sqrt{2\pi}} \exp\left( -\frac{(y_i - h_\theta(x_i))^2}{2\hat{\sigma}_i^2} \right) \right)
        \end{split}
    \]
    In other words, a weight of the form $\frac{1}{\hat{\sigma}_i^2}$ represents an inverse sample variance. Therefore, it is possible to specify a different variance for each sample.
\end{description}
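A tiny sketch of the weighted MSE above; the per-sample variance estimates are assumed to be given.
\begin{verbatim}
import torch

# Targets, predictions and per-sample variance estimates (all made up here;
# how the variances are obtained is problem-specific).
y      = torch.tensor([2.0, 5.0, 1.0])
y_hat  = torch.tensor([2.5, 4.0, 1.2])
sigma2 = torch.tensor([0.5, 2.0, 0.1])

# Weights 1 / sigma_i^2: noisier samples contribute less to the loss.
weighted_mse = ((y - y_hat) ** 2 / sigma2).mean()
\end{verbatim}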