Add A3I neuro-probabilistic model + survival analysis

2026-02-04 07:41:43 +01:00 · 2024-11-01 11:29:58 +01:00
parent e9b9b4835c
commit 7dbab460a8
4 changed files with 163 additions and 4 deletions
--- a/src/year2/artificial-intelligence-in-industry/img/_rul_censoring.pdf
+++ b/src/year2/artificial-intelligence-in-industry/img/_rul_censoring.pdf
--- a/src/year2/artificial-intelligence-in-industry/img/_rul_neuroprobabilistic.pdf
+++ b/src/year2/artificial-intelligence-in-industry/img/_rul_neuroprobabilistic.pdf
--- a/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex
+++ b/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex
@ -1,4 +1,4 @@
-\chapter{Arrivals prediction: Hospital emergency room}
+\chapter{Arrivals prediction: Hospital emergency room} \label{ch:ap_hospital}


 \section{Data}
@ -22,7 +22,7 @@ Each row of the dataset represents a patient and the features are:
 \section{Approaches}

 \begin{remark}
-    MSE assumes that the conditional distribution of the predictions follows a normal distribution.
+    MSE assumes that the conditional distribution $\prob{y \mid x; \theta}$ of the predictions follows a normal distribution (i.e., $y \sim \mathcal{N}(\mu(x; \theta), \sigma)$).
 \end{remark}

 \begin{remark}
@ -107,7 +107,7 @@ Each row of the dataset represents a patient and the features are:
                Some considerations must be made:
                \begin{descriptionlist}
                    \item[Only positive rates] 
-                        As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm and an exponentiation to achieve this:
+                        As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm (i.e., assume that the MLP outputs a log-rate) and an exponentiation to achieve this:
                        \[
                            \begin{split}
                                \log(\hat{\lambda}) &= \texttt{MLP}(x) \\
@ -115,6 +115,10 @@ Each row of the dataset represents a patient and the features are:
                            \end{split}
                        \]

+                        \begin{remark}
+                            A strictly positive activation function can also be used.
+                        \end{remark}
+
                    \item[Standardization]
                        The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete.

@ -133,4 +137,59 @@ Each row of the dataset represents a patient and the features are:
                        \end{remark}
                \end{descriptionlist}
        \end{description}
+
+        \begin{remark}
+            With one-hot encoding, a linear model can be non-linearized into a lookup table.
+
+            \indenttbox
+            \begin{example}
+                \phantom{}
+
+                \begin{minipage}[t]{0.3\linewidth}
+                    Consider the dataset:
+                    \begin{table}[H]
+                        \centering
+                        \footnotesize
+                        \begin{tabular}{c|c}
+                            \toprule
+                            $x$ & $y$ \\
+                            \midrule
+                            $0$ & $1$ \\
+                            $1$ & $4$ \\
+                            $2$ & $2$ \\
+                            \bottomrule
+                        \end{tabular}
+                    \end{table}
+                \end{minipage}
+                \begin{minipage}[t]{0.65\linewidth}
+                    A linear model learns a straight line:
+                    \[ f(x) = \alpha x \]
+                \end{minipage}\\[1em]
+                \begin{minipage}[t]{0.3\linewidth}
+                    With the dataset:
+                    \begin{table}[H]
+                        \centering
+                        \footnotesize
+                        \begin{tabular}{ccc|c}
+                            \toprule
+                            $x_0$ & $x_1$ & $x_2$ & $y$ \\
+                            \midrule
+                            $1$ & $0$ & $0$ & $1$ \\
+                            $0$ & $1$ & $0$ & $4$ \\
+                            $0$ & $0$ & $1$ & $2$ \\
+                            \bottomrule
+                        \end{tabular}
+                    \end{table}
+                \end{minipage}
+                \begin{minipage}[t]{0.65\linewidth}
+                    A linear model learns a linear combination:
+                    \[ f(x) = \alpha x_0 + \beta x_1 + \gamma x_2 \]
+                    This makes it easy to fit the dataset exactly with $\alpha=1$, $\beta=4$, and $\gamma=2$.
+                \end{minipage}
+            \end{example}
+        \end{remark}
+
+        \begin{remark}
+            Having access to the full distribution makes it possible to compute any statistic of interest. For instance, it is possible to plot the mean and the standard deviation.
+        \end{remark}
 \end{description}
--- a/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex
+++ b/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex
@ -253,4 +253,104 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
                    \end{enumerate}
                \end{enumerate}
        \end{description}
-\end{description}
+\end{description}
+
+
+\subsection{Survival analysis model}
+
+\begin{remark}
+    Chronologically, this approach has been presented after \Cref{ch:ap_hospital}.
+\end{remark}
+
+\begin{description}
+    \item[Survival analysis model] \marginnote{Survival analysis model}
+        Probabilistic model to estimate the survival time of an entity.
+
+    \item[Survival analysis formalization]
+        Consider a random variable $T$ to model the survival time. The simplest model can be defined as:
+        \[ t \sim \prob{T} \]
+        The remaining survival time depends on the current time $\bar{t}$, therefore we have that:
+        \[ t \sim \prob{T \mid \bar{t}} \]
+        Remaining survival time also depends on the past sensor readings $X_{\leq \bar{t}}$ and future readings $X_{> \bar{t}}$ (at this stage, we want to capture what affects the survival time, even if we do not have access to some variables), therefore we have that:
+        \[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \]
+
+        \begin{description}
+            \item[Marginalization] 
+                Average over all the possible outcomes of a random variable to cancel it out.
+
+                For this problem, we do not have access to the future sensor readings, so we can marginalize them out:
+                \[ t \sim \underset{X_{> \bar{t}} \sim \prob{X_{> \bar{t}}}}{\mathbb{E}} \left[ \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \right] \]
+        \end{description}
+
+        \begin{remark}
+            In probabilistic terms, the regression approach previously taken can be modelled as:
+            \[ t \sim \mathcal{N}(\mu(X_{\bar{t}}), \sigma) \]
+            where $\mu(\cdot)$ is the regressor.
+
+            Compared to the survival analysis model, there are the following differences:
+            \begin{itemize}
+                \item Regressor only reasons on the sensor readings $X_{\bar{t}}$ at a single time step. 
+                \item Regressor does not consider the current time.
+                \item Regressor assumes normal distribution with constant variance.
+            \end{itemize}
+        \end{remark}
+
+    \item[Neuro-probabilistic model]
+        We can assume that the survival time follows a normal distribution whose mean and standard deviation are both parametrized on the input:
+        \[ t \sim \mathcal{N}(\mu(X_{\bar{t}}, \bar{t}), \sigma(X_{\bar{t}}, \bar{t})) \]
+
+        \begin{remark}
+            The readings of a single time step are used as it can be shown that using multiple time steps does not yield significant improvements for this dataset.
+        \end{remark}
+
+        \begin{description}
+            \item[Architecture]
+                Use a neural network that outputs both $\mu$ and $\sigma$, which are then passed to a distribution head.
+        \end{description}
+
+        \begin{figure}[H]
+            \centering
+            \includegraphics[width=0.7\linewidth]{./img/_rul_neuroprobabilistic.pdf}
+        \end{figure}
+\end{description}
+
+
+% \subsection{Survival function}
+
+% \begin{description}
+%     \item[Censoring] \marginnote{Censoring}
+%         Hide from the dataset key events.
+
+%         \begin{remark}
+%             For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
+%             \begin{figure}[H]
+%                 \centering
+%                 \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
+%                 \caption{Example of partial runs}
+%             \end{figure}
+%         \end{remark}
+
+%     \item[Survival function] \marginnote{Survival function}
+%         Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
+%         \[ S(\bar{t}) = \prob{T > \bar{t}} \]
+%         In other words, it is the probability of surviving at least until time $\bar{t}$.
+
+%         For this problem, the survival function can account for past sensor readings:
+%         \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
+
+%     \item[Hazard function] \marginnote{Hazard function}
+%         Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
+%         \[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
+%         In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
+
+%         With discrete time, the survival function can be factorized using the hazard function:
+%         \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
+
+%         For this problem, the hazard function is the following:
+%         \[ 
+%             \begin{gathered}
+%                 \lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
+%                 S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
+%             \end{gathered}
+%         \]
+% \end{description}