Add A3I neuro-probabilistic model + survival analysis

This commit is contained in:
2024-11-01 11:29:58 +01:00
parent e9b9b4835c
commit 7dbab460a8
4 changed files with 163 additions and 4 deletions

View File

@ -1,4 +1,4 @@
\chapter{Arrivals prediction: Hospital emergency room} \label{ch:ap_hospital}
\section{Data}
@ -22,7 +22,7 @@ Each row of the dataset represents a patient and the features are:
\section{Approaches}
\begin{remark}
MSE assumes that the conditional distribution $\prob{y | x; \theta}$ of the predictions follows a normal distribution (i.e., $y \sim \mathcal{N}(\mu(x; \theta), \sigma)$).
\end{remark}
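\begin{remark}
To make this link explicit, the negative log-likelihood of such a Gaussian with fixed $\sigma$ reduces to the squared error up to constants:
\[ -\log \prob{y \mid x; \theta} = \frac{(y - \mu(x; \theta))^2}{2\sigma^2} + \log\left(\sigma \sqrt{2\pi}\right) \]
so minimizing the MSE is equivalent to maximizing the likelihood under this assumption.
\end{remark}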
\begin{remark}
@ -107,7 +107,7 @@ Each row of the dataset represents a patient and the features are:
Some considerations must be made:
\begin{descriptionlist}
\item[Only positive rates]
As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm (i.e., assume that the MLP outputs a log-rate) and an exponentiation to achieve this (a code sketch is given after this list):
\[
\begin{split}
\log(\hat{\lambda}) &= \texttt{MLP}(x) \\
@ -115,6 +115,10 @@ Each row of the dataset represents a patient and the features are:
\end{split}
\]
\begin{remark}
A strictly positive activation function can also be used.
\end{remark}
\item[Standardization]
The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete.
@ -133,4 +137,59 @@ Each row of the dataset represents a patient and the features are:
\end{remark}
\end{descriptionlist}
\end{description}
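\begin{remark}
A minimal implementation sketch of the log-rate formulation (PyTorch is assumed here; the layer sizes and the dummy data are illustrative and not taken from the dataset):
\begin{verbatim}
import torch
import torch.nn as nn

# The MLP outputs log(lambda_hat); the exponentiation is handled by the
# Poisson negative log-likelihood, so the predicted rate is always positive.
mlp = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Linear(32, 1))
loss_fn = nn.PoissonNLLLoss(log_input=True)   # expects the log-rate as input
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)

x = torch.randn(64, 8)                        # standardized inputs (dummy)
y = torch.poisson(torch.full((64, 1), 3.0))   # integer counts (dummy targets)

log_rate = mlp(x)                             # log(lambda_hat) = MLP(x)
loss_fn(log_rate, y).backward()
optimizer.step()

rate = torch.exp(log_rate)                    # lambda_hat = exp(MLP(x)) > 0
\end{verbatim}
\end{remark}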
\begin{remark}
With one-hot encoding, a linear model effectively becomes a lookup table: instead of being constrained to a straight line in the original input, it can assign an arbitrary value to each category.
\indenttbox
\begin{example}
\phantom{}
\begin{minipage}[t]{0.3\linewidth}
Consider the dataset:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{c|c}
\toprule
$x$ & $y$ \\
\midrule
$0$ & $1$ \\
$1$ & $4$ \\
$2$ & $2$ \\
\bottomrule
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[t]{0.65\linewidth}
A linear model learns a straight line:
\[ f(x) = \alpha x \]
\end{minipage}\\[1em]
\begin{minipage}[t]{0.3\linewidth}
With the dataset:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{ccc|c}
\toprule
$x_0$ & $x_1$ & $x_2$ & $y$ \\
\midrule
$1$ & $0$ & $0$ & $1$ \\
$0$ & $1$ & $0$ & $4$ \\
$0$ & $0$ & $1$ & $2$ \\
\bottomrule
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[t]{0.65\linewidth}
A linear model learns a linear combination:
\[ f(x) = \alpha x_0 + \beta x_1 + \gamma x_2 \]
which makes it possible to fit the dataset exactly with $\alpha=1$, $\beta=4$, and $\gamma=2$.
\end{minipage}
\end{example}
\end{remark}
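\begin{remark}
A short sketch of the lookup-table behaviour (NumPy is assumed; the weights are the ones fitted in the example above):
\begin{verbatim}
import numpy as np

weights = np.array([1.0, 4.0, 2.0])     # alpha, beta, gamma

def onehot(category, num_categories=3):
    v = np.zeros(num_categories)
    v[category] = 1.0
    return v

# The dot product with a one-hot vector simply selects weights[category],
# i.e. the linear model acts as a lookup table.
for category, target in [(0, 1.0), (1, 4.0), (2, 2.0)]:
    assert weights @ onehot(category) == target
\end{verbatim}
\end{remark}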
\begin{remark}
Having access to the full distribution makes it possible to compute any statistic of interest. For instance, it is possible to plot the mean and standard deviation.
\end{remark}
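\begin{remark}
For instance, under the Poisson model above, both follow directly from the predicted rate:
\[ \mathbb{E}[y \mid x] = \hat{\lambda}(x) \qquad \text{sd}[y \mid x] = \sqrt{\hat{\lambda}(x)} \]
\end{remark}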
\end{description}

View File

@ -253,4 +253,104 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
\end{enumerate}
\end{enumerate}
\end{description}
\end{description}
\subsection{Survival analysis model}
\begin{remark}
Chronologically, this approach has been presented after \Cref{ch:ap_hospital}.
\end{remark}
\begin{description}
\item[Survival analysis model] \marginnote{Survival analysis model}
Probabilistic model to estimate the survival time of an entity.
\item[Survival analysis formalization]
Consider a random variable $T$ to model the survival time. The simplest model can be defined as:
\[ t \sim \prob{T} \]
The remaining survival time depends on the current time $\bar{t}$, therefore we have that:
\[ t \sim \prob{T \mid \bar{t}} \]
The remaining survival time also depends on the past sensor readings $X_{\leq \bar{t}}$ and on the future readings $X_{> \bar{t}}$ (at this stage, we want to capture what affects the survival time, even if we do not have access to some variables), therefore we have that:
\[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \]
\begin{description}
\item[Marginalization]
Average over all the possible outcomes of a random variable to cancel it out.
For this problem, we do not have access to the future sensor readings, so we can marginalize them out:
\[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}} = \underset{X_{> \bar{t}} \sim \prob{X_{> \bar{t}}}}{\mathbb{E}} \left[ \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \right] \]
\end{description}
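\begin{remark}
Written out explicitly (in discrete form for readability), the marginalization is a weighted average over the possible future readings:
\[ \prob{T \mid \bar{t}, X_{\leq \bar{t}}} = \sum_{x} \prob{X_{> \bar{t}} = x} \, \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}} = x} \]
\end{remark}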
\begin{remark}
In probabilistic terms, the regression approach previously taken can be modelled as:
\[ t \sim \mathcal{N}(\mu(X_{\bar{t}}), \sigma) \]
where $\mu(\cdot)$ is the regressor.
Compared to the survival analysis model, there are the following differences:
\begin{itemize}
\item The regressor only reasons on the sensor readings $X_{\bar{t}}$ at a single time step.
\item The regressor does not consider the current time $\bar{t}$.
\item The regressor assumes a normal distribution with constant variance.
\end{itemize}
\end{remark}
\item[Neuro-probabilistic model]
We can assume that the survival time follows a normal distribution whose mean and standard deviation are both predicted by the model:
\[ t \sim \mathcal{N}(\mu(X_{\bar{t}}, \bar{t}), \sigma(X_{\bar{t}}, \bar{t})) \]
\begin{remark}
The readings of a single time step are used as it can be shown that using multiple time steps does not yield significant improvements for this dataset.
\end{remark}
\begin{description}
\item[Architecture]
Use a neural network that outputs both $\mu$ and $\sigma$, which are then passed to a distribution head (a minimal code sketch is given after the figure).
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_rul_neuroprobabilistic.pdf}
\end{figure}
\end{description}
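\begin{remark}
A minimal sketch of the two-headed network with a Normal distribution head (PyTorch is assumed; layer sizes, feature counts, and the dummy data are illustrative):
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuroProbabilisticRUL(nn.Module):
    def __init__(self, num_sensors=24, hidden=64):
        super().__init__()
        # Backbone conditioned on the readings X_t and the current time t_bar
        self.backbone = nn.Sequential(nn.Linear(num_sensors + 1, hidden),
                                      nn.ReLU())
        self.mu_head = nn.Linear(hidden, 1)      # mean of the survival time
        self.sigma_head = nn.Linear(hidden, 1)   # scale of the survival time

    def forward(self, x_t, t_bar):
        h = self.backbone(torch.cat([x_t, t_bar], dim=-1))
        mu = self.mu_head(h)
        sigma = F.softplus(self.sigma_head(h)) + 1e-6   # keep sigma > 0
        return torch.distributions.Normal(mu, sigma)    # distribution head

model = NeuroProbabilisticRUL()
x_t = torch.randn(32, 24)         # sensor readings at time t_bar (dummy)
t_bar = torch.rand(32, 1) * 100   # current time (dummy)
t_obs = torch.rand(32, 1) * 150   # observed survival times (dummy)

dist = model(x_t, t_bar)
loss = -dist.log_prob(t_obs).mean()   # negative log-likelihood
loss.backward()
\end{verbatim}
\end{remark}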
% \subsection{Survival function}
% \begin{description}
% \item[Censoring] \marginnote{Censoring}
% Hide key events from the dataset.
% \begin{remark}
% For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
% \begin{figure}[H]
% \centering
% \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
% \caption{Example of partial runs}
% \end{figure}
% \end{remark}
% \item[Survival function] \marginnote{Survival function}
% Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
% \[ S(\bar{t}) = \prob{T > \bar{t}} \]
% In other words, it is the probability of surviving beyond time $\bar{t}$.
% For this problem, the survival function can account for past sensor readings:
% \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
% \item[Hazard function] \marginnote{Hazard function}
% Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
% \[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
% In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
% With discrete time, the survival function can be factorized using the hazard function:
% \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
% For this problem, the hazard function is the following:
% \[
% \begin{gathered}
% \lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
% S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
% \end{gathered}
% \]
% \end{description}
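% \begin{remark}
% A small numeric illustration (the hazard value is arbitrary): with a constant hazard $\lambda(\bar{t}) = 0.1$ at every step, the probability of surviving the first three steps is:
% \[ S(3) = (1 - 0.1)^3 = 0.729 \]
% \end{remark}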