Add A3I neuro-probabilistic model + survival analysis
@@ -1,4 +1,4 @@
\chapter{Arrivals prediction: Hospital emergency room} \label{ch:ap_hospital}


\section{Data}
@@ -22,7 +22,7 @@ Each row of the dataset represents a patient and the features are:
\section{Approaches}

\begin{remark}
MSE assumes that the conditional distribution $\prob{y | x; \theta}$ of the predictions follows a normal distribution (i.e., $y \sim \mathcal{N}(\mu(x; \theta), \sigma)$).
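Indeed, under this assumption, the negative log-likelihood of an observation is:
\[ -\log \prob{y \mid x; \theta} = \frac{(y - \mu(x; \theta))^2}{2\sigma^2} + \log\left(\sigma \sqrt{2\pi}\right) \]
and, since $\sigma$ is assumed constant, minimizing it with respect to $\theta$ is equivalent to minimizing the squared error $(y - \mu(x; \theta))^2$.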
\end{remark}

\begin{remark}
@@ -107,7 +107,7 @@ Each row of the dataset represents a patient and the features are:
Some considerations must be made:
\begin{descriptionlist}
\item[Only positive rates]
As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm (i.e., assume that the MLP outputs a log-rate) and an exponentiation to achieve this (see the code sketch after this list):
\[
\begin{split}
\log(\hat{\lambda}) &= \texttt{MLP}(x) \\
\hat{\lambda} &= \exp(\texttt{MLP}(x))
\end{split}
\]

\begin{remark}
A strictly positive activation function (e.g., softplus) can also be used.
\end{remark}

\item[Standardization]
The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete.

@@ -133,4 +137,59 @@ Each row of the dataset represents a patient and the features are:
\end{remark}
\end{descriptionlist}
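
The log-rate trick of the \textit{Only positive rates} consideration can be sketched as follows (a minimal PyTorch sketch, not part of the original notes; the layer sizes, the input dimension, and the toy training data are illustrative assumptions):
\begin{verbatim}
import torch
import torch.nn as nn

# MLP that outputs the log-rate log(lambda) of a Poisson distribution.
mlp = nn.Sequential(
    nn.Linear(10, 32),   # 10 input features (illustrative)
    nn.ReLU(),
    nn.Linear(32, 1),    # unconstrained output, read as log(lambda)
)

# Poisson negative log-likelihood; log_input=True exponentiates internally,
# so the predicted rate lambda = exp(MLP(x)) is always positive.
loss_fn = nn.PoissonNLLLoss(log_input=True)

x = torch.randn(64, 10)                     # (standardized) inputs
y = torch.randint(0, 20, (64, 1)).float()   # observed arrival counts

log_rate = mlp(x)
loss = loss_fn(log_rate, y)
loss.backward()

rate = torch.exp(log_rate)                  # predicted rate, always > 0
\end{verbatim}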
\end{description}

\begin{remark}
With one-hot encoding, a linear model effectively becomes a lookup table (one parameter per category), so it is no longer constrained to be linear in the original input (a numeric check is sketched after this remark).

\indenttbox
\begin{example}
\phantom{}

\begin{minipage}[t]{0.3\linewidth}
Consider the dataset:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{c|c}
\toprule
$x$ & $y$ \\
\midrule
$0$ & $1$ \\
$1$ & $4$ \\
$2$ & $2$ \\
\bottomrule
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[t]{0.65\linewidth}
A linear model learns a straight line:
\[ f(x) = \alpha x \]
\end{minipage}\\[1em]
\begin{minipage}[t]{0.3\linewidth}
With the one-hot encoded dataset:
\begin{table}[H]
\centering
\footnotesize
\begin{tabular}{ccc|c}
\toprule
$x_0$ & $x_1$ & $x_2$ & $y$ \\
\midrule
$1$ & $0$ & $0$ & $1$ \\
$0$ & $1$ & $0$ & $4$ \\
$0$ & $0$ & $1$ & $2$ \\
\bottomrule
\end{tabular}
\end{table}
\end{minipage}
\begin{minipage}[t]{0.65\linewidth}
A linear model learns a linear combination:
\[ f(x) = \alpha x_0 + \beta x_1 + \gamma x_2 \]
which fits the dataset exactly with $\alpha=1$, $\beta=4$, and $\gamma=2$.
\end{minipage}
\end{example}
\end{remark}
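
The lookup-table behaviour can be checked numerically (a toy NumPy sketch using the one-hot dataset of the example above; the unregularized least-squares fit is an illustrative choice):
\begin{verbatim}
import numpy as np

# One-hot encoded inputs: each row selects exactly one category.
X = np.eye(3)                  # categories 0, 1, 2
y = np.array([1.0, 4.0, 2.0])  # targets from the example

# Least-squares fit of a linear model without bias: f(x) = x @ w.
w, *_ = np.linalg.lstsq(X, y, rcond=None)
print(w)  # [1. 4. 2.] -> the weights act as a lookup table over categories
\end{verbatim}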

\begin{remark}
Having access to the full predictive distribution allows computing any statistic of interest. For instance, it is possible to plot the mean and the standard deviation.
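In the Poisson case, the predicted rate $\hat{\lambda}$ gives both directly: the mean is $\hat{\lambda}$ and the standard deviation is $\sqrt{\hat{\lambda}}$.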
\end{remark}
\end{description}
@@ -253,4 +253,104 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
\end{enumerate}
\end{enumerate}
\end{description}
\end{description}
\end{description}


\subsection{Survival analysis model}

\begin{remark}
Chronologically, this approach was presented after \Cref{ch:ap_hospital}.
\end{remark}

\begin{description}
\item[Survival analysis model] \marginnote{Survival analysis model}
A probabilistic model to estimate the survival time of an entity.

\item[Survival analysis formalization]
Consider a random variable $T$ that models the survival time. The simplest model can be defined as:
\[ t \sim \prob{T} \]
The remaining survival time depends on the current time $\bar{t}$, therefore:
\[ t \sim \prob{T \mid \bar{t}} \]
The remaining survival time also depends on the past sensor readings $X_{\leq \bar{t}}$ and the future readings $X_{> \bar{t}}$ (at this stage, we want to capture what affects the survival time, even if we do not have access to some variables), therefore:
\[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \]

\begin{description}
\item[Marginalization]
Average over all the possible outcomes of a random variable to cancel it out.

For this problem, we do not have access to the future sensor readings, so we can marginalize them out (a toy numeric sketch is given after this description):
\[ t \sim \underset{X_{> \bar{t}} \sim \prob{X_{> \bar{t}}}}{\mathbb{E}} \left[ \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \right] \]
\end{description}
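
As a generic illustration of marginalization by sampling (a toy Python sketch, not tied to the sensor-reading model of these notes; the toy distribution and function names are made up for the example):
\begin{verbatim}
import numpy as np

rng = np.random.default_rng(0)

def marginalize(f, sample_x, n_samples=10_000):
    # Monte Carlo marginalization: approximate E_{X ~ P(X)}[ f(X) ]
    # by averaging f over samples of X, cancelling X out.
    return np.mean([f(sample_x()) for _ in range(n_samples)])

# Toy example: X ~ N(0, 1) and f(x) = x^2, so E[f(X)] = Var(X) = 1.
estimate = marginalize(lambda x: x ** 2, lambda: rng.normal(0.0, 1.0))
print(estimate)  # approximately 1.0
\end{verbatim}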

\begin{remark}
In probabilistic terms, the regression approach previously taken can be modelled as:
\[ t \sim \mathcal{N}(\mu(X_{\bar{t}}), \sigma) \]
where $\mu(\cdot)$ is the regressor.

Compared to the survival analysis model, there are the following differences:
\begin{itemize}
\item The regressor only reasons on the sensor readings $X_{\bar{t}}$ of a single time step.
\item The regressor does not consider the current time.
\item The regressor assumes a normal distribution with constant variance.
\end{itemize}
\end{remark}

\item[Neuro-probabilistic model]
We can assume that the survival time follows a normal distribution parametrized by both its mean and its standard deviation:
\[ t \sim \mathcal{N}(\mu(X_{\bar{t}}, \bar{t}), \sigma(X_{\bar{t}}, \bar{t})) \]

\begin{remark}
The readings of a single time step are used as it can be shown that using multiple time steps does not yield significant improvements for this dataset.
\end{remark}

\begin{description}
\item[Architecture]
Use a neural network that outputs both $\mu$ and $\sigma$, which are then passed to a distribution head (a code sketch is given below, after the figure).
\end{description}

\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_rul_neuroprobabilistic.pdf}
\end{figure}
\end{description}
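
The architecture above can be sketched as follows (a minimal PyTorch sketch, assuming a simple feed-forward trunk; the layer sizes, the number of sensor features, and the use of softplus to keep $\sigma$ positive are illustrative assumptions, not the original implementation):
\begin{verbatim}
import torch
import torch.nn as nn

class NeuroProbabilisticRUL(nn.Module):
    # Shared trunk with two heads producing the mean and the (positive)
    # standard deviation of a normal distribution over the survival time.
    def __init__(self, n_features):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Linear(n_features + 1, 32),  # sensor readings + current time
            nn.ReLU(),
        )
        self.mu_head = nn.Linear(32, 1)
        self.sigma_head = nn.Linear(32, 1)

    def forward(self, x, t_bar):
        h = self.trunk(torch.cat([x, t_bar], dim=-1))
        mu = self.mu_head(h)
        sigma = nn.functional.softplus(self.sigma_head(h)) + 1e-6
        return torch.distributions.Normal(mu, sigma)  # distribution head

# Training minimizes the negative log-likelihood of the observed times.
model = NeuroProbabilisticRUL(n_features=14)
x = torch.randn(8, 14)          # sensor readings at time t_bar (illustrative)
t_bar = torch.rand(8, 1) * 100  # current time
t = torch.rand(8, 1) * 200      # observed survival time
dist = model(x, t_bar)
loss = -dist.log_prob(t).mean()
loss.backward()
\end{verbatim}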


% \subsection{Survival function}

% \begin{description}
% \item[Censoring] \marginnote{Censoring}
% Hide key events from the dataset.

% \begin{remark}
% For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
% \begin{figure}[H]
% \centering
% \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
% \caption{Example of partial runs}
% \end{figure}
% \end{remark}

% \item[Survival function] \marginnote{Survival function}
% Given the random variable $T$ that models the survival time, the survival function $S$ is defined as:
% \[ S(\bar{t}) = \prob{T > \bar{t}} \]
% In other words, it is the probability of surviving at least until time $\bar{t}$.

% For this problem, the survival function can account for past sensor readings:
% \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]

% \item[Hazard function] \marginnote{Hazard function}
% Given the random variable $T$ that models the survival time, the hazard function $\lambda$ is defined as:
% \[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
% In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.

% With discrete time, the survival function can be factorized using the hazard function:
% \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]

% For this problem, the hazard function is the following:
% \[
% \begin{gathered}
% \lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
% S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
% \end{gathered}
% \]
% \end{description}