diff --git a/src/year2/artificial-intelligence-in-industry/img/_rul_censoring.pdf b/src/year2/artificial-intelligence-in-industry/img/_rul_censoring.pdf new file mode 100644 index 0000000..4077dbc Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_rul_censoring.pdf differ diff --git a/src/year2/artificial-intelligence-in-industry/img/_rul_neuroprobabilistic.pdf b/src/year2/artificial-intelligence-in-industry/img/_rul_neuroprobabilistic.pdf new file mode 100644 index 0000000..f01f583 Binary files /dev/null and b/src/year2/artificial-intelligence-in-industry/img/_rul_neuroprobabilistic.pdf differ diff --git a/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex b/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex index 7ffc601..0d8b5bf 100644 --- a/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex +++ b/src/year2/artificial-intelligence-in-industry/sections/_arrivals_predicition.tex @@ -1,4 +1,4 @@ -\chapter{Arrivals prediction: Hospital emergency room} +\chapter{Arrivals prediction: Hospital emergency room} \label{ch:ap_hospital} \section{Data} @@ -22,7 +22,7 @@ Each row of the dataset represents a patient and the features are: \section{Approaches} \begin{remark} - MSE assumes that the conditional distribution of the predictions follows a normal distribution. + MSE assumes that the conditional distribution $\prob{y | x; \theta}$ of the predictions follows a normal distribution (i.e., $y \sim \mathcal{N}(\mu(x; \theta), \sigma)$). 
\end{remark} \begin{remark} @@ -107,7 +107,7 @@ Each row of the dataset represents a patient and the features are: Some considerations must be made: \begin{descriptionlist} \item[Only positive rates] - As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm and an exponentiation to achieve this: + As $\hat{\lambda}$ must be positive, it is possible to combine a logarithm (i.e., assume that the MLP outputs a log-rate) and an exponentiation to achieve this: \[ \begin{split} \log(\hat{\lambda}) &= \texttt{MLP}(x) \\ @@ -115,6 +115,10 @@ Each row of the dataset represents a patient and the features are: \end{split} \] + \begin{remark} + A strictly positive activation function can also be used. + \end{remark} + \item[Standardization] The input of the network can be standardized. On the other hand, standardizing the output is wrong as the Poisson distribution is discrete. @@ -133,4 +137,59 @@ Each row of the dataset represents a patient and the features are: \end{remark} \end{descriptionlist} \end{description} + + \begin{remark} + With one-hot encoding, a linear model can be non-linearized into a lookup table. 
+
+        \indenttbox
+        \begin{example}
+            \phantom{}
+
+            \begin{minipage}[t]{0.3\linewidth}
+                Consider the dataset:
+                \begin{table}[H]
+                    \centering
+                    \footnotesize
+                    \begin{tabular}{c|c}
+                        \toprule
+                        $x$ & $y$ \\
+                        \midrule
+                        $0$ & $1$ \\
+                        $1$ & $4$ \\
+                        $2$ & $2$ \\
+                        \bottomrule
+                    \end{tabular}
+                \end{table}
+            \end{minipage}
+            \begin{minipage}[t]{0.65\linewidth}
+                A linear model learns a straight line:
+                \[ f(x) = \alpha x \]
+            \end{minipage}\\[1em]
+            \begin{minipage}[t]{0.3\linewidth}
+                With the dataset:
+                \begin{table}[H]
+                    \centering
+                    \footnotesize
+                    \begin{tabular}{ccc|c}
+                        \toprule
+                        $x_0$ & $x_1$ & $x_2$ & $y$ \\
+                        \midrule
+                        $1$ & $0$ & $0$ & $1$ \\
+                        $0$ & $1$ & $0$ & $4$ \\
+                        $0$ & $0$ & $1$ & $2$ \\
+                        \bottomrule
+                    \end{tabular}
+                \end{table}
+            \end{minipage}
+            \begin{minipage}[t]{0.65\linewidth}
+                A linear model learns a linear combination:
+                \[ f(x) = \alpha x_0 + \beta x_1 + \gamma x_2 \]
+                This makes it possible to easily learn the dataset with $\alpha=1$, $\beta=4$, and $\gamma=2$.
+            \end{minipage}
+        \end{example}
+    \end{remark}
+
+    \begin{remark}
+        Having access to the distribution makes it possible to compute any statistic of interest. For instance, it is possible to plot the mean and standard deviation.
+ \end{remark} \end{description} \ No newline at end of file diff --git a/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex b/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex index f90ea64..cd54441 100644 --- a/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex +++ b/src/year2/artificial-intelligence-in-industry/sections/_remaining_useful_life.tex @@ -253,4 +253,104 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that \end{enumerate} \end{enumerate} \end{description} -\end{description} \ No newline at end of file +\end{description} + + +\subsection{Survival analysis model} + +\begin{remark} + Chronologically, this approach has been presented after \Cref{ch:ap_hospital}. +\end{remark} + +\begin{description} + \item[Survival analysis model] \marginnote{Survival analysis model} + Probabilistic model to estimate the survival time of an entity. + + \item[Survival analysis formalization] + Consider a random variable $T$ to model the survival time. The simplest model can be defined as: + \[ t \sim \prob{T} \] + Remaining survival time depends on the time $\bar{t}$, therefore we have that: + \[ t \sim \prob{T \mid \bar{t}} \] + Remaining survival time also depends on the past sensor readings $X_{\leq \bar{t}}$ and future readings $X_{> \bar{t}}$ (at this stage, we want to capture what affects the survival time, even if we do not have access to some variables), therefore we have that: + \[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \] + + \begin{description} + \item[Marginalization] + Average over all the possible outcomes of a random variable to cancel it out. 
+
+            For this problem, we do not have access to the future sensor readings, so we can marginalize them out:
+            \[ t \sim \prob{T \mid \bar{t}, X_{\leq \bar{t}}} = \underset{X_{> \bar{t}} \sim \prob{X_{> \bar{t}}}}{\mathbb{E}} \left[ \prob{T \mid \bar{t}, X_{\leq \bar{t}}, X_{> \bar{t}}} \right] \]
+    \end{description}
+
+    \begin{remark}
+        In probabilistic terms, the regression approach previously taken can be modelled as:
+        \[ t \sim \mathcal{N}(\mu(X_{\bar{t}}), \sigma) \]
+        where $\mu(\cdot)$ is the regressor.
+
+        Compared to the survival analysis model, there are the following differences:
+        \begin{itemize}
+            \item Regressor only reasons on the sensor readings $X_{\bar{t}}$ at a single time step.
+            \item Regressor does not consider the current time.
+            \item Regressor assumes normal distribution with constant variance.
+        \end{itemize}
+    \end{remark}
+
+    \item[Neuro-probabilistic model]
+    We can assume that the model follows a normal distribution parametrized by both the mean and the standard deviation:
+    \[ t \sim \mathcal{N}(\mu(X_{\bar{t}}, \bar{t}), \sigma(X_{\bar{t}}, \bar{t})) \]
+
+    \begin{remark}
+        The readings of a single time step are used as it can be shown that using multiple time steps does not yield significant improvements for this dataset.
+    \end{remark}
+
+    \begin{description}
+        \item[Architecture]
+        Use a neural network that outputs both $\mu$ and $\sigma$, which are passed to a distribution head.
+    \end{description}
+
+    \begin{figure}[H]
+        \centering
+        \includegraphics[width=0.7\linewidth]{./img/_rul_neuroprobabilistic.pdf}
+    \end{figure}
+\end{description}
+
+
+% \subsection{Survival function}
+
+% \begin{description}
+%     \item[Censoring] \marginnote{Censoring}
+%     Hide key events from the dataset.
+
+%     \begin{remark}
+%         For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
+%         \begin{figure}[H]
+%             \centering
+%             \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
+%             \caption{Example of partial runs}
+%         \end{figure}
+%     \end{remark}
+
+%     \item[Survival function] \marginnote{Survival function}
+%     Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
+%     \[ S(\bar{t}) = \prob{T > \bar{t}} \]
+%     In other words, it is the probability of surviving at least until time $\bar{t}$.
+
+%     For this problem, the survival function can account for past sensor readings:
+%     \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
+
+%     \item[Hazard function] \marginnote{Hazard function}
+%     Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
+%     \[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
+%     In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
+
+%     With discrete time, the survival function can be factorized using the hazard function:
+%     \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
+
+%     For this problem, the hazard function is the following:
+%     \[
+%         \begin{gathered}
+%             \lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
+%             S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
+%         \end{gathered}
+%     \]
+% \end{description}
\ No newline at end of file