Add A3I survival analysis + biomedical data

This commit is contained in:
2024-11-10 14:19:32 +01:00
parent 605ff0b812
commit a89d800353
11 changed files with 181 additions and 34 deletions

View File

@ -18,5 +18,6 @@
\include{./sections/_remaining_useful_life.tex}
\include{./sections/_wear_anomalies.tex}
\include{./sections/_arrivals_predicition.tex}
\include{./sections/_features_selection.tex}
\end{document}

View File

@ -0,0 +1,63 @@
\chapter{Feature selection and importance: Biomedical analysis}
\section{Data}
The dataset contains anonymized biomedical data and is composed of a binary target variable and unknown variables.
\begin{remark}
This dataset contains real-world examples; its features have been anonymized for the purpose of this feature analysis.
\end{remark}
\subsection{Preliminary analysis}
\begin{description}
\item[Data distribution]
There are both categorical and numerical features.
\begin{figure}[H]
\centering
\begin{subfigure}{0.75\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_biomed_numeric_distr.pdf}
\caption{Numerical features}
\end{subfigure}
\begin{subfigure}{0.75\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_biomed_categ_distr.pdf}
\caption{Categorical features}
\end{subfigure}
\begin{subfigure}{0.2\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_biomed_target_distr.pdf}
\caption{Target}
\end{subfigure}
\caption{Distribution of the dataset}
\end{figure}
\item[Univariate dependencies]
Determine the fraction of examples with target $Y=1$ among those where a feature takes a specific value (i.e., the likelihood that the target is $1$ given that feature value).
\begin{figure}[H]
\centering
\begin{subfigure}{0.75\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_biomed_target_num_distr.pdf}
\caption{Numerical features}
\end{subfigure}
\begin{subfigure}{0.75\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_biomed_target_categ_distr.pdf}
\caption{Categorical features}
\end{subfigure}
\caption{Univariate dependencies with $Y=1$}
\end{figure}
\item[Linear correlation]
Determine the Pearson's correlation between variables.
\begin{figure}[H]
\centering
\includegraphics[width=0.8\linewidth]{./img/_biomed_corr_matrix.pdf}
\end{figure}
\end{description}

View File

@ -135,7 +135,7 @@ Predict RUL with a regressor $f$ and set a threshold to trigger maintenance:
\end{description}
\subsection{Classifier}
\subsection{Classifier} \label{sec:rul_classifier_naive}
Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that determines whether a failure will occur in $\varepsilon$ steps:
\[ f_\varepsilon(x, \theta) = \begin{cases}
@ -256,7 +256,7 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
\end{description}
\subsection{Survival analysis model}
\subsection{Survival analysis (regression)}
\begin{remark}
Chronologically, this approach has been presented after \Cref{ch:ap_hospital}.
@ -315,42 +315,125 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
\end{description}
% \subsection{Survival function}
\subsection{Survival analysis (classification)}
% \begin{description}
% \item[Censoring] \marginnote{Censoring}
% Hide from the dataset key events.
\begin{description}
\item[Censoring] \marginnote{Censoring}
Hide key events from the dataset.
% \begin{remark}
% For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
% \begin{figure}[H]
% \centering
% \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
% \caption{Example of partial runs}
% \end{figure}
% \end{remark}
\begin{remark}
For this dataset, it is more realistic to have mostly partial runs, as complete run-to-failure experiments are expensive to obtain.
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
\caption{Example of partial runs}
\end{figure}
\end{remark}
% \item[Survival function] \marginnote{Survival function}
% Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
% \[ S(\bar{t}) = \prob{T > \bar{t}} \]
% In other words, it is the probability of surviving at least until time $\bar{t}$.
\item[Survival function] \marginnote{Survival function}
Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
\[ S(\bar{t}) = \prob{T > \bar{t}} \]
In other words, it is the probability of surviving at least until time $\bar{t}$.
% For this problem, the survival function can account for past sensor readings:
% \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
For this problem, the survival function can account for past sensor readings $X_{\leq \bar{t}}$:
\[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
% \item[Hazard function] \marginnote{Hazard function}
% Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
% \[ \lambda(\bar{t}) = \prob{T > \bar{t} \mid T > \bar{t}-1} \]
% In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
\item[Hazard function] \marginnote{Hazard function}
Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
\[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
% With discrete time, the survival function can be factorized using the hazard function:
% \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
With discrete time, the survival function can be factorized using the hazard function:
\[ S(\bar{t}) \approx (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
% For this problem, the hazard function is the following:
% \[
% \begin{gathered}
% \lambda(\bar{t}, X_{\bar{t}}) = \prob{T > \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
% S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
% \end{gathered}
% \]
% \end{description}
\begin{remark}
The hazard function only depends on one observation.
\end{remark}
\begin{remark}
Using the probability of \emph{not} surviving (rather than surviving) is a historical convention.
\end{remark}
For this problem, the hazard function is the following:
\[
\begin{gathered}
\lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
S(\bar{t}, X_{\bar{t}}) \approx (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
\end{gathered}
\]
\item[Hazard estimator]
We want to train an estimator $\hat{\lambda}_\theta(\bar{t}, x_{\bar{t}})$ for the hazard function.
Consider the $k$-th experiment that ended at time $e^{(k)}$. The probability of the survival event can be modelled as follows:
\[
\underbrace{
\vphantom{\prod_{t=1}^{e^{(k)}-1}}
\hat{\lambda}_\theta(e^{(k)}, x^{(k)}_{e^{(k)}})
}_{\qquad\mathllap{\text{Not surviving at time $e^{(k)}$}}}
\underbrace{
\prod_{t=1}^{e^{(k)}-1} (1 - \hat{\lambda}_\theta(t, x^{(k)}_{t}))
}_{\mathrlap{\text{Surviving until time $e^{(k)}-1$}}\qquad}
\]
\begin{description}
\item[Training]
In likelihood maximization terms, the problem for $m$ experiments is formulated as:
\[
\arg\max_\theta \prod_{k=1}^{m} \left(
\hat{\lambda}_\theta(e^{(k)}, x^{(k)}_{e^{(k)}})
\prod_{t=1}^{e^{(k)}-1} (1 - \hat{\lambda}_\theta(t, x^{(k)}_{t}))
\right)
\]
Let $d^{(k)}_{t} = 1 \iff t = e^{(k)}$ (i.e., $1$ when not surviving at time $t$). The problem can then be rewritten as:
\[
\begin{split}
&\arg\max_\theta \prod_{k=1}^{m} \prod_{t=1}^{e^{(k)}} \left(
d^{(k)}_{t} \hat{\lambda}_\theta(t, x^{(k)}_{t}) + (1-d^{(k)}_{t})(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})) \right) \\
&= \arg\max_\theta \sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}}
\log\left( d^{(k)}_{t} \hat{\lambda}_\theta(t, x^{(k)}_{t}) + (1-d^{(k)}_{t})(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})) \right) \\
&= \arg\max_\theta \sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}} \left(
d^{(k)}_{t} \log\left(\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) + (1-d^{(k)}_{t}) \log\left(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) \right) \\
&= \arg\min_\theta -\sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}} \left(
d^{(k)}_{t} \log\left(\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) + (1-d^{(k)}_{t}) \log\left(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) \right) \\
\end{split}
\]
Which corresponds to a binary cross-entropy minimization problem where $d^{(k)}_{t}$ can be seen as the class. Therefore, $\hat{\lambda}_\theta(t, x^{(k)}_{t})$ can be seen as a classifier.
\begin{remark}
This shows that the classification approach used in \Cref{sec:rul_classifier_naive} is not strictly wrong. Instead of predicting whether the machine fails in $\varepsilon$ time steps, in probabilistic terms, the correct approach is to consider whether the machine fails at a time step $t$ knowing it survived until time $t-1$.
\end{remark}
\begin{remark}
Censored data skews the distribution of the data. Sample weights can be used to deal with the issue.
\end{remark}
\end{description}
\end{description}
\begin{figure}[H]
\centering
\includegraphics[width=0.9\linewidth]{./img/_rul_survival_analysis_classifier.pdf}
\caption{Failure probability of the classifier}
\end{figure}
\begin{description}
\item[Inference] \phantom{}
Given a threshold $\varepsilon$, maintenance is triggered when:
\[ \hat{\lambda}(t, x_t) \geq \varepsilon \]
\item[Forecasting]
Determine the probability of surviving $n$ more steps as:
\[
\begin{split}
\frac{S(t+n)}{S(t)} &= \prod_{h=1}^{n} (1 - \lambda(t+h, X_{t+h})) \\
&\approx \prod_{h=1}^{n} (1 - \hat{\lambda}(t+h, x^{(k)}_{t+h}))
\end{split}
\]
However, this would require accessing future values of $X_t$. Possible workarounds are:
\begin{itemize}
\item Ignore time-varying inputs (i.e., sort of marginalization) so that $\hat{\lambda}(t+h, x_{t+h}) \approx \hat{\lambda}(t+h, x^{(k)})$ where $x^{(k)}$ only contains stable information.
\item Predict future values of $x^{(k)}_t$ by training another estimator.
\item Assume that all the variables $x^{(k)}_t$ are stable for some time so that:
\[ \frac{S(t+n)}{S(t)} \approx \prod_{h=1}^{n} (1 - \hat{\lambda}(t+h, x^{(k)}_{t})) \]
\end{itemize}
\end{description}