mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-14 18:51:52 +01:00
Add A3I survival analysis + biomedical data
This commit is contained in:
@ -18,5 +18,6 @@
|
||||
\include{./sections/_remaining_useful_life.tex}
|
||||
\include{./sections/_wear_anomalies.tex}
|
||||
\include{./sections/_arrivals_predicition.tex}
|
||||
\include{./sections/_features_selection.tex}
|
||||
|
||||
\end{document}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,63 @@
|
||||
\chapter{Feature selection and importance: Biomedical analysis}
|
||||
|
||||
|
||||
\section{Data}
|
||||
|
||||
The dataset contains anonymized biomedical data and is composed of a binary target variable and unknown variables.
|
||||
|
||||
\begin{remark}
|
||||
In reality, this dataset contains real-world examples. Features have been anonymized for the purpose of feature analysis.
|
||||
\end{remark}
|
||||
|
||||
|
||||
\subsection{Preliminary analysis}
|
||||
|
||||
\begin{description}
|
||||
\item[Data distribution]
|
||||
There are both categorical and numerical features.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.75\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/_biomed_numeric_distr.pdf}
|
||||
\caption{Numerical features}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.75\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/_biomed_categ_distr.pdf}
|
||||
\caption{Categorical features}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.2\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/_biomed_target_distr.pdf}
|
||||
\caption{Target}
|
||||
\end{subfigure}
|
||||
\caption{Distribution of the dataset}
|
||||
\end{figure}
|
||||
|
||||
\item[Univariate dependencies]
|
||||
Determine the fraction of examples with target $Y=1$ (i.e., likelihood that a feature has a specific value while the target is $1$).
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.75\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/_biomed_target_num_distr.pdf}
|
||||
\caption{Numerical features}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.75\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/_biomed_target_categ_distr.pdf}
|
||||
\caption{Categorical features}
|
||||
\end{subfigure}
|
||||
\caption{Univariate dependencies with $Y=1$}
|
||||
\end{figure}
|
||||
|
||||
\item[Linear correlation]
|
||||
Determine the Pearson's correlation between variables.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{./img/_biomed_corr_matrix.pdf}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
@ -135,7 +135,7 @@ Predict RUL with a regressor $f$ and set a threshold to trigger maintenance:
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Classifier}
|
||||
\subsection{Classifier} \label{sec:rul_classifier_naive}
|
||||
|
||||
Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that determines whether a failure will occur in $\varepsilon$ steps:
|
||||
\[ f_\varepsilon(x, \theta) = \begin{cases}
|
||||
@ -256,7 +256,7 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Survival analysis model}
|
||||
\subsection{Survival analysis (regression)}
|
||||
|
||||
\begin{remark}
|
||||
Chronologically, this approach has been presented after \Cref{ch:ap_hospital}.
|
||||
@ -315,42 +315,125 @@ Predict RUL with a classifier $f_\varepsilon$ (for a chosen $\varepsilon$) that
|
||||
\end{description}
|
||||
|
||||
|
||||
% \subsection{Survival function}
|
||||
\subsection{Survival analysis (classification)}
|
||||
|
||||
% \begin{description}
|
||||
% \item[Censoring] \marginnote{Censoring}
|
||||
% Hide from the dataset key events.
|
||||
\begin{description}
|
||||
\item[Censoring] \marginnote{Censoring}
|
||||
Hide key events from the dataset.
|
||||
|
||||
% \begin{remark}
|
||||
% For this dataset, it is more realistic to use partial runs as run-to-failure experiments are expensive to obtain.
|
||||
% \begin{figure}[H]
|
||||
% \centering
|
||||
% \includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
|
||||
% \caption{Example of partial runs}
|
||||
% \end{figure}
|
||||
% \end{remark}
|
||||
\begin{remark}
|
||||
For this dataset, it is more realistic to have more partial runs than run-to-failure experiments, as the latter are expensive to obtain.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/_rul_censoring.pdf}
|
||||
\caption{Example of partial runs}
|
||||
\end{figure}
|
||||
\end{remark}
|
||||
|
||||
% \item[Survival function] \marginnote{Survival function}
|
||||
% Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
|
||||
% \[ S(\bar{t}) = \prob{T > \bar{t}} \]
|
||||
% In other words, it is the probability of surviving at least until time $\bar{t}$.
|
||||
\item[Survival function] \marginnote{Survival function}
|
||||
Given the random variable $T$ to model survival time, the survival function $S$ is defined as:
|
||||
\[ S(\bar{t}) = \prob{T > \bar{t}} \]
|
||||
In other words, it is the probability of surviving at least until time $\bar{t}$.
|
||||
|
||||
% For this problem, the survival function can account for past sensor readings:
|
||||
% \[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
|
||||
For this problem, the survival function can account for past sensor readings $X_{\leq \bar{t}}$:
|
||||
\[ S(\bar{t}, X_{\leq \bar{t}}) = \prob{T > \bar{t} \mid X_{\leq \bar{t}}} \]
|
||||
|
||||
% \item[Hazard function] \marginnote{Hazard function}
|
||||
% Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
|
||||
% \[ \lambda(\bar{t}) = \prob{T > \bar{t} \mid T > \bar{t}-1} \]
|
||||
% In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
|
||||
\item[Hazard function] \marginnote{Hazard function}
|
||||
Given the random variable $T$ to model survival time, the hazard function $\lambda$ is defined as:
|
||||
\[ \lambda(\bar{t}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1} \]
|
||||
In other words, it is the conditional probability of not surviving at time $\bar{t}$ knowing that the entity survived until time $\bar{t}-1$.
|
||||
|
||||
% With discrete time, the survival function can be factorized using the hazard function:
|
||||
% \[ S(\bar{t}) = (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
|
||||
With discrete time, the survival function can be factorized using the hazard function:
|
||||
\[ S(\bar{t}) \approx (1-\lambda(\bar{t})) \cdot (1 - \lambda(\bar{t}-1)) \cdot \dots \]
|
||||
|
||||
% For this problem, the hazard function is the following:
|
||||
% \[
|
||||
% \begin{gathered}
|
||||
% \lambda(\bar{t}, X_{\bar{t}}) = \prob{T > \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
|
||||
% S(\bar{t}, X_{\bar{t}}) = (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
|
||||
% \end{gathered}
|
||||
% \]
|
||||
% \end{description}
|
||||
\begin{remark}
|
||||
The hazard function only depends on one observation.
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
The use of the probability of not surviving (rather than the probability of surviving) is due to historical convention.
|
||||
\end{remark}
|
||||
|
||||
For this problem, the hazard function is the following:
|
||||
\[
|
||||
\begin{gathered}
|
||||
\lambda(\bar{t}, X_{\bar{t}}) = \prob{T \leq \bar{t} \mid T > \bar{t}-1, X_{\bar{t}}} \\
|
||||
S(\bar{t}, X_{\bar{t}}) \approx (1-\lambda(\bar{t}, X_{\bar{t}})) \cdot (1 - \lambda(\bar{t}-1, X_{\bar{t}-1})) \cdot \dots
|
||||
\end{gathered}
|
||||
\]
|
||||
|
||||
\item[Hazard estimator]
|
||||
We want to train an estimator $\hat{\lambda}_\theta(\bar{t}, x_{\bar{t}})$ for the hazard function.
|
||||
|
||||
Consider the $k$-th experiment that ended at time $e^{(k)}$. The probability of the survival event can be modelled as follows:
|
||||
\[
|
||||
\underbrace{
|
||||
\vphantom{\prod_{t=1}^{e^{(k)}-1}}
|
||||
\hat{\lambda}_\theta(e^{(k)}, x^{(k)}_{e^{(k)}})
|
||||
}_{\qquad\mathllap{\text{Not surviving at time $e^{(k)}$}}}
|
||||
\underbrace{
|
||||
\prod_{t=1}^{e^{(k)}-1} (1 - \hat{\lambda}_\theta(t, x^{(k)}_{t}))
|
||||
}_{\mathrlap{\text{Surviving until time $e^{(k)}-1$}}\qquad}
|
||||
\]
|
||||
|
||||
\begin{description}
|
||||
\item[Training]
|
||||
In likelihood maximization terms, the problem for $m$ experiments is formulated as:
|
||||
\[
|
||||
\arg\max_\theta \prod_{k=1}^{m} \left(
|
||||
\hat{\lambda}_\theta(e^{(k)}, x^{(k)}_{e^{(k)}})
|
||||
\prod_{t=1}^{e^{(k)}-1} (1 - \hat{\lambda}_\theta(t, x^{(k)}_{t}))
|
||||
\right)
|
||||
\]
|
||||
Let $d^{(k)}_{t} = 1 \iff t = e^{(k)}$ (i.e., $1$ when not surviving at time $t$), the problem can be rewritten as:
|
||||
\[
|
||||
\begin{split}
|
||||
&\arg\max_\theta \prod_{k=1}^{m} \prod_{t=1}^{e^{(k)}}
|
||||
d^{(k)}_{t} \hat{\lambda}_\theta(t, x^{(k)}_{t}) + (1-d^{(k)}_{t})(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})) \\
|
||||
&= \arg\max_\theta \sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}}
|
||||
\log\left( d^{(k)}_{t} \hat{\lambda}_\theta(t, x^{(k)}_{t}) + (1-d^{(k)}_{t})(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})) \right) \\
|
||||
&= \arg\max_\theta \sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}}
|
||||
d^{(k)}_{t} \log\left(\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) + (1-d^{(k)}_{t}) \log\left(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) \\
|
||||
&= \arg\min_\theta -\sum_{k=1}^{m} \sum_{t=1}^{e^{(k)}}
|
||||
d^{(k)}_{t} \log\left(\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) + (1-d^{(k)}_{t}) \log\left(1-\hat{\lambda}_\theta(t, x^{(k)}_{t})\right) \\
|
||||
\end{split}
|
||||
\]
|
||||
Which corresponds to a binary cross-entropy minimization problem where $d^{(k)}_{t}$ can be seen as the class. Therefore, $\hat{\lambda}_\theta(t, x^{(k)}_{t})$ can be seen as a classifier.
|
||||
|
||||
\begin{remark}
|
||||
This shows that the classification approach used in \Cref{sec:rul_classifier_naive} is not strictly wrong. Instead of predicting whether the machine fails in $\varepsilon$ time steps, in probabilistic terms, the correct approach is to consider whether the machine fails at a time step $t$ knowing it survived at time $t-1$.
|
||||
\end{remark}
|
||||
|
||||
\begin{remark}
|
||||
Censored data skews the distribution of the data. Sample weights can be used to deal with the issue.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{./img/_rul_survival_analysis_classifier.pdf}
|
||||
\caption{Failure probability of the classifier}
|
||||
\end{figure}
|
||||
|
||||
\begin{description}
|
||||
\item[Inference] \phantom{}
|
||||
Given a threshold $\varepsilon$, maintenance is triggered when:
|
||||
\[ \hat{\lambda}(t, x_t) \geq \varepsilon \]
|
||||
|
||||
\item[Forecasting]
|
||||
Determine the probability of surviving $n$ more steps as:
|
||||
\[
|
||||
\begin{split}
|
||||
\frac{S(t+n)}{S(t)} &= \prod_{h=1}^{n} (1 - \lambda(t+h, X_{t+h})) \\
|
||||
&\approx \prod_{h=1}^{n} (1 - \hat{\lambda}(t+h, x^{(k)}_{t+h}))
|
||||
\end{split}
|
||||
\]
|
||||
However, this would require accessing future values of $X_t$. Possible workarounds are:
|
||||
\begin{itemize}
|
||||
\item Ignore time-varying inputs (i.e., a sort of marginalization) so that $\hat{\lambda}(t+h, x_{t+h}) \approx \hat{\lambda}(t+h, x^{(k)})$ where $x^{(k)}$ only contains stable information.
|
||||
\item Predict future values of $x^{(k)}_t$ by training another estimator.
|
||||
\item Assume that all the variables $x^{(k)}_t$ are stable for some time so that:
|
||||
\[ \frac{S(t+n)}{S(t)} \approx \prod_{h=1}^{n} (1 - \hat{\lambda}(t+h, x^{(k)}_{t})) \]
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
Reference in New Issue
Block a user