Add A3I UDE + PINN + PFL + DFL

This commit is contained in:
2024-12-05 19:41:09 +01:00
parent 00381a20d0
commit 42025c2109
13 changed files with 327 additions and 0 deletions

View File

@ -20,5 +20,6 @@
\include{./sections/_arrivals_predicition.tex}
\include{./sections/_features_selection.tex}
\include{./sections/_knowledge_injection.tex}
\include{./sections/_prediction_focused_learning.tex}
\end{document}

Binary file not shown.


View File

@ -242,4 +242,131 @@
Note that $V_S$ and $\tau$ must be positive values. We can use the same trick as in \Cref{sec:arrivals_neuroprob} by using $\exp\log$ with a scaling factor to start with a reasonable guess.
\end{example}
\item[ODE learning improvements]
\phantom{}
\begin{description}
\item[Training speed/Network depth]
Given a sequence of training measurements $\{ y_k \}_{k=0}^n$, view them as a sequence of pairs $\{ (y_{k-1}, y_k) \}_{k=1}^n$. During training, each pair defines a distinct one-step ODE problem, and all problems share the same parameters. The overall problem becomes:
\[
\begin{gathered}
\arg\min_\theta \sum_{k=1}^{n} \mathcal{L}(\hat{y}_k(t_k), y_k) \\
\begin{split}
\text{subject to }& \dot{\hat{y}}_k = f(\hat{y}_k, t; \theta) \qquad\forall k = 1, \dots, n \\
& \hat{y}_k(t_{k-1}) = y_{k-1} \qquad\forall k = 1, \dots, n
\end{split}
\end{gathered}
\]
\begin{remark}
This approach is only possible if full states are available and the loss is separable. Moreover, it is not strictly equivalent to the original problem, as compounding errors across steps are disregarded.
\end{remark}
\begin{remark}
Training now only requires unrolling a single integration step (i.e., a shallower network), and mini-batches can be used.
\end{remark}
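As an illustration, a minimal PyTorch sketch of this pairwise training scheme is given below (the network architecture, the data layout, and the use of a single explicit Euler step per pair are assumptions made for illustration, not part of the formulation above).
\begin{verbatim}
import torch

# Assumed dynamics network: maps (state, time) to the state derivative.
f_theta = torch.nn.Sequential(torch.nn.Linear(2, 32), torch.nn.Tanh(),
                              torch.nn.Linear(32, 1))

def euler_step(y_prev, t_prev, dt):
    """One explicit Euler step: y_k ~ y_{k-1} + dt * f(y_{k-1}, t_{k-1})."""
    return y_prev + dt * f_theta(torch.cat([y_prev, t_prev], dim=-1))

# Each pair (y_{k-1}, y_k) is an independent one-step problem with shared
# parameters, so pairs can be shuffled and grouped into mini-batches.
def loss_on_pairs(y_prev, y_next, t_prev, dt):
    return torch.mean((euler_step(y_prev, t_prev, dt) - y_next) ** 2)
\end{verbatim}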
\item[Accuracy improvement]
Accuracy issues stem from the fact that the Euler method has intrinsically low accuracy. Since the problem consists of fitting a curve, the model may compensate for the integration error by estimating wrong parameters in order to achieve a better fit.
Therefore, if the objective is only to fit the curve, this is not necessarily a problem. However, if recovering the correct parameters is the goal, a more accurate integration method is needed.
\begin{remark}
A straightforward approach to achieve better results is to increase the granularity of the steps of the Euler method (i.e., use more steps).
\end{remark}
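For instance, a minimal Python sketch of the sub-stepping idea (the function signature and the number of sub-steps are assumptions for illustration):
\begin{verbatim}
def euler_substeps(f, y0, t0, t1, n_sub=10):
    """Integrate y' = f(y, t) from t0 to t1 with n_sub Euler sub-steps
    instead of a single coarse step, trading compute for accuracy."""
    dt = (t1 - t0) / n_sub
    y, t = y0, t0
    for _ in range(n_sub):
        y, t = y + dt * f(y, t), t + dt
    return y
\end{verbatim}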
\end{description}
\item[Universal ODE (UDE)] \marginnote{Universal ODE (UDE)}
Use a black-box function as a parameter of an ODE:
\[ \dot{y} = f(y, t, U(y, t; \theta)) \]
where $U$ is a universal approximator (e.g., a neural network).
\begin{example}
Consider the SIR (susceptible-infected-recovered) model for epidemic modeling:
\[
\begin{split}
\dot{S} &= -\beta \frac{1}{N} SI \\
\dot{I} &= +\beta \frac{1}{N} SI - \gamma I \\
\dot{R} &= +\gamma I
\end{split}
\]
Non-pharmaceutical interventions (NPI) (e.g., masks) have an effect on $\beta$, making it time dependent:
\[
\begin{split}
\dot{S} &= -\beta(t) \frac{1}{N} SI \\
\dot{I} &= +\beta(t) \frac{1}{N} SI - \gamma I \\
\dot{R} &= +\gamma I
\end{split}
\]
For this example, we assume $\beta(t)$ is modeled as:
\[ \beta(t) = \beta_0 \prod_{i \in \mathcal{I}} e_i \]
where $\beta_0$ is the baseline spread if nothing is done, $\mathcal{I}$ is the set of active NPIs, and $e_i$ is the effect of the $i$-th NPI.
In practice, the time argument of $\beta$ only serves to retrieve the NPIs active at time $t$; therefore, the overall model can be formulated as:
\[
\begin{split}
\dot{S} &= -\beta(\texttt{NPI}(t)) \frac{1}{N} SI \\
\dot{I} &= +\beta(\texttt{NPI}(t)) \frac{1}{N} SI - \gamma I \\
\dot{R} &= +\gamma I
\end{split}
\]
Architecturally, the same neural network setup as for ODE learning can be used; the only addition is that the network also takes the active NPIs as input.
\end{example}
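A minimal PyTorch sketch of this universal-ODE setup (the network \texttt{beta\_net}, the NPI indicator encoding, the softplus output keeping $\beta > 0$, and the Euler roll-out are assumptions made for illustration):
\begin{verbatim}
import torch

n_npi = 3  # assumed number of NPIs, encoded as a 0/1 indicator vector

# Black-box term: maps the active-NPI indicators to beta(t) > 0.
beta_net = torch.nn.Sequential(torch.nn.Linear(n_npi, 16), torch.nn.ReLU(),
                               torch.nn.Linear(16, 1), torch.nn.Softplus())

def sir_rhs(S, I, R, npi, gamma, N):
    """SIR dynamics with the transmission rate given by the approximator."""
    beta = beta_net(npi)
    dS = -beta * S * I / N
    dI = beta * S * I / N - gamma * I
    dR = gamma * I
    return dS, dI, dR

def simulate(S0, I0, R0, npi_schedule, gamma, N, dt):
    """Euler roll-out; npi_schedule[k] is the indicator vector at step k."""
    S, I, R, traj = S0, I0, R0, []
    for npi in npi_schedule:
        dS, dI, dR = sir_rhs(S, I, R, npi, gamma, N)
        S, I, R = S + dt * dS, I + dt * dI, R + dt * dR
        traj.append(torch.stack([S, I, R]))
    return torch.stack(traj)
\end{verbatim}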
\end{description}
\subsection{Physics informed neural network}
\begin{description}
\item[Physics informed neural network (PINN)] \marginnote{Physics informed neural network (PINN)}
Use a neural network to predict the state at time $t$:
\[ \hat{y}(t; \theta) \approx y(t) \]
\begin{remark}
ODE-based models predict the derivative of the state, while PINNs directly predict the state itself. This allows for faster inference, as integration (e.g., the Euler method) is no longer needed.
\end{remark}
\begin{description}
\item[Training]
The training objective is similar to that of UDEs, with some changes to the constraints:
\[
\begin{gathered}
\arg\min_\theta \mathcal{L}(\hat{y}(t, \theta), y) \\
\begin{split}
\text{subject to }& \dot{\hat{y}}(t; \theta) = f(\hat{y}(t; \theta), t) \\
& \hat{y}(t_0; \theta) = y_0
\end{split}
\end{gathered}
\]
where the first constraint imposes some physics knowledge and depends on the input (not the parameters).
\begin{remark}
A PINN can be seen as a neural network that learns ODE integration.
\end{remark}
In practice, the overall loss can be formulated as a Lagrangian:
\[
\begin{split}
\mathcal{L}_\text{tot}(y, \hat{y}, t, \theta) = &\mathcal{L}(\hat{y}(t; \theta), y) \\
&+ \lambda_{de} \Vert \dot{\hat{y}}(t; \theta) - f(\hat{y}(t; \theta), t) \Vert^2_2 \\
&+ \lambda_{bc} \Vert \hat{y}(t_0; \theta) - y_0 \Vert^2_2 \\
\end{split}
\]
where $\lambda_{de}$ and $\lambda_{bc}$ are multipliers.
\begin{remark}
If needed, the loss can be generalized further.
\end{remark}
\begin{remark}
If the physics ($f$) is known, the two constraints can be enforced without data points, as only time steps (which can be discretized and decided a priori) are needed. Moreover, the derivative $\dot{\hat{y}}(t; \theta)$ w.r.t. $t$ can be computed using automatic differentiation.
Therefore, during training, it is possible to rely on the two constraint terms more than on the data loss.
\end{remark}
\begin{remark}
Finding the weights $\lambda_{de}$ and $\lambda_{bc}$ can be tricky, depending on the reliability and robustness of the physics model that is used. Cross-validation can be used if unsure, while dual ascent can be used if the model is trusted.
\end{remark}
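A minimal PyTorch sketch of the penalized loss above (the network architecture, the collocation points, and the fixed penalty weights are assumptions made for illustration):
\begin{verbatim}
import torch

# Assumed state network: maps time t to the predicted state y_hat(t).
net = torch.nn.Sequential(torch.nn.Linear(1, 64), torch.nn.Tanh(),
                          torch.nn.Linear(64, 1))

def pinn_loss(t_data, y_data, t_coll, t0, y0, f, lam_de=1.0, lam_bc=1.0):
    # Data-fitting term (can be down-weighted if the physics is trusted).
    data_term = torch.mean((net(t_data) - y_data) ** 2)

    # Physics residual at collocation points: dy/dt via automatic differentiation.
    t_coll = t_coll.detach().clone().requires_grad_(True)
    y_hat = net(t_coll)
    dy_dt = torch.autograd.grad(y_hat.sum(), t_coll, create_graph=True)[0]
    de_term = torch.mean((dy_dt - f(y_hat, t_coll)) ** 2)

    # Boundary (initial) condition: y_hat(t0) should match y0.
    bc_term = torch.mean((net(t0) - y0) ** 2)

    return data_term + lam_de * de_term + lam_bc * bc_term
\end{verbatim}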
\end{description}
\end{description}

View File

@ -0,0 +1,199 @@
\chapter{Predict and optimize}
\section{Approaches}
\subsection{Prediction focused learning}
\begin{description}
\item[Prediction focused learning] \marginnote{Prediction focused learning}
Inference method that solves an optimization problem by using inputs predicted by an estimator. More specifically, there are two steps:
\begin{descriptionlist}
\item[Predict]
Train a predictor for the parameters of the problem. The optimal predictor $h$ has the following parameters:
\[ \theta^* = \arg\min_\theta \left\{ \mathbb{E}_{(x, y) \sim P(X, Y)}[\mathcal{L}(y, \hat{y})] \mid \hat{y} = h(x, \theta) \right\} \]
\item[Optimize]
Solve an optimization problem with the estimated parameters as input:
\[ z^*(y) = \arg\min_\vec{z} \left\{ f(\vec{z}, y) \mid \vec{z} \in F \right\} \]
where $\vec{z}$ is the decision vector, $f$ is the cost function, $F$ is the feasible space, and $y$ is the output of the predictor $h$.
\end{descriptionlist}
Therefore, during inference, the following is computed:
\[ z^*(h(x; \theta)) = z^*(\hat{y}) \]
\begin{figure}[H]
\centering
\includegraphics[width=0.5\linewidth]{./img/dfl_setup.png}
\end{figure}
\begin{remark}
This approach is asymptotically correct: a perfect predictor allows reaching the optimal result.
\end{remark}
\begin{remark}
The predictor should be trained for minimal decision cost (instead of maximal accuracy) so that the optimizer can make the correct choice.
\end{remark}
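A minimal NumPy sketch of the two-step pipeline (the linear predictor, the tiny enumerable feasible set, and the trained parameters are assumptions made for illustration):
\begin{verbatim}
import numpy as np

def predict(x, theta):
    """Assumed predictor h(x; theta): a simple linear model."""
    return x @ theta

def optimize(y_hat, feasible_set):
    """z*(y_hat) = argmin_z { f(z, y_hat) : z in F }, here with linear cost."""
    costs = [y_hat @ z for z in feasible_set]
    return feasible_set[int(np.argmin(costs))]

# Feasible set: pick exactly one of two items.
F = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
theta = np.array([[0.3, 0.7]])            # assumed to be already trained
x = np.array([1.0])                       # observed features
z_star = optimize(predict(x, theta), F)   # z*(h(x; theta))
\end{verbatim}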
\begin{example}
Consider the problem:
\[ \arg\min_{z} \{y_0 z_0 + y_1 z_1 \mid z_0 + z_1 = 1, z \in \{0, 1\}^2\} \]
with some ground-truth $y_0$ and $y_1$.
Assume that the predictor can only learn a model of form:
\[ \hat{y}_0 = \theta^2x \qquad \hat{y}_1 = 0.5 \cdot \theta \]
By maximizing accuracy, the following predictions are obtained:
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_pfl_example1.pdf}
\end{figure}
What matters for the optimization step is the intersection point (i.e., where the two predicted costs swap order); in this case, the accuracy-maximizing predictions are not ideal. Instead, by minimizing the decision cost, the following predictions are made:
\begin{figure}[H]
\centering
\includegraphics[width=0.7\linewidth]{./img/_pfl_example2.pdf}
\end{figure}
\end{example}
\end{description}
\subsection{Decision focused learning}
\begin{description}
\item[Decision focused learning (DFL)] \marginnote{Decision focused learning (DFL)}
PFL where linear cost functions are assumed. The optimization problem is therefore:
\[ z^*(y) = \arg\min_z \{ y^Tz \mid z \in F \} \]
where $y$ cannot be measured but depends on some observable $x$ (i.e., $X, Y \sim P(X, Y)$).
The training problem of the predictor aims to minimize the decision cost and is defined as:
\[
\theta^* = \arg\min_\theta \left\{ \mathbb{E}_{(x, y) \sim P(X,Y)} \left[ \texttt{regret}(y, \hat{y}) \right] \mid \hat{y} = h(x, \theta) \right\}
\]
\begin{description}
\item[Regret] \marginnote{Regret}
Measures the difference between the solution obtained using the predictor and the perfect solution. It is defined as:
\[ \texttt{regret}(y, \hat{y}) = y^T z^*(\hat{y}) - y^T z^*(y) \]
where:
\begin{itemize}
\item $z^*(y)$ is the best solution with access to the ground-truth (i.e., an oracle).
\item $z^*(\hat{y})$ is the solution computed with the estimated parameters.
\end{itemize}
\begin{remark}
Optimizing regret is equivalent to optimizing the cost function, but regret is lower-bounded at $0$.
\end{remark}
\begin{remark}
Regret is non-differentiable at many points and, where it is differentiable, its gradient is not informative. In practice, a surrogate for regret is used instead.
\end{remark}
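A minimal NumPy sketch of regret over an enumerable feasible set (the brute-force solver is an assumption for illustration; in practice a dedicated solver is used):
\begin{verbatim}
import numpy as np

def solve(y, feasible_set):
    """z*(y) = argmin_z { y^T z : z in F } by enumeration."""
    return feasible_set[int(np.argmin([y @ z for z in feasible_set]))]

def regret(y, y_hat, feasible_set):
    """Extra true cost paid for deciding with y_hat instead of y."""
    return y @ solve(y_hat, feasible_set) - y @ solve(y, feasible_set)
\end{verbatim}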
\begin{example}
Consider a collection of normally distributed data $(x, y)$:
\begin{figure}[H]
\centering
\includegraphics[width=0.75\linewidth]{./img/_dfl_regret_example1.pdf}
\end{figure}
The regret landscape for varying parameter $\theta$ is:
\begin{figure}[H]
\centering
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_regret_example2.pdf}
\caption{Regret for a single sample}
\end{subfigure}
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_regret_example3.pdf}
\caption{Regret for a batch}
\end{subfigure}
\end{figure}
\end{example}
\item[Self-contrastive loss] \marginnote{Self-contrastive loss}
Surrogate for regret defined as:
\[ \hat{y}^T z^*(y) - \hat{y}^T z^*(\hat{y}) \]
The idea is that, under a good prediction vector $\hat{y}$, the cost of the true optimal solution $\hat{y}^T z^*(y)$ should not be worse than the cost of the predicted solution $\hat{y}^T z^*(\hat{y})$.
\begin{remark}
Differentiating w.r.t. $\hat{y}$ yields a subgradient (i.e., a generalization of the gradient at points where the loss is not differentiable), computed as:
\[ \nabla(\hat{y}^T z^*(y) - \hat{y}^T z^*(\hat{y})) = z^*(y) - z^*(\hat{y}) \]
\end{remark}
\begin{remark}
The self-contrastive loss creates spurious minima (i.e., false minima that are not present in the regret), as a trivial solution is to predict $\hat{y} = 0$.
\end{remark}
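A minimal NumPy sketch of the self-contrastive loss and its subgradient (the callable \texttt{solve}, returning $z^*(\cdot)$, is an assumption for illustration):
\begin{verbatim}
import numpy as np

def self_contrastive(y, y_hat, solve):
    """Surrogate loss: y_hat^T z*(y) - y_hat^T z*(y_hat) >= 0."""
    return y_hat @ solve(y) - y_hat @ solve(y_hat)

def self_contrastive_subgrad(y, y_hat, solve):
    """Subgradient w.r.t. y_hat: z*(y) - z*(y_hat)."""
    return solve(y) - solve(y_hat)
\end{verbatim}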
\begin{example}
The self-contrastive loss using the same data as before is:
\begin{figure}[H]
\centering
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_self_contrastive_example1.pdf}
\caption{Self-contrastive loss for a single sample}
\end{subfigure}
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_self_contrastive_example2.pdf}
\caption{Self-contrastive loss for a batch}
\end{subfigure}
\end{figure}
\end{example}
\item[SPO+ loss] \marginnote{SPO+ loss}
Surrogate for regret defined as a perturbed version of the self-contrastive loss:
\[
\texttt{spo+}(y, \hat{y}) = \hat{y}^T_\text{spo} z^*(y) - \hat{y}^T_\text{spo} z^*(\hat{y}_\text{spo}) \qquad \text{with } \hat{y}_\text{spo} = 2 \hat{y} - y
\]
\begin{remark}
With many samples, the spurious minima tend to cancel out.
\end{remark}
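A minimal NumPy sketch of the SPO+ surrogate as defined above (again, the callable \texttt{solve} returning $z^*(\cdot)$ is an assumption for illustration):
\begin{verbatim}
import numpy as np

def spo_plus(y, y_hat, solve):
    """Perturbed self-contrastive loss with y_spo = 2 * y_hat - y."""
    y_spo = 2.0 * y_hat - y
    return y_spo @ solve(y) - y_spo @ solve(y_spo)
\end{verbatim}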
\begin{example}
The SPO+ loss using the same data as before is:
\begin{figure}[H]
\centering
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_spop_example1.pdf}
\caption{SPO+ for a single sample}
\end{subfigure}
\begin{subfigure}{0.49\linewidth}
\centering
\includegraphics[width=\linewidth]{./img/_dfl_spop_example2.pdf}
\caption{SPO+ for a batch}
\end{subfigure}
\end{figure}
\end{example}
\end{description}
\begin{remark}
DFL is slow to train, as each iteration requires solving an optimization problem.
\end{remark}
\begin{description}
\item[DFL training speed-up] \phantom{}
\begin{description}
\item[Warm start]
Initialize a DFL network with PFL weights.
\item[Solution caching]
Assuming that the feasible space is constant, caching can be done as follows:
\begin{enumerate}
\item Initialize the cache $\mathcal{S}$ with the true optimal solutions $z^*(y_i)$.
\item When it is required to compute $z^*(\hat{y})$:
\begin{itemize}
\item With probability $p$, invoke the solver and compute the real solution. The newly computed value is cached.
\item With probability $1-p$, do a cache lookup as:
\[ \hat{z}^*(\hat{y}) = \arg\min_z \{ \hat{y}^T z \mid z \in \mathcal{S} \} \]
\end{itemize}
\end{enumerate}
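A minimal Python sketch of this caching scheme (the class layout, the default probability, and the linear-cost lookup are assumptions made for illustration):
\begin{verbatim}
import random

class SolutionCache:
    def __init__(self, true_solutions, solver, p=0.1):
        self.cache = list(true_solutions)  # initialized with the z*(y_i)
        self.solver = solver               # exact solver (assumed callable)
        self.p = p                         # probability of calling the solver

    def solve(self, y_hat):
        if random.random() < self.p:
            z = self.solver(y_hat)         # real solve; cache the new solution
            self.cache.append(z)
            return z
        # Cache lookup: cached solution with the lowest predicted cost.
        return min(self.cache,
                   key=lambda z: sum(c * zi for c, zi in zip(y_hat, z)))
\end{verbatim}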
\end{description}
\end{description}
\end{description}
\begin{remark}
PFL with more complex networks allows reaching performance comparable to DFL.
\end{remark}