mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-15 19:12:22 +01:00
Add FAIKR3 approximate inference
This commit is contained in:
Binary file not shown.
Binary file not shown.
@ -10,6 +10,8 @@
|
|||||||
\input{sections/_intro.tex}
|
\input{sections/_intro.tex}
|
||||||
\input{sections/_probability.tex}
|
\input{sections/_probability.tex}
|
||||||
\input{sections/_bayesian_net.tex}
|
\input{sections/_bayesian_net.tex}
|
||||||
\input{sections/_inference.tex}
|
\input{sections/_exact_inference.tex}
|
||||||
|
\input{sections/_approx_inference.tex}
|
||||||
|
\eoc
|
||||||
|
|
||||||
\end{document}
|
\end{document}
|
||||||
@ -0,0 +1,196 @@
|
|||||||
|
\chapter{Approximate inference}
|
||||||
|
|
||||||
|
\begin{description}
|
||||||
|
\item[Stochastic simulation] \marginnote{Stochastic simulation}
|
||||||
|
Class of methods that draw $N$ samples from the distribution
|
||||||
|
and estimate an approximate posterior $\hat{\mathcal{P}}$.
|
||||||
|
|
||||||
|
\begin{description}
|
||||||
|
\item[$\delta$-stochastic absolute approximation]
|
||||||
|
Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic absolute approximation has error:
|
||||||
|
\[ \left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert \leq \varepsilon \]
|
||||||
|
Moreover, the method might fail (with greater error) with probability $\delta$.
|
||||||
|
|
||||||
|
\item[$\delta$-stochastic relative approximation]
|
||||||
|
Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic relative approximation has error:
|
||||||
|
\[ \frac{\left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert}{\prob{X | \matr{E}}} \leq \varepsilon \]
|
||||||
|
Moreover, the method might fail (with greater error) with probability $\delta$.
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
\begin{theorem}
|
||||||
|
Approximate inference is NP-hard for any $\delta, \varepsilon < 0.5$.
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
\item[Consistency] \marginnote{Consistency}
|
||||||
|
A sampling method is consistent if:
|
||||||
|
\[ \lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x) = \prob{x} \]
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\section{Sampling from an empty network}
|
||||||
|
\marginnote{Sampling from an empty network}
|
||||||
|
|
||||||
|
Sample each variable in topological order (i.e. from parents to children).
|
||||||
|
|
||||||
|
The probability $\mathcal{S}$ of sampling a specific event $x_1, \dots, x_n$ is given by the
|
||||||
|
probability of the single events knowing their parents:
|
||||||
|
\[ \mathcal{S}(x_1, \dots, x_n) = \prod_{i=1}^n \prob{x_i | \texttt{parents}(X_i)} = \prob{x_1, \dots, x_n} \]
|
||||||
|
|
||||||
|
\begin{theorem}
|
||||||
|
Sampling from an empty network is consistent.
|
||||||
|
|
||||||
|
\begin{proof}
|
||||||
|
Let $N$ be the number of samples and
|
||||||
|
$\mathcal{N}(x_1, \dots, x_n)$ the number of times the event $x_1, \dots, x_n$ has been sampled.
|
||||||
|
\[
|
||||||
|
\begin{split}
|
||||||
|
\lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x_1, \dots, x_n) &=
|
||||||
|
\lim_{N \rightarrow \infty} \frac{\mathcal{N}(x_1, \dots, x_n)}{N} \\
|
||||||
|
&= \mathcal{S}(x_1, \dots, x_n) =
|
||||||
|
\prob{x_1, \dots, x_n}
|
||||||
|
\end{split}
|
||||||
|
\]
|
||||||
|
\end{proof}
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
\begin{example}
|
||||||
|
Given the following Bayesian network:
|
||||||
|
\begin{center}
|
||||||
|
\includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
A possible sampling order is \texttt{Cloudy}, \texttt{Sprinkler}, \texttt{Rain}, \texttt{WetGrass}.
|
||||||
|
|
||||||
|
Assuming that a random generator gives the sequence of probabilities $(0.4, 0.8, 0.1, 0.5)$,
|
||||||
|
the sample will be:
|
||||||
|
\[ \langle \prob{C}, \prob{S | C}, \prob{R | C}, \prob{W | S, R} \rangle \]
|
||||||
|
\[ \langle C=\texttt{false}, \prob{S | C=\texttt{false}}, \prob{R | C=\texttt{false}}, \prob{W | S, R} \rangle \]
|
||||||
|
\[ \langle C=\texttt{false}, S=\texttt{false}, R=\texttt{true}, \prob{W | S=\texttt{false}, R=\texttt{true}} \rangle \]
|
||||||
|
\[ \langle C=\texttt{false}, S=\texttt{false}, R=\texttt{true}, W=\texttt{true} \rangle \]
|
||||||
|
|
||||||
|
Note that the adopted convention is the following:
|
||||||
|
if $r$ is the probability given by a random generator and $\prob{X} = p$, then $X = \texttt{true}$ if $r \leq p$.
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\section{Rejection sampling}
|
||||||
|
\marginnote{Rejection sampling}
|
||||||
|
|
||||||
|
Given a known evidence $\matr{E}$, rejection sampling works as sampling from an empty network
|
||||||
|
but removes any sample that does not agree with the evidence.
|
||||||
|
|
||||||
|
Obviously if $\prob{\matr{E}}$ is low, the majority of the samples will be discarded and
|
||||||
|
more iterations are required to reach the desired number of samples.
|
||||||
|
|
||||||
|
\begin{theorem}
|
||||||
|
Rejection sampling is consistent.
|
||||||
|
|
||||||
|
\begin{proof}
|
||||||
|
Let $\mathcal{N}(\matr{X})$ be the number of times the event $\matr{X}$ has been sampled.
|
||||||
|
\[
|
||||||
|
\begin{split}
|
||||||
|
\hat{\mathcal{P}}(\matr{X} | \matr{E}) &=
|
||||||
|
\frac{\mathcal{N}(\matr{X}, \matr{E})}{\mathcal{N}(\matr{E})} \\
|
||||||
|
&\approx \frac{\prob{\matr{X}, \matr{E}}}{\prob{\matr{E}}} =
|
||||||
|
\prob{\matr{X} | \matr{E}}
|
||||||
|
\end{split}
|
||||||
|
\]
|
||||||
|
The approximation derives from the consistency of sampling from an empty network.
|
||||||
|
\end{proof}
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\section{Likelihood weighting}
|
||||||
|
\marginnote{Likelihood weighting}
|
||||||
|
|
||||||
|
Given a known evidence $\matr{E}$, likelihood weighting samples non-evidence variables and
|
||||||
|
weights each sample by the likelihood of the evidence.
|
||||||
|
|
||||||
|
The probability $\mathcal{S}$ of sampling a specific event $\matr{Z}$ and evidence $\matr{E}$ is given by the
|
||||||
|
probability of the single events in $\matr{Z}$ knowing their parents:
|
||||||
|
\[ \mathcal{S}(\matr{Z}, \matr{E}) = \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(z_i)} \]
|
||||||
|
|
||||||
|
The weight of a sample $(\matr{Z}, \matr{E})$ is given by the
|
||||||
|
probability of the single events in $\matr{E}$ knowing their parents:
|
||||||
|
\[ \text{w}(\matr{Z}, \matr{E}) = \prod_{e_i \in \matr{E}} \prob{e_i | \texttt{parents}(e_i)} \]
|
||||||
|
|
||||||
|
\begin{theorem}
|
||||||
|
Likelihood weighting is consistent.
|
||||||
|
|
||||||
|
\begin{proof}
|
||||||
|
The weighted sampling probability is given by:
|
||||||
|
\[
|
||||||
|
\begin{split}
|
||||||
|
\mathcal{S}(\matr{Z}, \matr{E}) \cdot \text{w}(\matr{Z}, \matr{E})
|
||||||
|
&= \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(z_i)} \cdot \prod_{e_i \in E} \prob{e_i | \texttt{parents}(e_i)} \\
|
||||||
|
&= \prob{\matr{Z}, \matr{E}}
|
||||||
|
\end{split}
|
||||||
|
\]
|
||||||
|
This is a consequence of the global semantics of Bayesian networks.
|
||||||
|
\end{proof}
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
\begin{example}
|
||||||
|
Given the following Bayesian network:
|
||||||
|
\begin{center}
|
||||||
|
\includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
|
||||||
|
\end{center}
|
||||||
|
|
||||||
|
Knowing that $S=\texttt{true}$ and $W=\texttt{false}$,
|
||||||
|
we sample in the order: \texttt{Cloudy}, \texttt{Rain}.
|
||||||
|
|
||||||
|
Assuming that a random generator gives the sequence of probabilities $(0.4, 0.1)$,
|
||||||
|
the sample will be:
|
||||||
|
\[ \langle \prob{C}, S=\texttt{true}, \prob{R | C}, W=\texttt{false} \rangle \]
|
||||||
|
\[ \langle C=\texttt{true}, S=\texttt{true}, \prob{R | C=\texttt{true}}, W=\texttt{false} \rangle \]
|
||||||
|
\[ \langle C=\texttt{true}, S=\texttt{true}, R=\texttt{true}, W=\texttt{false} \rangle \]
|
||||||
|
|
||||||
|
The weight associated with the sample is given by the probabilities of the evidence:
|
||||||
|
\[
|
||||||
|
\begin{split}
|
||||||
|
\text{w} &= \prob{S=\texttt{true} | C=\texttt{true}} \cdot \prob{W=\texttt{false} | S=\texttt{true}, R=\texttt{true}} \\
|
||||||
|
&= 0.1 \cdot (1 - 0.99) = 0.001
|
||||||
|
\end{split}
|
||||||
|
\]
|
||||||
|
\end{example}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
\section{Markov chain Monte Carlo}
|
||||||
|
\marginnote{Markov chain Monte Carlo}
|
||||||
|
|
||||||
|
Sampling on a Markov chain where states contain an assignment to all variables.
|
||||||
|
|
||||||
|
Adjacent states of the Markov chain differ by only one variable.
|
||||||
|
Therefore, the probability of an edge connecting two states is given by the probability of the updated variable given its Markov blanket:
|
||||||
|
\[
|
||||||
|
\prob{x_i | \texttt{markov\_blanket}(X_i)} =
|
||||||
|
\prob{x_i | \texttt{parents}(X_i)} \cdot \prod_{Z_j \in \texttt{children}(X_i)} \prob{z_j | \texttt{parents}(Z_j)}
|
||||||
|
\]
|
||||||
|
|
||||||
|
\begin{theorem}
|
||||||
|
Markov chain Monte Carlo is consistent.
|
||||||
|
|
||||||
|
Note: nevertheless, it is difficult to tell if convergence has been achieved.
|
||||||
|
|
||||||
|
\begin{proof}
|
||||||
|
Consequence of the fact that a long run on a Markov chain converges to the posterior probability of the states.
|
||||||
|
\end{proof}
|
||||||
|
\end{theorem}
|
||||||
|
|
||||||
|
\begin{description}
|
||||||
|
\item[Compiled network]
|
||||||
|
A naive implementation of Markov chain Monte Carlo requires to repeatedly compute the probabilities with the Markov blanket.
|
||||||
|
A solution is to compile the network into a model-specific inference code.
|
||||||
|
\end{description}
|
||||||
|
|
||||||
|
\begin{example}
|
||||||
|
Given the evidence $S=\texttt{true}$ and $W=\texttt{true}$,
|
||||||
|
the structure of the Markov chain can be defined as follows:
|
||||||
|
\begin{center}
|
||||||
|
\includegraphics[width=0.5\textwidth]{img/_markov_chain_sampling.pdf}
|
||||||
|
\end{center}
|
||||||
|
\end{example}
|
||||||
Reference in New Issue
Block a user