Add FAIKR3 approximate inference
@@ -10,6 +10,8 @@
\input{sections/_intro.tex}
\input{sections/_probability.tex}
\input{sections/_bayesian_net.tex}
\input{sections/_inference.tex}
\input{sections/_exact_inference.tex}
\input{sections/_approx_inference.tex}
\eoc

\end{document}
@@ -0,0 +1,196 @@
\chapter{Approximate inference}

\begin{description}
    \item[Stochastic simulation] \marginnote{Stochastic simulation}
    Class of methods that draw $N$ samples from the distribution
    and estimate an approximate posterior $\hat{\mathcal{P}}$.

    \begin{description}
        \item[$\delta$-stochastic absolute approximation]
        Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic absolute approximation has error:
        \[ \left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert \leq \varepsilon \]
        Moreover, the method might fail (with a greater error) with probability $\delta$.

        \item[$\delta$-stochastic relative approximation]
        Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic relative approximation has error:
        \[ \frac{\left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert}{\prob{X | \matr{E}}} \leq \varepsilon \]
        Moreover, the method might fail (with a greater error) with probability $\delta$.
        Note that the relative bound is much stricter for rare events:
        with $\varepsilon = 0.01$ and $\prob{X | \matr{E}} = 0.001$, the absolute bound tolerates an error larger than the probability itself,
        while the relative bound tolerates at most $10^{-5}$.
    \end{description}

    \begin{theorem}
        Approximate inference is NP-hard for any $\delta, \varepsilon < 0.5$.
    \end{theorem}

    \item[Consistency] \marginnote{Consistency}
    A sampling method is consistent if:
    \[ \lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x) = \prob{x} \]
    (a numerical illustration follows this list).
\end{description}

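As an illustration of consistency (an addition for illustration, not from the original notes), the following Python sketch estimates the probability of a biased coin from $N$ samples: as $N$ grows, the estimate $\hat{\mathcal{P}}$ approaches the true $\prob{x}$.
\begin{verbatim}
import random

def estimate(p, N, seed=42):
    # Empirical estimate of P(X = true) from N samples of a
    # biased coin whose true parameter is p.
    rng = random.Random(seed)
    hits = sum(1 for _ in range(N) if rng.random() <= p)
    return hits / N

# The estimate converges to p = 0.3 as N grows (consistency).
for N in (100, 10_000, 1_000_000):
    print(N, estimate(0.3, N))
\end{verbatim}
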
\section{Sampling from an empty network}
\marginnote{Sampling from an empty network}

Sample each variable in topological order (i.e. from parents to children).

The probability $\mathcal{S}$ of sampling a specific event $x_1, \dots, x_n$ is given by the
product of the probabilities of the single events given their parents:
\[ \mathcal{S}(x_1, \dots, x_n) = \prod_{i=1}^n \prob{x_i | \texttt{parents}(X_i)} = \prob{x_1, \dots, x_n} \]

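The following Python sketch implements this procedure on the sprinkler network used in the examples below, with the convention $X = \texttt{true}$ iff $r \leq p$. The CPT values are assumptions (the usual textbook values; the figure is not reproduced here, and only $\prob{S = \texttt{true} | C = \texttt{true}} = 0.1$ and $\prob{W = \texttt{true} | S = \texttt{true}, R = \texttt{true}} = 0.99$ are confirmed by the weight computation later in this chapter).
\begin{verbatim}
import random

rng = random.Random(0)

# CPTs of the sprinkler network (assumed standard values).
P_C = 0.5                                # P(C = true)
P_S = {True: 0.1, False: 0.5}            # P(S = true | C)
P_R = {True: 0.8, False: 0.2}            # P(R = true | C)
P_W = {(True, True): 0.99, (True, False): 0.90,
       (False, True): 0.90, (False, False): 0.0}  # P(W = true | S, R)

def bernoulli(p):
    # Convention: X = true if the generated number r satisfies r <= p.
    return rng.random() <= p

def prior_sample():
    # Sample each variable in topological order (parents first), so the
    # event x1, ..., xn is drawn with probability S(x1, ..., xn).
    c = bernoulli(P_C)
    s = bernoulli(P_S[c])
    r = bernoulli(P_R[c])
    w = bernoulli(P_W[(s, r)])
    return {"C": c, "S": s, "R": r, "W": w}
\end{verbatim}
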
\begin{theorem}
    Sampling from an empty network is consistent.

    \begin{proof}
        Let $N$ be the number of samples and
        $\mathcal{N}(x_1, \dots, x_n)$ the number of times the event $x_1, \dots, x_n$ has been sampled.
        By the law of large numbers, the fraction of samples of an event converges to the probability of sampling it:
        \[
            \begin{split}
                \lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x_1, \dots, x_n) &=
                \lim_{N \rightarrow \infty} \frac{\mathcal{N}(x_1, \dots, x_n)}{N} \\
                &= \mathcal{S}(x_1, \dots, x_n) =
                \prob{x_1, \dots, x_n}
            \end{split}
        \]
    \end{proof}
\end{theorem}

\begin{example}
    Given the following Bayesian network:
    \begin{center}
        \includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
    \end{center}

    A possible sampling order is \texttt{Cloudy}, \texttt{Sprinkler}, \texttt{Rain}, \texttt{WetGrass}.

    Assuming that the random generator gives the sequence of numbers $(0.4, 0.8, 0.1, 0.5)$,
    the sample will be:
    \[ \langle \prob{C}, \prob{S | C}, \prob{R | C}, \prob{W | S, R} \rangle \]
    \[ \langle C=\texttt{true}, \prob{S | C=\texttt{true}}, \prob{R | C=\texttt{true}}, \prob{W | S, R} \rangle \]
    \[ \langle C=\texttt{true}, S=\texttt{false}, R=\texttt{true}, \prob{W | S=\texttt{false}, R=\texttt{true}} \rangle \]
    \[ \langle C=\texttt{true}, S=\texttt{false}, R=\texttt{true}, W=\texttt{true} \rangle \]

    Note that the adopted convention is the following:
    if $r$ is the number given by the random generator and $\prob{X} = p$, then $X = \texttt{true}$ if $r \leq p$.
\end{example}

\section{Rejection sampling}
\marginnote{Rejection sampling}

Given a known evidence $\matr{E}$, rejection sampling works as sampling from an empty network
but discards any sample that does not agree with the evidence.

Clearly, if $\prob{\matr{E}}$ is low, the majority of the samples will be discarded and
more iterations are required to reach the desired number of samples.

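A minimal sketch of rejection sampling, reusing \texttt{prior\_sample} and the assumed CPTs from the previous section (the query/evidence interface is illustrative):
\begin{verbatim}
def rejection_sampling(query_var, evidence, N):
    # Draw N prior samples; keep only those that agree with the evidence.
    counts = {True: 0, False: 0}
    for _ in range(N):
        sample = prior_sample()
        if all(sample[var] == val for var, val in evidence.items()):
            counts[sample[query_var]] += 1
    kept = counts[True] + counts[False]
    if kept == 0:
        raise RuntimeError("all samples rejected: evidence too unlikely")
    return counts[True] / kept  # estimate of P(query = true | evidence)

# e.g. estimate P(R = true | S = true):
# rejection_sampling("R", {"S": True}, 100_000)
\end{verbatim}
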
\begin{theorem}
    Rejection sampling is consistent.

    \begin{proof}
        Let $\mathcal{N}(\matr{X})$ be the number of times the event $\matr{X}$ has been sampled.
        \[
            \begin{split}
                \hat{\mathcal{P}}(\matr{X} | \matr{E}) &=
                \frac{\mathcal{N}(\matr{X}, \matr{E})}{\mathcal{N}(\matr{E})} \\
                &\approx \frac{\prob{\matr{X}, \matr{E}}}{\prob{\matr{E}}} =
                \prob{\matr{X} | \matr{E}}
            \end{split}
        \]
        The approximation derives from the consistency of sampling from an empty network.
    \end{proof}
\end{theorem}

\section{Likelihood weighting}
\marginnote{Likelihood weighting}

Given a known evidence $\matr{E}$, likelihood weighting samples only the non-evidence variables and
weights each sample by the likelihood of the evidence.

The probability $\mathcal{S}$ of sampling a specific event $\matr{Z}$ with evidence $\matr{E}$ is given by the
product of the probabilities of the single events in $\matr{Z}$ given their parents:
\[ \mathcal{S}(\matr{Z}, \matr{E}) = \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(Z_i)} \]

The weight of a sample $(\matr{Z}, \matr{E})$ is given by the
product of the probabilities of the single events in $\matr{E}$ given their parents:
\[ \text{w}(\matr{Z}, \matr{E}) = \prod_{e_i \in \matr{E}} \prob{e_i | \texttt{parents}(E_i)} \]

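A sketch of likelihood weighting on the same network (reusing \texttt{bernoulli} and the assumed CPTs defined above; the interface is illustrative):
\begin{verbatim}
def weighted_sample(evidence):
    # Visit the variables in topological order: evidence variables are
    # fixed and multiply the weight by P(e | parents); the others are
    # sampled as in an empty network.
    sample, weight = {}, 1.0

    def visit(name, p_true):
        nonlocal weight
        if name in evidence:
            sample[name] = evidence[name]
            weight *= p_true if evidence[name] else 1 - p_true
        else:
            sample[name] = bernoulli(p_true)

    visit("C", P_C)
    visit("S", P_S[sample["C"]])
    visit("R", P_R[sample["C"]])
    visit("W", P_W[(sample["S"], sample["R"])])
    return sample, weight

def likelihood_weighting(query_var, evidence, N):
    # Posterior estimate: normalized sums of the sample weights.
    totals = {True: 0.0, False: 0.0}
    for _ in range(N):
        sample, weight = weighted_sample(evidence)
        totals[sample[query_var]] += weight
    return totals[True] / (totals[True] + totals[False])
\end{verbatim}
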
\begin{theorem}
    Likelihood weighting is consistent.

    \begin{proof}
        The weighted sampling probability is given by:
        \[
            \begin{split}
                \mathcal{S}(\matr{Z}, \matr{E}) \cdot \text{w}(\matr{Z}, \matr{E})
                &= \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(Z_i)} \cdot \prod_{e_i \in \matr{E}} \prob{e_i | \texttt{parents}(E_i)} \\
                &= \prob{\matr{Z}, \matr{E}}
            \end{split}
        \]
        This is a consequence of the global semantics of Bayesian networks:
        the joint probability is the product of all the local conditional probabilities.
    \end{proof}
\end{theorem}

\begin{example}
    Given the following Bayesian network:
    \begin{center}
        \includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
    \end{center}

    Knowing that $S=\texttt{true}$ and $W=\texttt{false}$,
    we sample in the order: \texttt{Cloudy}, \texttt{Rain}.

    Assuming that the random generator gives the sequence of numbers $(0.4, 0.1)$,
    the sample will be:
    \[ \langle \prob{C}, S=\texttt{true}, \prob{R | C}, W=\texttt{false} \rangle \]
    \[ \langle C=\texttt{true}, S=\texttt{true}, \prob{R | C=\texttt{true}}, W=\texttt{false} \rangle \]
    \[ \langle C=\texttt{true}, S=\texttt{true}, R=\texttt{true}, W=\texttt{false} \rangle \]

    The weight associated with the sample is given by the probabilities of the evidence:
    \[
        \begin{split}
            \text{w} &= \prob{S=\texttt{true} | C=\texttt{true}} \cdot \prob{W=\texttt{false} | S=\texttt{true}, R=\texttt{true}} \\
            &= 0.1 \cdot (1 - 0.99) = 0.001
        \end{split}
    \]
\end{example}

\section{Markov chain Monte Carlo}
\marginnote{Markov chain Monte Carlo}

Markov chain Monte Carlo samples from a Markov chain whose states are complete assignments to all the variables.

Adjacent states of the Markov chain differ by only one variable.
Therefore, the probability of an edge connecting two states is given by the probability of the updated variable given its Markov blanket,
which is proportional to the product of the factors in which the variable appears:
\[
    \prob{x_i | \texttt{markov\_blanket}(X_i)} \propto
    \prob{x_i | \texttt{parents}(X_i)} \cdot \prod_{Z_j \in \texttt{children}(X_i)} \prob{z_j | \texttt{parents}(Z_j)}
\]

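A sketch of Gibbs sampling, the usual instantiation of Markov chain Monte Carlo for Bayesian networks, again on the assumed sprinkler network. For simplicity it scores each candidate value with the full joint instead of only the Markov-blanket factors: the factors that do not contain the updated variable cancel out in the normalization, so the transition probabilities are the same.
\begin{verbatim}
def joint(a):
    # Global semantics: product of all local conditional probabilities.
    p = P_C if a["C"] else 1 - P_C
    p *= P_S[a["C"]] if a["S"] else 1 - P_S[a["C"]]
    p *= P_R[a["C"]] if a["R"] else 1 - P_R[a["C"]]
    p *= P_W[(a["S"], a["R"])] if a["W"] else 1 - P_W[(a["S"], a["R"])]
    return p

def gibbs_sampling(query_var, evidence, N):
    # State: a complete assignment in which evidence variables stay fixed.
    state = prior_sample()
    state.update(evidence)
    while joint(state) == 0:  # avoid starting from an impossible state
        state = prior_sample()
        state.update(evidence)
    non_evidence = [v for v in ("C", "S", "R", "W") if v not in evidence]
    counts = {True: 0, False: 0}
    for _ in range(N):
        for var in non_evidence:
            # Resample var from P(var | Markov blanket), obtained by
            # normalizing the joint over the two candidate values.
            scores = {}
            for value in (True, False):
                state[var] = value
                scores[value] = joint(state)
            p_true = scores[True] / (scores[True] + scores[False])
            state[var] = bernoulli(p_true)
        counts[state[query_var]] += 1
    return counts[True] / N
\end{verbatim}
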
\begin{theorem}
    Markov chain Monte Carlo is consistent.

    Note: nevertheless, it is difficult to tell whether convergence has been achieved.

    \begin{proof}
        Consequence of the fact that, in the long run, the fraction of time the chain spends in each state converges to its stationary distribution, which coincides with the posterior probability of the states.
    \end{proof}
\end{theorem}

\begin{description}
    \item[Compiled network]
    A naive implementation of Markov chain Monte Carlo repeatedly recomputes the probabilities given the Markov blanket.
    A solution is to compile the network into model-specific inference code (a sketch follows this list).
\end{description}

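A hypothetical sketch of the compilation idea for the variable $R$ of the sprinkler network: the distribution of $R$ given its Markov blanket $\{C, S, W\}$ is tabulated once, so the sampling loop reduces to table lookups instead of recomputing products of CPT entries.
\begin{verbatim}
from itertools import product

def score_R(r, c, s, w):
    # Product of the factors that contain R:
    # P(R = r | C = c) * P(W = w | S = s, R = r).
    p_r = P_R[c] if r else 1 - P_R[c]
    p_w = P_W[(s, r)] if w else 1 - P_W[(s, r)]
    return p_r * p_w

# P(R = true | C, S, W) precomputed for all 8 blanket configurations.
GIBBS_TABLE_R = {
    (c, s, w): score_R(True, c, s, w)
               / (score_R(True, c, s, w) + score_R(False, c, s, w))
    for c, s, w in product((True, False), repeat=3)
}

# In the Gibbs loop, resampling R then becomes a lookup:
# p_true = GIBBS_TABLE_R[(state["C"], state["S"], state["W"])]
\end{verbatim}
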
\begin{example}
    Given the evidence $S=\texttt{true}$ and $W=\texttt{true}$,
    the structure of the Markov chain can be defined as follows:
    \begin{center}
        \includegraphics[width=0.5\textwidth]{img/_markov_chain_sampling.pdf}
    \end{center}
\end{example}