Moved FAIKR3 in year1

This commit is contained in:
2023-12-08 19:45:01 +01:00
parent ac34ec0076
commit e66b724d8c
38 changed files with 1 additions and 1 deletions

View File

@ -0,0 +1,19 @@
{
"name": "Fundamentals of Artificial Intelligence and Knowledge Representation",
"year": 1,
"semester": 1,
"pdfs": [
{
"name": "FAIKR module 1",
"path": "module1/faikr1.pdf"
},
{
"name": "FAIKR module 2",
"path": "module2/faikr2.pdf"
},
{
"name": "FAIKR module 3",
"path": "module3/faikr3.pdf"
}
]
}

View File

@ -0,0 +1 @@
../../../ainotes.cls

View File

@ -0,0 +1,18 @@
\documentclass[11pt]{ainotes}
\title{Fundamentals of Artificial Intelligence and Knowledge Representation\\(Module 3)}
\date{2023 -- 2024}
\def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
\begin{document}
\makenotesfront
\input{sections/_intro.tex}
\input{sections/_probability.tex}
\input{sections/_bayesian_net.tex}
\input{sections/_exact_inference.tex}
\input{sections/_approx_inference.tex}
\eoc
\end{document}

View File

@ -0,0 +1,172 @@
<mxfile host="app.diagrams.net" modified="2023-11-03T14:02:14.670Z" agent="Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/119.0" etag="qJOM_4TLvx20vM_2exk3" version="22.0.0" type="device">
<diagram name="Pagina-1" id="VxJ7IpKM2QnTvTh_py_2">
<mxGraphModel dx="1050" dy="606" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="827" pageHeight="1169" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="BRH9kT0y2_Ae_BUota2q-1" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="700" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-2" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="580" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-3" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="640" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-5" value="" style="endArrow=classic;html=1;rounded=0;entryX=1;entryY=0;entryDx=0;entryDy=0;exitX=0;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeWidth=2;strokeColor=#e01b24;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-1" target="BRH9kT0y2_Ae_BUota2q-3">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="730" y="440" as="sourcePoint" />
<mxPoint x="740" y="400" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-6" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=0;exitX=1;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeWidth=2;strokeColor=#e01b24;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-2" target="BRH9kT0y2_Ae_BUota2q-3">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="740" y="355" as="sourcePoint" />
<mxPoint x="708" y="412" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-7" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="120" y="40" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-8" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="120" y="140" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-9" value="" style="endArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#26a269;strokeWidth=2;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-7" target="BRH9kT0y2_Ae_BUota2q-8">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="211" y="541" as="sourcePoint" />
<mxPoint x="249" y="569" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-10" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="240" y="40" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-11" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="240" y="140" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-12" value="" style="endArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#26a269;strokeWidth=2;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-10" target="BRH9kT0y2_Ae_BUota2q-11">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="331" y="541" as="sourcePoint" />
<mxPoint x="369" y="569" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-13" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="240" y="240" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-15" value="" style="endArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#26a269;strokeWidth=2;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-11" target="BRH9kT0y2_Ae_BUota2q-13">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="280" y="110" as="sourcePoint" />
<mxPoint x="280" y="150" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-16" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="360" y="40" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-17" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="360" y="140" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-18" value="" style="endArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#e01b24;strokeWidth=2;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-16" target="BRH9kT0y2_Ae_BUota2q-17">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="451" y="541" as="sourcePoint" />
<mxPoint x="489" y="569" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-19" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="360" y="240" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-20" value="" style="endArrow=classic;html=1;rounded=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#e01b24;strokeWidth=2;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-17" target="BRH9kT0y2_Ae_BUota2q-19">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="400" y="110" as="sourcePoint" />
<mxPoint x="400" y="150" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-25" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="240" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-26" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="120" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-27" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="180" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-28" value="" style="endArrow=none;html=1;rounded=0;entryX=1;entryY=1;entryDx=0;entryDy=0;fontSize=24;fontFamily=Verdana;endFill=0;startArrow=classic;startFill=1;strokeWidth=2;exitX=0;exitY=0;exitDx=0;exitDy=0;strokeColor=#26a269;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-25" target="BRH9kT0y2_Ae_BUota2q-27">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="220" y="445" as="sourcePoint" />
<mxPoint x="280" y="330" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-29" value="" style="endArrow=none;html=1;rounded=0;entryX=0;entryY=1;entryDx=0;entryDy=0;exitX=1;exitY=0;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;endFill=0;startArrow=classic;startFill=1;strokeWidth=2;strokeColor=#26a269;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-26" target="BRH9kT0y2_Ae_BUota2q-27">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="280" y="285" as="sourcePoint" />
<mxPoint x="248" y="342" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-35" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="450" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-36" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="330" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-37" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="390" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-38" value="" style="endArrow=none;html=1;rounded=0;entryX=1;entryY=1;entryDx=0;entryDy=0;fontSize=24;fontFamily=Verdana;endFill=0;startArrow=classic;startFill=1;strokeWidth=2;exitX=0;exitY=0;exitDx=0;exitDy=0;strokeColor=#e01b24;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-35" target="BRH9kT0y2_Ae_BUota2q-37">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="420" y="435" as="sourcePoint" />
<mxPoint x="490" y="330" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-39" value="" style="endArrow=none;html=1;rounded=0;entryX=0;entryY=1;entryDx=0;entryDy=0;exitX=1;exitY=0;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;endFill=0;startArrow=classic;startFill=1;strokeWidth=2;strokeColor=#e01b24;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-36" target="BRH9kT0y2_Ae_BUota2q-37">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="490" y="285" as="sourcePoint" />
<mxPoint x="458" y="342" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-40" value="Y" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="910" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-41" value="X" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;" vertex="1" parent="1">
<mxGeometry x="790" y="340" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-42" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;fontFamily=Verdana;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="850" y="410" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-43" value="" style="endArrow=classic;html=1;rounded=0;entryX=1;entryY=0;entryDx=0;entryDy=0;exitX=0;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeWidth=2;strokeColor=#26a269;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-40" target="BRH9kT0y2_Ae_BUota2q-42">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="940" y="440" as="sourcePoint" />
<mxPoint x="950" y="400" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-44" value="" style="endArrow=classic;html=1;rounded=0;entryX=0;entryY=0;entryDx=0;entryDy=0;exitX=1;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeWidth=2;strokeColor=#26a269;" edge="1" parent="1" source="BRH9kT0y2_Ae_BUota2q-41" target="BRH9kT0y2_Ae_BUota2q-42">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="950" y="355" as="sourcePoint" />
<mxPoint x="918" y="412" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-45" value="Z" style="ellipse;whiteSpace=wrap;html=1;aspect=fixed;fontSize=24;horizontal=1;fontFamily=Verdana;fillColor=#fff2cc;strokeColor=#d6b656;" vertex="1" parent="1">
<mxGeometry x="560" y="70" width="60" height="60" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-46" value="&lt;font style=&quot;font-size: 24px;&quot;&gt;Evidence&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=24;fontFamily=Times New Roman;" vertex="1" parent="1">
<mxGeometry x="630" y="85" width="250" height="30" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-47" value="" style="endArrow=classic;html=1;rounded=0;exitX=0.5;exitY=1;exitDx=0;exitDy=0;fontSize=24;fontFamily=Verdana;strokeColor=#e01b24;strokeWidth=2;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="560" y="209" as="sourcePoint" />
<mxPoint x="620" y="209" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-48" value="" style="endArrow=classic;html=1;rounded=0;fontSize=24;fontFamily=Verdana;strokeColor=#26a269;strokeWidth=2;" edge="1" parent="1">
<mxGeometry width="50" height="50" relative="1" as="geometry">
<mxPoint x="560" y="160" as="sourcePoint" />
<mxPoint x="620" y="160" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-51" value="&lt;font style=&quot;font-size: 24px;&quot;&gt;Active trail&lt;br style=&quot;font-size: 24px;&quot;&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=24;fontFamily=Times New Roman;" vertex="1" parent="1">
<mxGeometry x="630" y="140" width="250" height="40" as="geometry" />
</mxCell>
<mxCell id="BRH9kT0y2_Ae_BUota2q-52" value="Non-&lt;font style=&quot;font-size: 24px;&quot;&gt;active trail&lt;br style=&quot;font-size: 24px;&quot;&gt;&lt;/font&gt;" style="text;html=1;strokeColor=none;fillColor=none;align=left;verticalAlign=middle;whiteSpace=wrap;rounded=0;fontSize=24;fontFamily=Times New Roman;" vertex="1" parent="1">
<mxGeometry x="630" y="190" width="250" height="40" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>
</mxfile>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,196 @@
\chapter{Approximate inference}
\begin{description}
\item[Stochastic simulation] \marginnote{Stochastic simulation}
Class of methods that draw $N$ samples from the distribution
and estimate an approximate posterior $\hat{\mathcal{P}}$.
\begin{description}
\item[$\delta$-stochastic absolute approximation]
Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic absolute approximation has error:
\[ \left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert \leq \varepsilon \]
Moreover, the method might fail (with greater error) with probability $\delta$.
\item[$\delta$-stochastic relative approximation]
Given $\delta \in ]0, 0.5[$ and $\varepsilon \in ]0, 0.5[$, a $\delta$-stochastic relative approximation has error:
\[ \frac{\left\vert \prob{X | \matr{E}} - \hat{\mathcal{P}}(X | \matr{E}) \right\vert}{\prob{X | \matr{E}}} \leq \varepsilon \]
Moreover, the method might fail (with greater error) with probability $\delta$.
\end{description}
\begin{theorem}
Approximate inference is NP-hard for any $\delta, \varepsilon < 0.5$.
\end{theorem}
\item[Consistency] \marginnote{Consistency}
A sampling method is consistent if:
\[ \lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x) = \prob{x} \]
\end{description}
\section{Sampling from an empty network}
\marginnote{Sampling from an empty network}
Sample each variable in topological order (i.e. from parents to children).
The probability $\mathcal{S}$ of sampling a specific event $x_1, \dots, x_n$ is given by the
probability of the single events knowing their parents:
\[ \mathcal{S}(x_1, \dots, x_n) = \prod_{i=1}^n \prob{x_i | \texttt{parents}(X_i)} = \prob{x_1, \dots, x_n} \]
\begin{theorem}
Sampling from an empty network is consistent.
\begin{proof}
Let $N$ be the number of samples and
$\mathcal{N}(x_1, \dots, x_n)$ the number of times the event $x_1, \dots, x_n$ has been sampled.
\[
\begin{split}
\lim_{N \rightarrow \infty} \hat{\mathcal{P}}(x_1, \dots, x_n) &=
\lim_{N \rightarrow \infty} \frac{\mathcal{N}(x_1, \dots, x_n)}{N} \\
&= \mathcal{S}(x_1, \dots, x_n) =
\prob{x_1, \dots, x_n}
\end{split}
\]
\end{proof}
\end{theorem}
\begin{example}
Given the following Bayesian network:
\begin{center}
\includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
\end{center}
A possible sampling order is \texttt{Cloudy}, \texttt{Sprinkler}, \texttt{Rain}, \texttt{WetGrass}.
Assuming that a random generator gives the sequence of probabilities $(0.4, 0.8, 0.1, 0.5)$,
the sample will be:
\[ \langle \prob{C}, \prob{S | C}, \prob{R | C}, \prob{W | S, R} \rangle \]
\[ \langle C=\texttt{false}, \prob{S | C=\texttt{false}}, \prob{R | C=\texttt{false}}, \prob{W | S, R} \rangle \]
\[ \langle C=\texttt{false}, S=\texttt{false}, R=\texttt{true}, \prob{W | S=\texttt{false}, R=\texttt{true}} \rangle \]
\[ \langle C=\texttt{false}, S=\texttt{false}, R=\texttt{true}, W=\texttt{true} \rangle \]
Note that the adopted convention is the following:
if $r$ is the value given by a random generator and $\prob{X} = p$, then $X = \texttt{true}$ if $r \leq p$.
\end{example}
\section{Rejection sampling}
\marginnote{Rejection sampling}
Given a known evidence $\matr{E}$, rejection sampling works as sampling from an empty network
but removes any sample that does not agree with the evidence.
Obviously if $\prob{\matr{E}}$ is low, the majority of the samples will be discarded and
more iterations are required to reach the desired number of samples.
\begin{theorem}
Rejection sampling is consistent.
\begin{proof}
Let $\mathcal{N}(\matr{X})$ be the number of times the event $\matr{X}$ has been sampled.
\[
\begin{split}
\hat{\mathcal{P}}(\matr{X} | \matr{E}) &=
\frac{\mathcal{N}(\matr{X}, \matr{E})}{\mathcal{N}(\matr{E})} \\
&\approx \frac{\prob{\matr{X}, \matr{E}}}{\prob{\matr{E}}} =
\prob{\matr{X} | \matr{E}}
\end{split}
\]
The approximation derives from the consistency of sampling from an empty network.
\end{proof}
\end{theorem}
\section{Likelihood weighting}
\marginnote{Likelihood weighting}
Given a known evidence $\matr{E}$, likelihood weighting samples non-evidence variables and
weights each sample by the likelihood of the evidence.
The probability $\mathcal{S}$ of sampling a specific event $\matr{Z}$ and evidence $\matr{E}$ is given by the
probability of the single events in $\matr{Z}$ knowing their parents:
\[ \mathcal{S}(\matr{Z}, \matr{E}) = \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(z_i)} \]
The weight of a sample $(\matr{Z}, \matr{E})$ is given by the
probability of the single events in $\matr{E}$ knowing their parents:
\[ \text{w}(\matr{Z}, \matr{E}) = \prod_{e_i \in \matr{E}} \prob{e_i | \texttt{parents}(e_i)} \]
\begin{theorem}
Likelihood weighting is consistent.
\begin{proof}
The weighted sampling probability is given by:
\[
\begin{split}
\mathcal{S}(\matr{Z}, \matr{E}) \cdot \text{w}(\matr{Z}, \matr{E})
&= \prod_{z_i \in \matr{Z}} \prob{z_i | \texttt{parents}(z_i)} \cdot \prod_{e_i \in E} \prob{e_i | \texttt{parents}(e_i)} \\
&= \prob{\matr{Z}, \matr{E}}
\end{split}
\]
This is a consequence of the global semantics of Bayesian networks.
\end{proof}
\end{theorem}
\begin{example}
Given the following Bayesian network:
\begin{center}
\includegraphics[width=0.5\textwidth]{img/_approx_infer_example.pdf}
\end{center}
Knowing that $S=\texttt{true}$ and $W=\texttt{false}$,
we sample in the order: \texttt{Cloudy}, \texttt{Rain}.
Assuming that a random generator gives the sequence of probabilities $(0.4, 0.1)$,
the sample will be:
\[ \langle \prob{C}, S=\texttt{true}, \prob{R | C}, W=\texttt{false} \rangle \]
\[ \langle C=\texttt{true}, S=\texttt{true}, \prob{R | C=\texttt{true}}, W=\texttt{false} \rangle \]
\[ \langle C=\texttt{true}, S=\texttt{true}, R=\texttt{true}, W=\texttt{false} \rangle \]
The weight associated to the sample is given by the probability of the evidence:
\[
\begin{split}
\text{w} &= \prob{S=\texttt{true} | C=\texttt{true}} \cdot \prob{W=\texttt{false} | S=\texttt{true}, R=\texttt{true}} \\
&= 0.1 \cdot (1 - 0.99) = 0.001
\end{split}
\]
\end{example}
\section{Markov chain Monte Carlo}
\marginnote{Markov chain Monte Carlo}
Sampling on a Markov chain where states contain an assignment to all variables.
Adjacent states of the Markov chain differ by only one variable.
Therefore, the probability of an edge connecting two states is given by the probability of the updated variable given its Markov blanket:
\[
\prob{x_i | \texttt{markov\_blanket}(X_i)} \propto
\prob{x_i | \texttt{parents}(X_i)} \cdot \prod_{Z_j \in \texttt{children}(X_i)} \prob{z_j | \texttt{parents}(Z_j)}
\]
\begin{theorem}
Markov chain Monte Carlo is consistent.
Note: nevertheless, it is difficult to tell if convergence has been achieved.
\begin{proof}
Consequence of the fact that a long-run on a Markov chain converges to the posterior probability of the states.
\end{proof}
\end{theorem}
\begin{description}
\item[Compiled network]
A naive implementation of Markov chain Monte Carlo requires to repeatedly compute the probabilities with the Markov blanket.
A solution is to compile the network into a model-specific inference code.
\end{description}
\begin{example}
Given the evidence $S=\texttt{true}$ and $W=\texttt{true}$,
the structure of the Markov chain can be defined as follows:
\begin{center}
\includegraphics[width=0.5\textwidth]{img/_markov_chain_sampling.pdf}
\end{center}
\end{example}

View File

@ -0,0 +1,557 @@
\chapter{Bayesian networks}
\section{Bayes' rule}
\begin{description}
\item[Bayes' rule] \marginnote{Bayes' rule}
\[ \prob{a \,\vert\, b} = \frac{\prob{b \,\vert\, a} \prob{a}}{\prob{b}} \]
\item[Bayes' rule and conditional independence]
Given the random variables $\texttt{Cause}$ and\\
$\texttt{Effect}_1, \dots, \texttt{Effect}_n$, with the $\texttt{Effect}_i$ conditionally independent from each other given $\texttt{Cause}$,
we can compute $\textbf{P}(\texttt{Cause}, \texttt{Effect}_1, \dots, \texttt{Effect}_n)$ as follows:
\[
\textbf{P}(\texttt{Cause}, \texttt{Effect}_1, \dots, \texttt{Effect}_n) =
\left(\prod_i \textbf{P}(\texttt{Effect}_i \,\vert\, \texttt{Cause})\right) \textbf{P}(\texttt{Cause})
\]
The number of parameters is linear.
\begin{example}
Knowing that $\textbf{P} \models (\texttt{Catch} \perp \texttt{Toothache} \vert \texttt{Cavity})$:
\[
\begin{split}
\textbf{P}&(\texttt{Cavity} \,\vert\, \texttt{toothache} \land \texttt{catch}) \\
&= \alpha\textbf{P}(\texttt{toothache} \land \texttt{catch} \,\vert\, \texttt{Cavity})\textbf{P}(\texttt{Cavity}) \\
&= \alpha\textbf{P}(\texttt{toothache} \,\vert\, \texttt{Cavity})
\textbf{P}(\texttt{catch} \,\vert\, \texttt{Cavity})\textbf{P}(\texttt{Cavity}) \\
\end{split}
\]
\end{example}
\end{description}
\section{Bayesian network reasoning}
\begin{description}
\item[Bayesian network] \marginnote{Bayesian network}
Graph for conditional independence assertions and a compact specification of full joint distributions.
\begin{itemize}
\item Directed acyclic graph.
\item Nodes represent variables.
\item The conditional distribution of a node is given by its parents
\[ \textbf{P}(X_i \,\vert\, \texttt{parents}(X_i)) \]
In other words, if there is an edge from $A$ to $B$, then $A$ (cause) influences $B$ (effect).
\end{itemize}
\begin{description}
\item[Conditional probability table (CPT)] \marginnote{Conditional probability table (CPT)}
In the case of boolean variables, the conditional distribution of a node can be represented using
a table by considering all the combinations of the parents.
\begin{example}
Given the boolean variables $A$, $B$ and $C$, with $C$ depending on $A$ and $B$, we have that:\\
\begin{minipage}{.48\linewidth}
\centering
\includegraphics[width=0.35\linewidth]{img/_cpt_graph.pdf}
\end{minipage}
\begin{minipage}{.48\linewidth}
\centering
\begin{tabular}{c|c|c|c}
A & B & $\prob{c \vert A, B}$ & $\prob{\lnot c \vert A, B}$ \\
\hline
a & b & $\alpha$ & $1-\alpha$ \\
$\lnot$a & b & $\beta$ & $1-\beta$ \\
a & $\lnot$b & $\gamma$ & $1-\gamma$ \\
$\lnot$a & $\lnot$b & $\delta$ & $1-\delta$ \\
\end{tabular}
\end{minipage}
\end{example}
\end{description}
\item[Reasoning patterns] \marginnote{Reasoning patterns}
Given a Bayesian network, the following reasoning patterns can be used:
\begin{descriptionlist}
\item[Causal] \marginnote{Causal reasoning}
To make a prediction. From the cause, derive the effect.
\begin{example}
Knowing $\texttt{Intelligence}$, it is possible to make a prediction of $\texttt{Letter}$.
\begin{center}
\includegraphics[width=0.5\linewidth]{img/_causal_example.pdf}
\end{center}
\end{example}
\item[Evidential] \marginnote{Evidential reasoning}
To find an explanation. From the effect, derive the cause.
\begin{example}
Knowing $\texttt{Grade}$, it is possible to explain it by estimating\\$\texttt{Intelligence}$.
\begin{center}
\includegraphics[width=0.7\linewidth]{img/_evidential_example.pdf}
\end{center}
\end{example}
\item[Explain away] \marginnote{Explain away reasoning}
Observation obtained ``passing through'' other observations.
\begin{example}
Knowing $\texttt{Difficulty}$ and $\texttt{Grade}$,
it is possible to estimate \\$\texttt{Intelligence}$.
Note that if $\texttt{Grade}$ was not known,
$\texttt{Difficulty}$ and $\texttt{Intelligence}$ would have been independent.
\begin{center}
\includegraphics[width=0.75\linewidth]{img/_explainaway_example.pdf}
\end{center}
\end{example}
\end{descriptionlist}
\item[Independence] \marginnote{Bayesian network independence}
Intuitively, an effect is independent from a cause,
if there is another cause in the middle whose value is already known.
\begin{example}
\phantom{}
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=0.85\linewidth]{img/_independence_example.pdf}
\end{minipage}
\begin{minipage}{.6\linewidth}
\[ \textbf{P} \models (\texttt{L} \perp \texttt{D}, \texttt{I}, \texttt{S} \,\vert\, \texttt{G}) \]
\[ \textbf{P} \models (\texttt{S} \perp \texttt{L} \,\vert\, \texttt{G}) \]
\[ \textbf{P} \models (\texttt{S} \perp \texttt{D}) \text{ but }
\textbf{P} \models (\texttt{S} \,\cancel{\perp}\, \texttt{D} \,\vert\, \texttt{G}) \text{ (explain away)} \]
\end{minipage}
\end{example}
\item[V-structure] \marginnote{V-structure}
Effect with two causes.
If the effect is not in the evidence, the causes are independent.
\begin{figure}[H]
\centering
\includegraphics[width=0.2\textwidth]{img/_v_structure.pdf}
\caption{V-structure}
\end{figure}
\item[Active two-edge trail] \marginnote{Active two-edge trail}
The trail $X \leftrightharpoons Z \leftrightharpoons Y$ is active if either:
\begin{itemize}
\item $X$, $Z$, $Y$ is a v-structure $X \rightarrow Z \leftarrow Y$
and $Z$ or one of its children is in the evidence.
\item $Z$ is not in the evidence.
\end{itemize}
In other words, influence can flow from $X$ to $Y$ passing by $Z$.
\begin{figure}[h]
\centering
\includegraphics[width=0.65\textwidth]{img/_active_trail.pdf}
\caption{Example of active and non-active two-edge trails}
\end{figure}
\item[Active trail] \marginnote{Active trail}
A trail $X_1 \leftrightharpoons \dots \leftrightharpoons X_n$ is active iff
each two-edge trail $X_{i-1} \leftrightharpoons X_i \leftrightharpoons X_{i+1}$ along the trail is active.
\item[D-separation] \marginnote{D-separation}
Two sets of nodes $\vec{X}$ and $\vec{Y}$ are d-separated given the evidence $\vec{Z}$ if
there is no active trail between any $X \in \vec{X}$ and $Y \in \vec{Y}$.
\begin{theorem}
Two d-separated nodes are independent.
In other words, two nodes are independent if there are no active trails between them.
\end{theorem}
\item[Independence algorithm] \phantom{}
\begin{description}
\item[Blocked node]
A node is blocked if it blocks the flow.
This happens if one and only one of the following conditions is met:
\begin{itemize}
\item The node is in the middle of an unmarked v-structure.
\item The node is in the evidence.
\end{itemize}
\end{description}
To determine if $X \perp Y$ given the evidence $\vec{Z}$:
\begin{enumerate}
\item Traverse the graph bottom-up marking all nodes in $\vec{Z}$ or
having a child in $\vec{Z}$.
\item Find a path from $X$ to $Y$ that does not pass through a blocked node.
\item If $Y$ is not reachable from $X$, then $X$ and $Y$ are independent.
Otherwise $X$ and $Y$ are dependent.
\end{enumerate}
\begin{example}
To determine if $J \perp D$:
\begin{center}
\includegraphics[width=0.5\textwidth]{img/_d_sep_example.pdf}
\end{center}
As a path has been found, $J \,\cancel{\perp}\, D$.
\end{example}
\item[Global semantics] \marginnote{Global semantics}
Given a Bayesian network, the full joint distribution can be defined as
the product of the local conditional distributions:
\[ \prob{x_1, \dots, x_n} = \prod_{i=1}^{n} \prob{x_i \,\vert\, \texttt{parents}(X_i)} \]
\begin{example}
Given the following Bayesian network:
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=0.7\linewidth]{img/_global_semantics_example.pdf}
\end{minipage}
\begin{minipage}{.6\linewidth}
\[
\begin{split}
&\prob{j \land m \land a \land \lnot b \land \lnot e} \\
&= \prob{\lnot b} \prob{\lnot e} \prob{a \,\vert\, \lnot b, \lnot e}
\prob{j \,\vert\, a} \prob{m \,\vert\, a}
\end{split}
\]
\end{minipage}
\end{example}
\item[Local semantics]
Each node is conditionally independent of its non-descendants given its parents.
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/_local_independence.pdf}
\caption{Local independence}
\end{figure}
\begin{theorem}
Local semantics $\iff$ Global semantics
\end{theorem}
\item[Markov blanket]
Each node is conditionally independent of all the other nodes
if its Markov blanket (parents, children, children's parents) is in the evidence.
\begin{figure}[h]
\centering
\includegraphics[width=0.35\textwidth]{img/_markov_blanket.pdf}
\caption{Markov blanket}
\end{figure}
\end{description}
\section{Building Bayesian networks}
\subsection{Algorithm}
The following algorithm can be used to construct a Bayesian network of $n$ random variables:
\begin{enumerate}
\item Choose an ordering of the variables $X_1, \dots, X_n$.
\item For $i=1, \dots, n$:
\begin{itemize}
\item Add $X_i$ to the network.
\item Select the parents of $X_i$ from $X_1, \dots, X_{i-1}$ such that:
\[ \textbf{P}(X_i \,\vert\, \texttt{parents}(X_i)) =
\textbf{P}(X_i \,\vert\, X_1, \dots, X_{i-1}) \]
\end{itemize}
\end{enumerate}
By construction, this algorithm guarantees the global semantics.
\begin{example}[Monty Hall]
The variables are:
\begin{itemize}
\item $G$: the choice of the guest.
\item $H$: the choice of the host.
\item $P$: the position of the prize.
\end{itemize}
Note that $P \perp G$.
Let the order be fixed as follows: $P$, $G$, $H$.
\begin{figure}[h]
\begin{subfigure}{.3\textwidth}
\centering
\includegraphics[width=0.15\linewidth]{img/_monty_hall1.pdf}
\caption{First interaction}
\end{subfigure}
\begin{subfigure}{.3\textwidth}
\centering
\includegraphics[width=0.45\linewidth]{img/_monty_hall2.pdf}
\caption{Second interaction (note that $P \perp G$)}
\end{subfigure}
\begin{subfigure}{.3\textwidth}
\centering
\includegraphics[width=0.45\linewidth]{img/_monty_hall3.pdf}
\caption{Third interaction}
\end{subfigure}
\end{figure}
\end{example}
The nodes of the resulting network can be classified as:
\begin{descriptionlist}
\item[Initial evidence] The initial observation.
\item[Testable variables] Variables that can be verified.
\item[Operable variables] Variables that can be changed by intervening on them.
	\item[Hidden variables] Variables that ``compress'' more variables to reduce the parameters.
\end{descriptionlist}
\begin{example} \phantom{}\\
\begin{minipage}{.4\linewidth}
\begin{description}
\item[Initial evidence] Red.
\item[Testable variables] Green.
\item[Operable variables] Orange.
\item[Hidden variables] Gray.
\end{description}
\end{minipage}
\begin{minipage}{.5\linewidth}
\begin{center}
\includegraphics[width=\linewidth]{img/_car_example.pdf}
\end{center}
\end{minipage}
\end{example}
\subsection{Structure learning}
\marginnote{Structure learning}
Learn the network from the available data.
\begin{description}
\item[Constraint-based]
Independence tests to identify the constraints of the edges.
\item[Score-based]
Define a score to evaluate the network.
\end{description}
\section{Causal networks}
When building a Bayesian network, a correct ordering of the nodes
that respects causality makes it possible to obtain more compact networks.
\begin{description}
\item[Structural equation] \marginnote{Structural equation}
Given a variable $X_i$ with values $x_i$, its structural equation is a function $f_i$
such that it represents all its possible values:
\[ x_i = f_i(\text{other variables}, U_i) \]
$U_i$ represents unmodeled variables or error terms.
\item[Causal network] \marginnote{Causal network}
Restricted class of Bayesian networks that only allows causally compatible ordering.
An edge exists between $X_j \rightarrow X_i$ iff $X_j$ is an argument of
the structural equation $f_i$ of $X_i$.
\begin{example} \phantom{}\\[0.5em]
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=\linewidth]{img/_causal_network_example1.pdf}
\end{minipage}
\begin{minipage}{.6\linewidth}
The structural equations are:
\[
\begin{split}
\texttt{cloudy} &= f_C(U_C) \\
\texttt{sprinkler} &= f_S(\texttt{Cloudy}, U_S) \\
\texttt{rain} &= f_R(\texttt{Cloudy}, U_R) \\
\texttt{wet\_grass} &= f_W(\texttt{Sprinkler}, \texttt{Rain}, U_W) \\
\texttt{greener\_grass} &= f_G(\texttt{WetGrass}, U_G)
\end{split}
\]
\end{minipage}\\[0.5em]
If the sprinkler is disabled, the network becomes:\\[0.5em]
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=\linewidth]{img/_causal_network_example2.pdf}
\end{minipage}
\begin{minipage}{.6\linewidth}
The structural equations become:
\[
\begin{split}
\texttt{cloudy} &= f_C(U_C) \\
\texttt{sprinkler} &= f_S(U_S) \\
\texttt{rain} &= f_R(\texttt{Cloudy}, U_R) \\
\texttt{wet\_grass} &= f_W(\texttt{Rain}, U_W) \\
\texttt{greener\_grass} &= f_G(\texttt{WetGrass}, U_G)
\end{split}
\]
\end{minipage}
\end{example}
\item[do-operator] \marginnote{do-operator}
The do-operator allows to represent manual interventions on the network.
The operation $\texttt{do}(X_i = x_i)$ makes the structural equation of $X_i$
constant (i.e. $f_i = x_i$, without arguments, so there won't be inward edges to $X_i$).
\begin{example} \phantom{}\\[0.5em]
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=\linewidth]{img/_do_operator_example1.pdf}
\end{minipage}
\begin{minipage}{.65\linewidth}
By applying $\texttt{do}(\texttt{Sprinkler} = \texttt{true})$, the structural equations become:
\[
\begin{split}
\texttt{cloudy} &= f_C(U_C) \\
\texttt{sprinkler} &= \texttt{true} \\
\texttt{rain} &= f_R(\texttt{Cloudy}, U_R) \\
\texttt{wet\_grass} &= f_W(\texttt{Sprinkler}, \texttt{Rain}, U_W) \\
\texttt{greener\_grass} &= f_G(\texttt{WetGrass}, U_G)
\end{split}
\]
\end{minipage}\\[0.5em]
\begin{minipage}{.3\linewidth}
\centering
\includegraphics[width=\linewidth]{img/_do_operator_example2.pdf}
\end{minipage}
\begin{minipage}{.65\linewidth}
Note that Bayesian networks are not capable of modelling manual interventions.
In fact, intervening and observing a variable are different concepts:
\[ \prob{\texttt{WetGrass} \mid \texttt{do}(\texttt{Sprinkler} = \texttt{true})} \]
\[ \neq \]
\[ \prob{\texttt{WetGrass} \mid \texttt{Sprinkler} = \texttt{true}} \]
\end{minipage}
\end{example}
\end{description}
\section{Compact conditional distributions}
Use canonical distributions (standard patterns) to reduce
the number of parameters in a conditional probability table.
\subsection{Noisy-OR}
\marginnote{Noisy-OR}
Noisy-OR distributions model a network of non-interacting causes with a common effect.
A node $X$ has $k$ parents $U_1, \dots, U_k$ and possibly a leak node $U_L$ to capture unmodeled concepts.
\begin{figure}[h]
\centering
\includegraphics[width=0.3\textwidth]{img/_noisy_or_example.pdf}
\caption{Example of noisy-OR network}
\end{figure}
Each node $U_i$ has a failure (inhibition) probability $q_i$:
\[ q_i = \prob{\lnot x \mid u_i, \lnot u_j \text{ for } j \neq i} \]
The CPT can be built by computing the probabilities as:
\[ \prob{\lnot x \mid \texttt{Parents($X$)}} = \prod_{j:\, U_j = \texttt{true}} q_j \]
In other words:
\[ \prob{\lnot x \mid u_1, \dots, u_n} =
\prob{\lnot x \mid u_1} \cdot \prob{\lnot x \mid u_2} \cdots \prob{\lnot x \mid u_n} \]
Because only the failure probabilities are required, the number of parameters is linear in the number of parents.
\begin{example}
We have as causes \texttt{Cold}, \texttt{Flu} and \texttt{Malaria} and as effect \texttt{Fever}.
For simplicity there are no leak nodes.
The failure probabilities are:
\[
\begin{split}
q_\texttt{cold} &= \prob{\lnot \texttt{fever} \mid \texttt{cold}, \lnot\texttt{flu}, \lnot\texttt{malaria}} = 0.6 \\
q_\texttt{flu} &= \prob{\lnot \texttt{fever} \mid \lnot\texttt{cold}, \texttt{flu}, \lnot\texttt{malaria}} = 0.2 \\
q_\texttt{malaria} &= \prob{\lnot \texttt{fever} \mid \lnot\texttt{cold}, \lnot\texttt{flu}, \texttt{malaria}} = 0.1
\end{split}
\]
Known the failure probabilities, the entire CPT can be computed:
\begin{center}
\begin{tabular}{c|c|c|rc|c}
\hline
\texttt{Cold} & \texttt{Flu} & \texttt{Malaria} & \multicolumn{2}{c|}{$\prob{\lnot\texttt{fever}}$} & $1-\prob{\lnot\texttt{fever}}$ \\
\hline
            F & F & F &                                                        & 1.0   & 0.0 \\
F & F & T & $q_\texttt{malaria} =$ & 0.1 & 0.9 \\
F & T & F & $q_\texttt{flu} =$ & 0.2 & 0.8 \\
F & T & T & $q_\texttt{flu} \cdot q_\texttt{malaria} =$ & 0.02 & 0.98 \\
T & F & F & $q_\texttt{cold} =$ & 0.6 & 0.4 \\
T & F & T & $q_\texttt{cold} \cdot q_\texttt{malaria} =$ & 0.06 & 0.94 \\
T & T & F & $q_\texttt{cold} \cdot q_\texttt{flu} =$ & 0.12 & 0.88 \\
T & T & T & $q_\texttt{cold} \cdot q_\texttt{flu} \cdot q_\texttt{malaria} =$ & 0.012 & 0.988 \\
\hline
\end{tabular}
\end{center}
\end{example}
\subsection{Hybrid Bayesian networks}
\marginnote{Hybrid Bayesian networks}
Network with discrete and continuous random variables.
Continuous variables must be converted into a finite representation.
Possible approaches are:
\begin{description}
\item[Discretization] \marginnote{Discretization}
Values are divided into a fixed set of intervals.
This approach may introduce large errors and large CPTs.
\item[Finitely parametrized canonical families] \marginnote{Finitely parametrized canonical families}
There are two cases to handle using this approach:
\begin{descriptionlist}
\item[Continuous child]
Given the continuous variables $X$ and $C$ and a discrete (boolean, for simplicity) variable $D$,
we want to compute the distribution $\textbf{P}(X \mid C, D)$.
The discrete parent is handled by enumeration, by computing the probability over the domain of $D$.
For the continuous parent, an arbitrarily chosen distribution over the values of $X$ is used.
A common choice is the \textbf{linear Gaussian} \marginnote{Linear Gaussian}
whose mean is a linear combination of the values of the parents and the variance is fixed.
A network with all continuous linear Gaussian distributions has the property
of having a multivariate Gaussian distribution as joint distribution.
Moreover, if a continuous variable has some discrete parents, it defines a conditional Gaussian distribution
where, fixed the values of the discrete variables, the distribution over the continuous variable is a multivariate Gaussian.
\begin{example}
Let \texttt{Subsidy} and \texttt{Buys} be discrete variables and
\texttt{Harvest} and \texttt{Cost} be continuous variables.
\begin{center}
\includegraphics[width=0.3\textwidth]{img/_linear_gaussian_example.pdf}
\end{center}
To compute $\textbf{P}(\texttt{Cost} \mid \texttt{Harvest}, \texttt{Subsidy})$,
we split the probabilities over the values of the discrete variable \texttt{Subsidy}
and use a linear Gaussian for \texttt{Harvest}.
We therefore have that:
\[
\begin{split}
\prob{\texttt{C} = \texttt{c} \mid \texttt{Harvest} = \texttt{h}, \texttt{Subsidy} = \texttt{true}}
&= \mathcal{N}(a_t h + b_t, \sigma_t)(c) \\
\prob{\texttt{C} = \texttt{c} \mid \texttt{Harvest} = \texttt{h}, \texttt{Subsidy} = \texttt{false}}
&= \mathcal{N}(a_f h + b_f, \sigma_f)(c)
\end{split}
\]
                where $a_t$, $b_t$, $\sigma_t$, $a_f$, $b_f$ and $\sigma_f$ are parameters.
\end{example}
\item[Discrete child with continuous parents]
Given the continuous variable $C$ and a discrete variable $X$,
        the probability of $X$ given $C$ is obtained by using a threshold function.
For instance, probit or sigmoid distributions can be used.
\end{descriptionlist}
\end{description}
\subsection{Other methods}
\begin{description}
\item[Dynamic Bayesian network] \marginnote{Dynamic Bayesian network}
Useful to model the evolution through time.
A template variable $X_i$ is instantiated as $X_i^{(t)}$ at each time step.
\begin{figure}[h]
\centering
\includegraphics[width=0.3\textwidth]{img/_dynamic_bn_example.pdf}
\caption{Example of dynamic Bayesian network}
\end{figure}
\item[Density estimation] \marginnote{Density estimation}
Parameters of the conditional distribution can be learned using:
\begin{description}
\item[Bayesian learning] calculate the probability of each hypothesis.
\item[Approximations] using the maximum-a-posteriori and maximum-likelihood hypothesis.
\item[Expectation-maximization algorithm{\normalfont.}]
\end{description}
\item[Undirected graphical models] \marginnote{Undirected graphical models}
Markov networks are an alternative to probabilistic graphical models (as Bayesian networks).
Markov networks are undirected graphs with factors (instead of probabilities) and
are able to naturally capture independence relations.
\end{description}

View File

@ -0,0 +1,118 @@
\chapter{Exact inference}
\section{Inference by enumeration}
\marginnote{Inference by enumeration}
Method to sum out a joint probability without explicitly representing it
by using CPT entries.
Enumeration follows a depth-first exploration and has a space complexity of $O(n)$
and time complexity of $O(d^n)$.
It must be noted that some probabilities appear multiple times but
require to be recomputed because of the definition of the algorithm.
\begin{example}[Burglary]
Given the Bayesian network:
\begin{center}
\includegraphics[width=0.15\textwidth]{img/_burglary_net.pdf}
\end{center}
We want to compute $\textbf{P}(B \mid j, m)$:
\[
\begin{split}
\textbf{P}(B \mid j, m) &= \frac{\textbf{P}(B, j, m)}{\prob{j, m}} \\
&= \alpha \textbf{P}(B, j, m) \\
&= \alpha \sum_{e} \sum_{a} \textbf{P}(B, j, m, e, a) \\
&= \alpha \sum_{e} \sum_{a} \textbf{P}(B) \prob{e} \textbf{P}(a \mid B, e) \prob{j \mid a} \prob{m \mid a} \\
&= \alpha \textbf{P}(B) \sum_{e} \prob{e} \sum_{a} \textbf{P}(a \mid B, e) \prob{j \mid a} \prob{m \mid a} \\
\end{split}
\]
This can be represented using a tree:
\begin{center}
\includegraphics[width=0.75\textwidth]{img/_burglary_enumeration.pdf}
\end{center}
\end{example}
\section{Inference by variable elimination}
\marginnote{Inference by variable elimination}
Method that carries out summations right-to-left and stores intermediate results (called factors).
\begin{description}
\item[Pointwise product of factors] $f(X, Y) \times g(Y, Z) = p(X, Y, Z)$
\begin{figure}[h]
\centering
\includegraphics[width=0.5\textwidth]{img/_pointwise_factors.pdf}
\caption{Example of pointwise product}
\end{figure}
\item[Summing out]
To sum out a variable $X$ from a product of factors:
\begin{enumerate}
\item Move constant factors outside (i.e. factors that do not depend on $X$).
\item Compute the pointwise product of the remaining terms.
\end{enumerate}
\begin{example}
\[
\begin{split}
\sum_X f_1 \times \dots \times f_k &= f_1 \times \dots \times f_i \sum_X f_{i+1} \times \dots \times f_k \\
&= f_1 \times \dots \times f_i \times f_X
\end{split}
\]
\end{example}
\end{description}
\begin{example}[Burglary]
Given the Bayesian network:
\begin{center}
\includegraphics[width=0.15\textwidth]{img/_burglary_net.pdf}
\end{center}
We want to compute
$\textbf{P}(B \mid j, m) = \alpha \textbf{P}(B) \sum_{e} \prob{e} \sum_{a} \textbf{P}(a \mid B, e) \prob{j \mid a} \prob{m \mid a}$.
We first work on the summation on $A$.
We introduce as factors the entries of the CPT:
\[ \textbf{P}(B \mid j, m) = \alpha \textbf{P}(B) \sum_{e} \prob{e} \sum_{a} f_A(a, b, e) f_J(a) f_M(a) \]
Note that $j$ and $m$ are not parameters of the factors $f_J$ and $f_M$ because they are already given.
We then sum out on $A$:
\[ \textbf{P}(B \mid j, m) = \alpha \textbf{P}(B) \sum_{e} \prob{e} f_{AJM}(b, e) \]
Now, we repeat the same process and sum out $E$:
\[ \textbf{P}(B \mid j, m) = \alpha \textbf{P}(B) f_{EAJM}(b) \]
At last, we factor $\textbf{P}(B)$:
\[ \textbf{P}(B \mid j, m) = \alpha f_B(b) f_{EAJM}(b) \]
\end{example}
\subsection{Irrelevant variables}
\marginnote{Irrelevant variables}
A variable $X$ is irrelevant if summing over it results in a probability of $1$.
\begin{theorem}
Given a query $X$, the evidence $\matr{E}$ and a variable $Y$:
\[ Y \notin (\texttt{Ancestors($\{ X \}$)} \cup \texttt{Ancestors($\matr{E}$)}) \rightarrow Y \text{ is irrelevant} \]
\end{theorem}
\begin{theorem}
Given a query $X$, the evidence $\matr{E}$ and a variable $Y$:
\[ Y \text{ d-separated from } X \text{ by } \matr{E} \rightarrow Y \text{ is irrelevant} \]
\end{theorem}
\subsection{Complexity}
\begin{description}
\item[Singly connected networks]
Network where any two nodes are connected with at most one undirected path.
Time and space complexity is $O(d^k n)$.
\item[Multiply connected networks] The problem is NP-hard.
\end{description}
\section{Clustering algorithm}
\marginnote{Clustering algorithm}
Method that joins individual nodes to form clusters.
It allows estimating the posterior probabilities of $n$ variables with complexity $O(n)$.

View File

@ -0,0 +1,48 @@
\chapter{Introduction}
\section{Uncertainty}
\begin{description}
\item[Uncertainty] \marginnote{Uncertainty}
A task is uncertain if it has:
\begin{itemize}
\item Partial observations
\item Noisy or wrong information
\item Uncertain outcomes of the actions
\item Complex models
\end{itemize}
A purely logic approach leads to:
\begin{itemize}
\item Risks falsehood: unreasonable conclusion when applied in practice.
\item Weak decisions: too many conditions required to make a conclusion.
\end{itemize}
\end{description}
\subsection{Handling uncertainty}
\begin{descriptionlist}
\item[Default/non-monotonic logic] \marginnote{Default/non-monotonic logic}
Works on assumptions.
An assumption can be contradicted by the evidence.
\item[Rule-based systems with fudge factors] \marginnote{Rule-based systems with fudge factors}
Formulated as premise $\rightarrow_\text{prob.}$ effect.
Have the following issues:
\begin{itemize}
            \item Locality: how can the probability account for all the evidence?
\item Combination: chaining of unrelated concepts.
\end{itemize}
\item[Probability] \marginnote{Probability}
Assign a probability given the available known evidence.
Note: fuzzy logic handles the degree of truth and not the uncertainty.
\end{descriptionlist}
\begin{description}
\item[Decision theory] \marginnote{Decision theory}
Defined as:
\[ \text{Decision theory} = \text{Utility theory} + \text{Probability theory} \]
where the utility theory depends on one's preferences.
\end{description}

View File

@ -0,0 +1,236 @@
\chapter{Probability}
\begin{description}
\item[Sample space] \marginnote{Sample space}
Set $\Omega$ of all possible worlds.
\begin{descriptionlist}
\item[Event] \marginnote{Event}
Subset $A \subseteq \Omega$.
\item[Sample point/Possible world/Atomic event] \marginnote{Sample point}
Element $\omega \in \Omega$.
\end{descriptionlist}
\item[Probability space] \marginnote{Probability space}
A probability space/model is a function $\prob{\cdot}: \Omega \rightarrow [0, 1]$ assigned to a sample space such that:
\begin{itemize}
\item $0 \leq \prob{\omega} \leq 1$
\item $\sum_{\omega \in \Omega} \prob{\omega} = 1$
\item $\prob{A} = \sum_{\omega \in A} \prob{\omega}$
\end{itemize}
\item[Random variable] \marginnote{Random variable}
A function from an event to some range (e.g. reals, booleans, \dots).
\item[Probability distribution] \marginnote{Probability distribution}
For any random variable $X$:
\[ \prob{X = x_i} = \sum_{\omega \text{ s.t. } X(\omega)=x_i} \prob{\omega} \]
\item[Proposition] \marginnote{Proposition}
Event where a random variable has a certain value.
\[ a = \{ \omega \,\vert\, A(\omega) = \texttt{true} \} \]
\[ \lnot a = \{ \omega \,\vert\, A(\omega) = \texttt{false} \} \]
        \[ (\texttt{Weather} = \texttt{rain}) = \{ \omega \,\vert\, \texttt{Weather}(\omega) = \texttt{rain} \} \]
\item[Prior probability] \marginnote{Prior probability}
Prior/unconditional probability of a proposition based on known evidence.
\item[Probability distribution (all)] \marginnote{Probability distribution (all)}
Gives all the probabilities of a random variable.
\[ \textbf{P}(A) = \langle \prob{A=a_1}, \dots, \prob{A=a_n} \rangle \]
\item[Joint probability distribution] \marginnote{Joint probability distribution}
The joint probability distribution of a set of random variables gives
the probability of all the different combinations of their atomic events.
Note: Every question on a domain can, in theory, be answered using the joint distribution.
In practice, it is hard to apply.
\begin{example}
$\textbf{P}(\texttt{Weather}, \texttt{Cavity}) = $
\begin{center}
\small
\begin{tabular}{|c | c|c|c|c|}
\cline{2-5}
\multicolumn{1}{c|}{} & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\
\hline
\texttt{Cavity=true} & 0.144 & 0.02 & 0.016 & 0.02 \\
\hline
\texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08 \\
\hline
\end{tabular}
\end{center}
\end{example}
\item[Probability density function] \marginnote{Probability density function}
The probability density function (PDF) of a random variable $X$ is a function $p: \mathbb{R} \rightarrow \mathbb{R}$
such that:
\[ \int_{\mathcal{T}_X} p(x) \,dx = 1 \]
\begin{descriptionlist}
\item[Uniform distribution] \marginnote{Uniform distribution}
\[
p(x) = \text{Unif}[a, b](x) =
\begin{cases}
\frac{1}{b-a} & a \leq x \leq b \\
0 & \text{otherwise}
\end{cases}
\]
\item[Gaussian (normal) distribution] \marginnote{Gaussian (normal) distribution}
\[ \mathcal{N}(\mu, \sigma^2) = \frac{1}{\sigma\sqrt{2\pi}}e^{\frac{-(x-\mu)^2}{2\sigma^2}} \]
$\mathcal{N}(0, 1)$ is the standard Gaussian.
\end{descriptionlist}
\item[Conditional probability] \marginnote{Conditional probability}
Probability of a prior knowledge with new evidence:
\[ \prob{a \vert b} = \frac{\prob{a \land b}}{\prob{b}} \]
The product rule gives an alternative formulation:
        \[ \prob{a \land b} = \prob{a \vert b}\prob{b} = \prob{b \vert a}\prob{a} \]
\begin{description}
\item[Chain rule] \marginnote{Chain rule}
Successive application of the product rule:
\[
\begin{split}
\textbf{P}(X_1, \dots, X_n) &= \textbf{P}(X_1, \dots, X_{n-1}) \textbf{P}(X_n \vert X_1, \dots, X_{n-1}) \\
&= \textbf{P}(X_1, \dots, X_{n-2}) \textbf{P}(X_{n-1} \vert X_1, \dots, X_{n-2}) \textbf{P}(X_n \vert X_1, \dots, X_{n-1}) \\
&= \prod_{i=1}^{n} \textbf{P}(X_i \vert X_1, \dots, X_{i-1})
\end{split}
\]
\end{description}
\item[Independence] \marginnote{Independence}
Two random variables $A$ and $B$ are independent ($A \perp B$) iff:
\[
\textbf{P}(A \vert B) = \textbf{P}(A) \,\text{ or }\,
\textbf{P}(B \vert A) = \textbf{P}(B) \,\text{ or }\,
\textbf{P}(A, B) = \textbf{P}(A)\textbf{P}(B)
\]
\item[Conditional independence] \marginnote{Conditional independence}
Two random variables $A$ and $B$ are conditionally independent iff:
\[ \textbf{P}(A \,\vert\, C, B) = \textbf{P}(A \,\vert\, C) \]
\end{description}
\section{Inference with full joint distributions}
Given a joint distribution, the probability of any proposition $\phi$
can be computed as the sum of the atomic events where $\phi$ is true:
\[ \prob{\phi} = \sum_{\omega:\, \omega \models \phi} \prob{\omega} \]
\begin{example}
Given the following joint distribution:
\begin{center}
\begin{tabular}{|c|c|c|c|c|}
\cline{2-5}
\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\texttt{toothache}} & \multicolumn{2}{c|}{$\lnot$\texttt{toothache}} \\
\cline{2-5}
\multicolumn{1}{c|}{} & \texttt{catch} & $\lnot$\texttt{catch} & \texttt{catch} & $\lnot$\texttt{catch} \\
\hline
\texttt{cavity} & 0.108 & 0.012 & 0.072 & 0.008 \\
\hline
$\lnot$\texttt{cavity} & 0.016 & 0.064 & 0.144 & 0.576 \\
\hline
\end{tabular}
\end{center}
We have that:
\begin{itemize}
\item $\prob{\texttt{toothache}} = 0.108 + 0.012 + 0.016 + 0.064 = 0.2$
\item $\prob{\texttt{cavity} \vee \texttt{toothache}} = 0.108 + 0.012 + 0.072 + 0.008 + 0.016 + 0.064 = 0.28$
\item $\prob{\lnot\texttt{cavity} \,\vert\, \texttt{toothache}} = \frac{\prob{\lnot\texttt{cavity} \land \texttt{toothache}}}{\prob{\texttt{toothache}}} =
\frac{0.016 + 0.064}{0.2} = 0.4$
\end{itemize}
\end{example}
\begin{description}
\item[Marginalization] \marginnote{Marginalization}
The probability that a random variable assumes a specific value is given by
        the sum of all the joint probabilities where that random variable assumes the given value.
\begin{example}
Given the joint distribution:
\begin{center}
\small
\begin{tabular}{|c | c|c|c|c|}
\cline{2-5}
\multicolumn{1}{c|}{} & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\
\hline
\texttt{Cavity=true} & 0.144 & 0.02 & 0.016 & 0.02 \\
\hline
\texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08 \\
\hline
\end{tabular}
\end{center}
We have that $\prob{\texttt{Weather}=\texttt{sunny}} = 0.144 + 0.576$
\end{example}
\item[Conditioning] \marginnote{Conditioning}
Adding a condition to a probability (reduction and renormalization).
\item[Normalization] \marginnote{Normalization}
Given a conditional probability distribution $\textbf{P}(A \vert B)$,
it can be formulated as:
\[ \textbf{P}(A \vert B) = \alpha\textbf{P}(A, B) \]
where $\alpha$ is a normalization constant.
In fact, fixed the evidence $B$, the denominator to compute the conditional probability is the same for each probability.
\begin{example}
Given the joint distribution:
\begin{center}
\begin{tabular}{|c|c|c|c|c|}
\cline{2-5}
\multicolumn{1}{c|}{} & \multicolumn{2}{c|}{\texttt{toothache}} & \multicolumn{2}{c|}{$\lnot$\texttt{toothache}} \\
\cline{2-5}
\multicolumn{1}{c|}{} & \texttt{catch} & $\lnot$\texttt{catch} & \texttt{catch} & $\lnot$\texttt{catch} \\
\hline
\texttt{cavity} & 0.108 & 0.012 & 0.072 & 0.008 \\
\hline
$\lnot$\texttt{cavity} & 0.016 & 0.064 & 0.144 & 0.576 \\
\hline
\end{tabular}
\end{center}
We have that:
            \[
                \textbf{P}(\texttt{Cavity} \vert \texttt{toothache}) =
                \langle
                \frac{\prob{\texttt{cavity}, \texttt{toothache}}}{\prob{\texttt{toothache}}},
                \frac{\prob{\lnot\texttt{cavity}, \texttt{toothache}}}{\prob{\texttt{toothache}}}
                \rangle
            \]
\end{example}
\item[Probability query] \marginnote{Probability query}
Given a set of query variables $\bm{Y}$, the evidence variables $\vec{e}$ and the other hidden variables $\bm{H}$,
the probability of the query can be computed as:
\[
\textbf{P}(\bm{Y} \vert \bm{E}=\vec{e}) = \alpha \textbf{P}(\bm{Y}, \bm{E}=\vec{e})
= \alpha \sum_{\vec{h}} \textbf{P}(\bm{Y}, \bm{E}=\vec{e}, \bm{H}=\vec{h})
\]
The problem of this approach is that it has exponential time and space complexity
that makes it not applicable in practice.
To reduce the size of the variables, conditional independence can be exploited.
\begin{example}
Knowing that $\textbf{P} \models (\texttt{Catch} \perp \texttt{Toothache} \vert \texttt{Cavity})$,
we can compute the distribution $\textbf{P}(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity})$ as follows:
\[
\begin{split}
\textbf{P}&(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity}) = \\
&= \textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Catch}, \texttt{Cavity})
\textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity}) \\
&= \textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Cavity})
\textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity})
\end{split}
\]
            $\textbf{P}(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity})$ has 7 independent values, a number that grows exponentially
            ($2 \cdot 2 \cdot 2 = 8$ values, but one of them can be omitted as a probability distribution always sums up to 1).
$\textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity})$
            has 5 independent values, a number that grows linearly ($4 + 4 + 2 = 10$, but a value of $\textbf{P}(\texttt{Cavity})$ can be omitted.
The conditional probabilities require two tables (one for each prior) each with 2 values,
but for each table a value can be omitted, therefore requiring $2$ independent values per conditional probability instead of $4$).
\end{example}
\end{description}