diff --git a/src/ainotes.cls b/src/ainotes.cls index d350d25..5d97ee4 100644 --- a/src/ainotes.cls +++ b/src/ainotes.cls @@ -6,7 +6,7 @@ \usepackage{geometry} \usepackage{graphicx, xcolor} -\usepackage{amsmath, amsfonts, amssymb, amsthm, mathtools, bm, upgreek} +\usepackage{amsmath, amsfonts, amssymb, amsthm, mathtools, bm, upgreek, cancel} \usepackage{hyperref} \usepackage[nameinlink]{cleveref} \usepackage[all]{hypcap} % Links hyperref to object top and not caption diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_causal_example.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_causal_example.pdf new file mode 100644 index 0000000..135e68d Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_causal_example.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_cpt_graph.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_cpt_graph.pdf new file mode 100644 index 0000000..c964452 Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_cpt_graph.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_evidential_example.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_evidential_example.pdf new file mode 100644 index 0000000..87f4bb2 Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_evidential_example.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_explainaway_example.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_explainaway_example.pdf new file mode 100644 index 0000000..4d5e0e3 Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_explainaway_example.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_global_semantics_example.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_global_semantics_example.pdf new file mode 100644 index 0000000..97ccf0a Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_global_semantics_example.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/img/_independence_example.pdf b/src/fundamentals-of-ai-and-kr/module3/img/_independence_example.pdf new file mode 100644 index 0000000..a83357f Binary files /dev/null and b/src/fundamentals-of-ai-and-kr/module3/img/_independence_example.pdf differ diff --git a/src/fundamentals-of-ai-and-kr/module3/sections/_probability.tex b/src/fundamentals-of-ai-and-kr/module3/sections/_probability.tex index 735307d..7ba962e 100644 --- a/src/fundamentals-of-ai-and-kr/module3/sections/_probability.tex +++ b/src/fundamentals-of-ai-and-kr/module3/sections/_probability.tex @@ -49,11 +49,14 @@ $\textbf{P}(\texttt{Weather}, \texttt{Cavity}) = $ \begin{center} \small - \begin{tabular}{c | cccc} - & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\ + \begin{tabular}{|c | c|c|c|c|} + \cline{2-5} + \multicolumn{1}{c|}{} & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\ \hline \texttt{Cavity=true} & 0.144 & 0.02 & 0.016 & 0.02 \\ - \texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08 + \hline + \texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08 \\ + \hline \end{tabular} \end{center} \end{example} @@ -125,6 +128,7 @@ can be computed as the sum of the atomic events where $\phi$ is true: \multicolumn{1}{c|}{} & \texttt{catch} & $\lnot$\texttt{catch} & \texttt{catch} & $\lnot$\texttt{catch} \\ \hline \texttt{cavity} & 0.108 & 0.012 & 0.072 & 0.008 \\ + \hline $\lnot$\texttt{cavity} & 0.016 & 0.064 & 0.144 & 0.576 \\ \hline \end{tabular} @@ -147,11 +151,14 @@ can be computed as the sum of 
the atomic events where $\phi$ is true:
 	Given the joint distribution:
 	\begin{center}
 		\small
-		\begin{tabular}{c | cccc}
-			 & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\
+		\begin{tabular}{|c | c|c|c|c|}
+			\cline{2-5}
+			\multicolumn{1}{c|}{} & \texttt{Weather=sunny} & \texttt{Weather=rain} & \texttt{Weather=cloudy} & \texttt{Weather=snow} \\
 			\hline
 			\texttt{Cavity=true} & 0.144 & 0.02 & 0.016 & 0.02 \\
-			\texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08
+			\hline
+			\texttt{Cavity=false} & 0.576 & 0.08 & 0.064 & 0.08 \\
+			\hline
 		\end{tabular}
 	\end{center}
 	We have that $\prob{\texttt{Weather}=\texttt{sunny}} = 0.144 + 0.576$
@@ -176,6 +183,7 @@ can be computed as the sum of the atomic events where $\phi$ is true:
 		\multicolumn{1}{c|}{} & \texttt{catch} & $\lnot$\texttt{catch} & \texttt{catch} & $\lnot$\texttt{catch} \\
 		\hline
 		\texttt{cavity} & 0.108 & 0.012 & 0.072 & 0.008 \\
+		\hline
 		$\lnot$\texttt{cavity} & 0.016 & 0.064 & 0.144 & 0.576 \\
 		\hline
 	\end{tabular}
@@ -183,10 +191,10 @@ can be computed as the sum of the atomic events where $\phi$ is true:
 	We have that:
 	\[
-		\textbf{P}(\texttt{cavity} \vert \texttt{toothache}) =
-		\langle
-			\frac{\prob{\texttt{cavity}, \texttt{toothache}, \texttt{catch}}}{\prob{\texttt{toothache}}},
-			\frac{\prob{\texttt{cavity}, \texttt{toothache}, \lnot\texttt{catch}}}{\prob{\texttt{toothache}}}
-		\rangle
+		\textbf{P}(\texttt{Cavity} \vert \texttt{toothache}) =
+		\langle
+			\frac{\prob{\texttt{cavity}, \texttt{toothache}, \texttt{catch}} + \prob{\texttt{cavity}, \texttt{toothache}, \lnot\texttt{catch}}}{\prob{\texttt{toothache}}},
+			\frac{\prob{\lnot\texttt{cavity}, \texttt{toothache}, \texttt{catch}} + \prob{\lnot\texttt{cavity}, \texttt{toothache}, \lnot\texttt{catch}}}{\prob{\texttt{toothache}}}
+		\rangle
 	\]
 \end{example}
@@ -195,9 +203,173 @@ can be computed as the sum of the atomic events where $\phi$ is true:
 	Given a set of query variables $\bm{Y}$, the evidence variables $\vec{e}$ and the other hidden variables $\bm{H}$,
 	the probability of the query can be computed as:
 	\[
-		\textbf{P}(\bm{Y} \vert \bm{E}=\vec{e}) = \alpha \textbf{P}(\bm{Y} \vert \bm{E}=\vec{e})
-		= \alpha \sum_{\vec{h}} \textbf{P}(\bm{Y} \vert \bm{E}=\vec{e}, \bm{H}=\vec{h})
+		\textbf{P}(\bm{Y} \vert \bm{E}=\vec{e}) = \alpha \textbf{P}(\bm{Y}, \bm{E}=\vec{e})
+		= \alpha \sum_{\vec{h}} \textbf{P}(\bm{Y}, \bm{E}=\vec{e}, \bm{H}=\vec{h})
 	\]
 	The problem of this approach is that it has exponential time and space complexity
-	which makes it not applicable in practice.
+	and is therefore inapplicable in practice.
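+
+	\begin{example}
+		As a worked instance of the formula above (the numbers are taken from the full joint
+		distribution of \texttt{Toothache}, \texttt{Catch} and \texttt{Cavity} given earlier),
+		take query $\texttt{Cavity}$, evidence $\texttt{toothache}$ and hidden variable $\texttt{Catch}$:
+		\[
+			\begin{split}
+				\textbf{P}&(\texttt{Cavity} \,\vert\, \texttt{toothache}) \\
+				&= \alpha \langle 0.108 + 0.012, 0.016 + 0.064 \rangle
+				= \alpha \langle 0.12, 0.08 \rangle
+				= \langle 0.6, 0.4 \rangle
+			\end{split}
+		\]
+		where $\alpha = \frac{1}{0.12 + 0.08} = 5$.
+	\end{example}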
+
+	To reduce the number of independent values needed to represent the distribution,
+	conditional independence can be exploited.
+	\begin{example}
+		Knowing that $\textbf{P} \models (\texttt{Catch} \perp \texttt{Toothache} \vert \texttt{Cavity})$,
+		we can compute the distribution $\textbf{P}(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity})$ as follows:
+		\[
+			\begin{split}
+				\textbf{P}&(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity}) \\
+				&= \textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Catch}, \texttt{Cavity})
+				\textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity}) \\
+				&= \textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Cavity})
+				\textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity})
+			\end{split}
+		\]
+		$\textbf{P}(\texttt{Toothache}, \texttt{Catch}, \texttt{Cavity})$ has $7$ independent values,
+		a number that grows exponentially with the number of variables
+		($2 \cdot 2 \cdot 2 = 8$ entries, but one of them can be omitted as the probabilities must sum up to $1$).
+
+		$\textbf{P}(\texttt{Toothache} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Catch} \,\vert\, \texttt{Cavity}) \textbf{P}(\texttt{Cavity})$
+		has $5$ independent values, a number that grows linearly with the number of variables
+		($4 + 4 + 2 = 10$ entries in total: one value of $\textbf{P}(\texttt{Cavity})$ can be omitted and,
+		in each conditional table, one value can be omitted for each value of $\texttt{Cavity}$,
+		leaving $2$ independent values per conditional distribution instead of $4$).
+	\end{example}
 \end{description}
+
+
+
+\section{Bayesian networks}
+
+\begin{description}
+	\item[Bayes' rule] \marginnote{Bayes' rule}
+	\[ \prob{a \,\vert\, b} = \frac{\prob{b \,\vert\, a} \prob{a}}{\prob{b}} \]
+
+	\item[Bayes' rule and conditional independence]
+	Given the random variables $\texttt{Cause}$ and\\
+	$\texttt{Effect}_1, \dots, \texttt{Effect}_n$, with the $\texttt{Effect}_i$ conditionally independent
+	of each other given $\texttt{Cause}$,
+	we can compute $\textbf{P}(\texttt{Cause}, \texttt{Effect}_1, \dots, \texttt{Effect}_n)$ as follows:
+	\[
+		\textbf{P}(\texttt{Cause}, \texttt{Effect}_1, \dots, \texttt{Effect}_n) =
+		\left(\prod_i \textbf{P}(\texttt{Effect}_i \,\vert\, \texttt{Cause})\right) \textbf{P}(\texttt{Cause})
+	\]
+	The number of parameters is linear in the number of effects.
+
+	\begin{example}
+		Knowing that $\textbf{P} \models (\texttt{Catch} \perp \texttt{Toothache} \vert \texttt{Cavity})$:
+		\[
+			\begin{split}
+				\textbf{P}&(\texttt{Cavity} \,\vert\, \texttt{toothache} \land \texttt{catch}) \\
+				&= \alpha\textbf{P}(\texttt{toothache} \land \texttt{catch} \,\vert\, \texttt{Cavity})\textbf{P}(\texttt{Cavity}) \\
+				&= \alpha\textbf{P}(\texttt{toothache} \,\vert\, \texttt{Cavity})
+				\textbf{P}(\texttt{catch} \,\vert\, \texttt{Cavity})\textbf{P}(\texttt{Cavity})
+			\end{split}
+		\]
+	\end{example}
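+
+	Concretely, generalizing the counting of the previous example: for $n$ boolean effects,
+	the factored form requires $2n + 1$ independent values
+	($2$ per conditional distribution plus $1$ for $\textbf{P}(\texttt{Cause})$),
+	instead of the $2^{n+1} - 1$ independent values of the full joint distribution
+	(for $n = 2$: $5$ versus $7$, as computed earlier).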
+	\item[Bayesian network] \marginnote{Bayesian network}
+	Graph that encodes conditional independence assertions and gives a compact specification
+	of the full joint distribution.
+	\begin{itemize}
+		\item Directed acyclic graph.
+		\item Nodes represent random variables.
+		\item Each node has a conditional distribution given its parents
+		\[ \textbf{P}(X_i \,\vert\, \texttt{parents}(X_i)) \]
+		In other words, if there is an edge from $A$ to $B$, then $A$ (cause) influences $B$ (effect).
+	\end{itemize}

+	\begin{description}
+		\item[Conditional probability table (CPT)] \marginnote{Conditional probability table (CPT)}
+		In the case of boolean variables, the conditional distribution of a node can be represented
+		as a table with one row for each combination of values of its parents
+		(a node with $k$ boolean parents therefore has a CPT with $2^k$ rows).
+
+		\begin{example}
+			Given the boolean variables $A$, $B$ and $C$, with $C$ depending on $A$ and $B$, we have that:\\
+			\begin{minipage}{.48\linewidth}
+				\centering
+				\includegraphics[width=0.35\linewidth]{img/_cpt_graph.pdf}
+			\end{minipage}
+			\begin{minipage}{.48\linewidth}
+				\centering
+				\begin{tabular}{c|c|c|c}
+					A & B & $\prob{c \vert A, B}$ & $\prob{\lnot c \vert A, B}$ \\
+					\hline
+					a & b & $\alpha$ & $1-\alpha$ \\
+					$\lnot$a & b & $\beta$ & $1-\beta$ \\
+					a & $\lnot$b & $\gamma$ & $1-\gamma$ \\
+					$\lnot$a & $\lnot$b & $\delta$ & $1-\delta$ \\
+				\end{tabular}
+			\end{minipage}
+		\end{example}
+	\end{description}
+
+	\item[Reasoning patterns] \marginnote{Reasoning patterns}
+	Given a Bayesian network, the following reasoning patterns can be used:
+	\begin{descriptionlist}
+		\item[Causal] \marginnote{Causal reasoning}
+		To make a prediction. From the cause, derive the effect.
+		\begin{example}
+			Knowing $\texttt{Intelligence}$, it is possible to predict $\texttt{Letter}$.
+			\begin{center}
+				\includegraphics[width=0.5\linewidth]{img/_causal_example.pdf}
+			\end{center}
+		\end{example}
+
+		\item[Evidential] \marginnote{Evidential reasoning}
+		To find an explanation. From the effect, derive the cause.
+		\begin{example}
+			Knowing $\texttt{Grade}$, it is possible to explain it by estimating\\$\texttt{Intelligence}$.
+			\begin{center}
+				\includegraphics[width=0.65\linewidth]{img/_evidential_example.pdf}
+			\end{center}
+		\end{example}
+
+		\item[Explain away] \marginnote{Explain away reasoning}
+		Reasoning between the causes of a common observed effect:
+		evidence about one cause changes the belief in the other.
+		\begin{example}
+			Knowing $\texttt{Difficulty}$ and $\texttt{Grade}$,
+			it is possible to estimate \\$\texttt{Intelligence}$.
+
+			Note that if $\texttt{Grade}$ were not known,
+			$\texttt{Difficulty}$ and $\texttt{Intelligence}$ would be independent.
+			\begin{center}
+				\includegraphics[width=0.70\linewidth]{img/_explainaway_example.pdf}
+			\end{center}
+		\end{example}
+	\end{descriptionlist}
+
+	\item[Global semantics] \marginnote{Global semantics}
+	Given a Bayesian network, the full joint distribution can be defined as
+	the product of the local conditional distributions:
+	\[ \prob{x_1, \dots, x_n} = \prod_{i=1}^{n} \prob{x_i \,\vert\, \texttt{parents}(X_i)} \]
+
+	\begin{example}
+		Given the following Bayesian network:
+
+		\begin{minipage}{.3\linewidth}
+			\centering
+			\includegraphics[width=0.7\linewidth]{img/_global_semantics_example.pdf}
+		\end{minipage}
+		\begin{minipage}{.6\linewidth}
+			\[
+				\begin{split}
+					&\prob{j \land m \land a \land \lnot b \land \lnot e} \\
+					&= \prob{\lnot b} \prob{\lnot e} \prob{a \,\vert\, \lnot b, \lnot e}
+					\prob{j \,\vert\, a} \prob{m \,\vert\, a}
+				\end{split}
+			\]
+		\end{minipage}
+	\end{example}
+
+	\item[Independence] \marginnote{Bayesian network independence}
+	Intuitively, an effect is independent of a cause
+	if there is another variable in between whose value is already known.
+	More generally, each node is conditionally independent of its non-descendants given its parents.
+	\begin{example}
+		\phantom{}
+
+		\begin{minipage}{.3\linewidth}
+			\centering
+			\includegraphics[width=0.75\linewidth]{img/_independence_example.pdf}
+		\end{minipage}
+		\begin{minipage}{.6\linewidth}
+			\[ \textbf{P} \models (\texttt{L} \perp \texttt{D}, \texttt{I}, \texttt{S} \,\vert\, \texttt{G}) \]
+			\[ \textbf{P} \models (\texttt{S} \perp \texttt{L} \,\vert\, \texttt{G}) \]
+			\[ \textbf{P} \models (\texttt{S} \perp \texttt{D}) \text{ but }
+				\textbf{P} \models (\texttt{S} \,\cancel{\perp}\, \texttt{D} \,\vert\, \texttt{G}) \text{ (explain away)} \]
+		\end{minipage}
+	\end{example}
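+
+	\begin{example}
+		These assertions can be read off the factorization given by the global semantics.
+		Assuming the edges in the figure above are $\texttt{D} \rightarrow \texttt{G}$,
+		$\texttt{I} \rightarrow \texttt{G}$, $\texttt{I} \rightarrow \texttt{S}$ and
+		$\texttt{G} \rightarrow \texttt{L}$ (an assumption consistent with the stated independencies):
+		\[ \prob{d, i, g, s, l} = \prob{d} \prob{i} \prob{g \,\vert\, d, i} \prob{s \,\vert\, i} \prob{l \,\vert\, g} \]
+		$\texttt{L}$ appears only in the factor $\prob{l \,\vert\, g}$, i.e., it interacts with the
+		rest of the network only through $\texttt{G}$, which is why
+		$\textbf{P} \models (\texttt{L} \perp \texttt{D}, \texttt{I}, \texttt{S} \,\vert\, \texttt{G})$.
+	\end{example}
+\end{description}
\ No newline at end of file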