From 9dbb182edd08bcae67f56c77148e308e854eea3c Mon Sep 17 00:00:00 2001 From: NotXia <35894453+NotXia@users.noreply.github.com> Date: Mon, 15 Apr 2024 20:02:05 +0200 Subject: [PATCH] Add missing corollary and sections reorder --- .../sections/_computational_learning.tex | 141 +++++++++--------- 1 file changed, 74 insertions(+), 67 deletions(-) diff --git a/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex b/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex index d458ebf..cc3bb16 100644 --- a/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex +++ b/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex @@ -79,81 +79,88 @@ \end{description} \end{description} -\begin{example}[Axes-aligned rectangles in $\mathbb{R}^2_{[0, 1]}$] - Consider the instance space $X = \mathbb{R}^2_{[0, 1]}$ - and the concept class $\mathcal{C}$ of concepts represented by all the points contained within a rectangle parallel to the axes of arbitrary size. - \begin{figure}[H] - \centering - \includegraphics[width=0.2\linewidth]{./img/_learning_rectangle.pdf} - \caption{Example of problem instance. The gray rectangle is the target concept, red dots are positive data points and blue dots are negative data points.} - \end{figure} - An algorithm has to guess a classifier (i.e. a rectangle) without knowing the target concept and the distribution of its training data. - Let an algorithm $\mathcal{A}_\text{BFP}$ be defined as follows: +\section{Axes-aligned rectangles over $\mathbb{R}^2_{[0, 1]}$} + +Consider the instance space $X = \mathbb{R}^2_{[0, 1]}$ +and the concept class $\mathcal{C}$ of concepts represented by all the points contained within a rectangle parallel to the axes of arbitrary size. + +\begin{figure}[H] + \centering + \includegraphics[width=0.2\linewidth]{./img/_learning_rectangle.pdf} + \caption{Example of problem instance. 
The gray rectangle is the target concept, red dots are positive data points and blue dots are negative data points.} +\end{figure} + +An algorithm has to guess a classifier (i.e. a rectangle) without knowing the target concept and the distribution of its training data. +Let an algorithm $\mathcal{A}_\text{BFP}$ be defined as follows: +\begin{itemize} + \item Take as input some data $\{ ((x_1, y_1), p_1), \dots, ((x_n, y_n), p_n) \}$ where + $(x_i, y_i)$ are the coordinates of the point and $p_i$ indicates if the point is within the target rectangle. + \item Return the smallest rectangle that includes all the positive instances. +\end{itemize} + +Given the rectangle $R$ predicted by $\mathcal{A}_\text{BFP}$ and the target rectangle $T$, +the probability of error in using $R$ in place of $T$ is: +\[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (R \smallsetminus T) \cup (T \smallsetminus R) ] \] +In other words, a point is misclassified if it is in $R$ but not in $T$ or vice versa. +\begin{remark} + By definition of $\mathcal{A}_\text{BFP}$, it always holds that $R \subseteq T$. + Therefore, $(R \smallsetminus T) = \varnothing$ and the error can be rewritten as: + \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (T \smallsetminus R) ] \] +\end{remark} + + +\begin{theorem}[Axes-aligned rectangles over $\mathbb{R}^2_{[0, 1]}$ PAC learnability] + It holds that: \begin{itemize} - \item Take as input some data $\{ ((x_1, y_1), p_1), \dots, ((x_n, y_n), p_n) \}$ where - $(x_i, y_i)$ are the coordinates of the point and $p_i$ indicates if the point is within the target rectangle. - \item Return the smallest rectangle that includes all the positive instances. 
-    \end{itemize}
+        \item For every distribution $\mathcal{D}$,
+        \item For every error $0 < \varepsilon < \frac{1}{2}$,
+        \item For every confidence $0 < \delta < \frac{1}{2}$,
+    \end{itemize}
+    if $m \geq \frac{4}{\varepsilon}\ln\left( \frac{4}{\delta} \right)$, then:
+    \[
+        \mathcal{P}_{D \sim \mathcal{D}^m}
+        \left[ \text{error}_{\mathcal{D}, T}\Big( \mathcal{A}_\text{BFP}\big(T(D)\big) \Big) < \varepsilon \right] > 1 - \delta
+    \]
+    where $D \sim \mathcal{D}^m$ is a sample of $m$ data points (i.e. training data)
+    and $T(\cdot)$ labels the input data with respect to the target rectangle $T$.
 
-    Given the rectangle $R$ predicted by $\mathcal{A}_\text{BFP}$ and the target rectangle $T$,
-    the probability of error in using $R$ in place of $T$ is:
-    \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (R \smallsetminus T) \cup (T \smallsetminus R) ] \]
-    In other words, a point is misclassified if it is in $R$ but not in $T$ or vice versa.
-    \begin{remark}
-        By definition of $\mathcal{A}_\text{BFP}$, it always holds that $R \subseteq T$. 
- Therefore, $(R \smallsetminus T) = \varnothing$ and the error can be rewritten as: + \begin{proof} + By definition, the error of $\mathcal{A}_\text{BFP}$ is defined as: \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (T \smallsetminus R) ] \] - \end{remark} + Consider the space defined by $(T \smallsetminus R)$ divided in four sections $E_1 \cup \dots \cup E_4 = (T \smallsetminus R)$: + \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/_rectangle_space.pdf} + \end{figure} - \begin{theorem}[Axes-aligned rectangles in $\mathbb{R}^2_{[0, 1]}$ PAC learnability] - It holds that: - \begin{itemize} - \item For every distribution $\mathcal{D}$, - \item For every error $0 < \varepsilon < \frac{1}{2}$, - \item For every confidence $0 < \delta < \frac{1}{2}$, - \end{itemize} - if $m \geq \frac{4}{\varepsilon}\ln\left( \frac{4}{\delta} \right)$, then: + Consider the probabilistic event "$x \in E_i$". + For the training data $x \sim \mathcal{D}$ this holds iff none of those points + end up in $E_i$ as, if a training point is in $E_i$, $R$ would be bigger to include it and $E_i$ would be smaller. + + Now consider four other regions $F_1, \dots, F_4$ of the plane related to $E_i$ but defined differently + in such a way that $\mathcal{P}_{x \sim D}[x \in F_i] = \frac{\varepsilon}{4}$. + This can be achieved by expanding the $E_i$ regions to take some area of the rectangle $R$. 
+ \begin{figure}[H] + \centering + \includegraphics[width=0.4\linewidth]{./img/_rectangle_space2.pdf} + \end{figure} + + Then, as $E_i$ are smaller than $F_i$, it holds that: \[ - \mathcal{P}_{D \sim \mathcal{D}^m} - \left[ \text{error}_{\mathcal{D}, T}\Big( \mathcal{A}_\text{BFP}\big(T(D)\big) \Big) < \varepsilon \right] > 1 - \delta + \begin{split} + \mathcal{P}_{x \sim D}[x \in E_i] < \frac{\varepsilon}{4} &\Rightarrow \mathcal{P}_{x \sim D}[x \in (T \smallsetminus R)] < \varepsilon \\ + & \Rightarrow \text{error}_{\mathcal{D}, T}(R) < \varepsilon + \end{split} \] - where $D \sim \mathcal{D}^m$ is a sample of $m$ data points (i.e. training data) - and $T(\cdot)$ labels the input data wrt to the target rectangle $T$. + + \textit{To be continued\dots} + \end{proof} +\end{theorem} - \begin{proof} - By definition, the error of $\mathcal{A}_\text{BFP}$ is defined as: - \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (T \smallsetminus R) ] \] - Consider the space defined by $(T \smallsetminus R)$ divided in four sections $E_1 \cup \dots \cup E_4 = (T \smallsetminus R)$: - \begin{figure}[H] - \centering - \includegraphics[width=0.4\linewidth]{./img/_rectangle_space.pdf} - \end{figure} - - Consider the probabilistic event "$x \in E_i$". - For the training data $x \sim \mathcal{D}$ this holds iff none of those points - end up in $E_i$ as, if a training point is in $E_i$, $R$ would be bigger to include it and $E_i$ would be smaller. - - Now consider four other regions $F_1, \dots, F_4$ of the plane related to $E_i$ but defined differently - in such a way that $\mathcal{P}_{x \sim D}[x \in F_i] = \frac{\varepsilon}{4}$. - This can be achieved by expanding the $E_i$ regions to take some area of the rectangle $R$. 
-            \begin{figure}[H]
-                \centering
-                \includegraphics[width=0.4\linewidth]{./img/_rectangle_space2.pdf}
-            \end{figure}
-
-            Then, as $E_i$ are smaller than $F_i$, it holds that:
-            \[
-                \begin{split}
-                    \mathcal{P}_{x \sim D}[x \in E_i] < \frac{\varepsilon}{4} &\Rightarrow \mathcal{P}_{x \sim D}[x \in (T \smallsetminus R)] < \varepsilon \\
-                    & \Rightarrow \text{error}_{\mathcal{D}, T}(R) < \varepsilon
-                \end{split}
-            \]
-
-            \textit{To be continued\dots}
-        \end{proof}
-    \end{theorem}
-\end{example}
+\begin{corollary}
+    The concept class of axes-aligned rectangles over $\mathbb{R}^2_{[0, 1]}$ is efficiently PAC learnable.
+\end{corollary}
\ No newline at end of file