diff --git a/src/languages-and-algorithms-for-ai/module3/img/_learning_algorithm.pdf b/src/languages-and-algorithms-for-ai/module3/img/_learning_algorithm.pdf
new file mode 100644
index 0000000..2dbec23
Binary files /dev/null and b/src/languages-and-algorithms-for-ai/module3/img/_learning_algorithm.pdf differ
diff --git a/src/languages-and-algorithms-for-ai/module3/img/_learning_model.pdf b/src/languages-and-algorithms-for-ai/module3/img/_learning_model.pdf
new file mode 100644
index 0000000..1ebf1b1
Binary files /dev/null and b/src/languages-and-algorithms-for-ai/module3/img/_learning_model.pdf differ
diff --git a/src/languages-and-algorithms-for-ai/module3/img/_learning_rectangle.pdf b/src/languages-and-algorithms-for-ai/module3/img/_learning_rectangle.pdf
new file mode 100644
index 0000000..fdd0183
Binary files /dev/null and b/src/languages-and-algorithms-for-ai/module3/img/_learning_rectangle.pdf differ
diff --git a/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space.pdf b/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space.pdf
new file mode 100644
index 0000000..6b61b69
Binary files /dev/null and b/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space.pdf differ
diff --git a/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space2.pdf b/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space2.pdf
new file mode 100644
index 0000000..85c203b
Binary files /dev/null and b/src/languages-and-algorithms-for-ai/module3/img/_rectangle_space2.pdf differ
diff --git a/src/languages-and-algorithms-for-ai/module3/img/learning_model.drawio b/src/languages-and-algorithms-for-ai/module3/img/learning_model.drawio
new file mode 100644
index 0000000..68a2348
--- /dev/null
+++ b/src/languages-and-algorithms-for-ai/module3/img/learning_model.drawio
@@ -0,0 +1,74 @@
[74 lines of draw.io XML omitted]
diff --git a/src/languages-and-algorithms-for-ai/module3/img/rectangle_learning.drawio b/src/languages-and-algorithms-for-ai/module3/img/rectangle_learning.drawio
new file mode 100644
index 0000000..9571fa2
--- /dev/null
+++ b/src/languages-and-algorithms-for-ai/module3/img/rectangle_learning.drawio
@@ -0,0 +1,601 @@
[601 lines of draw.io XML omitted]
diff --git a/src/languages-and-algorithms-for-ai/module3/laai3.tex b/src/languages-and-algorithms-for-ai/module3/laai3.tex
index 1be586e..e7b5307 100644
--- a/src/languages-and-algorithms-for-ai/module3/laai3.tex
+++ b/src/languages-and-algorithms-for-ai/module3/laai3.tex
@@ -22,5 +22,6 @@
     \input{sections/_intro.tex}
     \input{sections/_turing.tex}
     \input{sections/_complexity.tex}
+    \input{sections/_computational_learning.tex}
 \end{document}
\ No newline at end of file
diff --git a/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex b/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex
new file mode 100644
index 0000000..d458ebf
--- /dev/null
+++ b/src/languages-and-algorithms-for-ai/module3/sections/_computational_learning.tex
@@ -0,0 +1,159 @@
\chapter{Computational learning theory}

\begin{description}
    \item[Instance space] \marginnote{Instance space}
    Set $X$ of (encoded) instances of the objects that a learner wants to classify.

    Data from the instance space is drawn from a distribution $\mathcal{D}$ unknown to the learner.

    \item[Concept] \marginnote{Concept}
    Subset $c \subseteq X$ of the instance space, which can be interpreted as a property of objects (i.e. a way to classify the instance space).

    \item[Concept class] \marginnote{Concept class}
    Collection $\mathcal{C} \subseteq \mathbb{P}(X)$ of concepts.

    It represents the concepts that are sufficiently simple for the algorithm to handle (i.e. the space of learnable concepts).

    \begin{description}
        \item[Target concept]
        Concept $c \in \mathcal{C}$ that the learner wants to learn.
    \end{description}

    \begin{remark}
        A learning algorithm is designed to learn concepts from a concept class
        while knowing neither the target concept nor the data distribution.
    \end{remark}

    \item[Learning algorithm] \marginnote{Learning algorithm}
    Given a concept class $\mathcal{C}$ and a target concept $c \in \mathcal{C}$ with unknown distribution $\mathcal{D}$,
    a learning algorithm $\mathcal{A}$ takes as input:
    \begin{itemize}
        \item $\varepsilon$, the error parameter (or accuracy, if seen as $(1-\varepsilon)$);
        \item $\delta$, the confidence parameter;
        \item $EX(c, \mathcal{D})$, an oracle that $\mathcal{A}$ can call to retrieve a data point $x \sim \mathcal{D}$
        together with a label indicating whether it belongs to the target concept $c$ (i.e. training data);
    \end{itemize}
    and outputs a concept $h \in \mathcal{C}$.
    \begin{center}
        \includegraphics[width=0.3\linewidth]{./img/_learning_algorithm.pdf}
    \end{center}
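    The interaction with the oracle can be sketched in Python as a minimal, illustrative model (the names \texttt{EX}, \texttt{Point} and \texttt{Concept}, as well as the example target and the uniform distribution, are assumptions of the sketch, not part of the formal definition):
\begin{verbatim}
import random
from dataclasses import dataclass
from typing import Callable, Tuple

Point = Tuple[float, float]          # instances encoded as points of X
Concept = Callable[[Point], bool]    # a concept as a membership test

@dataclass
class EX:
    """Oracle EX(c, D): each call draws x ~ D and labels it with c."""
    target: Concept                  # the (hidden) target concept c
    sample: Callable[[], Point]      # draws one data point from D

    def __call__(self) -> Tuple[Point, bool]:
        x = self.sample()
        return x, self.target(x)

# Example: c = upper-right quadrant, D = uniform on [0, 1]^2
oracle = EX(target=lambda p: p[0] > 0.5 and p[1] > 0.5,
            sample=lambda: (random.random(), random.random()))
x, label = oracle()                  # one labelled training point
\end{verbatim}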
    \begin{description}
        \item[Probability of error] \marginnote{Probability of error}
        Given a concept class $\mathcal{C}$,
        a target concept $c \in \mathcal{C}$ with unknown distribution $\mathcal{D}$ and
        a learning algorithm $\mathcal{A}$,
        the probability of error (i.e. of misclassification) of any output $h \in \mathcal{C}$ of $\mathcal{A}$ is defined as:
        \[ \text{error}_{\mathcal{D}, c}(h) = \mathcal{P}_{x \sim \mathcal{D}}[ h(x) \neq c(x) ] \]
        where concepts are identified with their indicator functions.
    \end{description}

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.35\linewidth]{./img/_learning_model.pdf}
        \caption{General idea of a learning algorithm $\mathcal{A}$ computed as a function $f_\mathcal{A}$}
    \end{figure}

    \item[PAC learnability] \marginnote{PAC learnability}
    A concept class $\mathcal{C}$ over the instance space $X$ is probably approximately correct (PAC) learnable iff there is an algorithm $\mathcal{A}$ such that:
    \begin{itemize}
        \item for each target concept $c \in \mathcal{C}$,
        \item for each distribution $\mathcal{D}$,
        \item for each error $0 < \varepsilon < \frac{1}{2}$,
        \item for each confidence $0 < \delta < \frac{1}{2}$,
    \end{itemize}
    it holds that:
    \[ \mathcal{P}\left[ \text{error}_{\mathcal{D}, c}\Big( \mathcal{A}\big( EX(c, \mathcal{D}), \varepsilon, \delta \big) \Big) < \varepsilon \right] > 1-\delta \]
    where the probability is computed over the data points sampled from $EX(c, \mathcal{D})$.

    In other words, the probability that $\mathcal{A}$ outputs a hypothesis with an error rate lower than $\varepsilon$ (or an accuracy higher than $(1-\varepsilon)$) is greater than $(1-\delta)$.

    \begin{description}
        \item[Efficient PAC learnability] \marginnote{Efficient PAC learnability}
        A concept class $\mathcal{C}$ is efficiently PAC learnable iff
        it is PAC learnable and the algorithm $\mathcal{A}$ that learns it has
        a time complexity bounded by a polynomial in $\frac{1}{\varepsilon}$ and $\frac{1}{\delta}$.

        \begin{remark}
            The complexity of $\mathcal{A}$ is measured taking into account the number of calls to $EX(c, \mathcal{D})$.
        \end{remark}
    \end{description}
\end{description}

\begin{example}[Axis-aligned rectangles in $\mathbb{R}^2_{[0, 1]}$]
    Consider the instance space $X = \mathbb{R}^2_{[0, 1]}$
    and the concept class $\mathcal{C}$ whose concepts are the sets of points contained within an axis-aligned rectangle of arbitrary size.

    \begin{figure}[H]
        \centering
        \includegraphics[width=0.2\linewidth]{./img/_learning_rectangle.pdf}
        \caption{Example of a problem instance. The gray rectangle is the target concept, red dots are positive data points and blue dots are negative data points.}
    \end{figure}

    An algorithm has to guess a classifier (i.e. a rectangle) knowing neither the target concept nor the distribution of the training data.
    Let the algorithm $\mathcal{A}_\text{BFP}$ be defined as follows (a code sketch is given below):
    \begin{itemize}
        \item Take as input some data $\{ ((x_1, y_1), p_1), \dots, ((x_n, y_n), p_n) \}$ where
        $(x_i, y_i)$ are the coordinates of a point and $p_i$ indicates whether the point is within the target rectangle.
        \item Return the smallest rectangle that includes all the positive instances.
    \end{itemize}

    Given the rectangle $R$ predicted by $\mathcal{A}_\text{BFP}$ and the target rectangle $T$,
    the probability of error in using $R$ in place of $T$ is:
    \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (R \smallsetminus T) \cup (T \smallsetminus R) ] \]
    In other words, a point is misclassified if it is in $R$ but not in $T$, or vice versa.
    \begin{remark}
        By definition of $\mathcal{A}_\text{BFP}$, it always holds that $R \subseteq T$.
        Therefore, $(R \smallsetminus T) = \varnothing$ and the error can be rewritten as:
        \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (T \smallsetminus R) ] \]
    \end{remark}
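    A minimal Python sketch of $\mathcal{A}_\text{BFP}$, under the assumption that the oracle is a function returning one labelled point per call (the names \texttt{bfp} and \texttt{ex} are illustrative):
\begin{verbatim}
import random

def bfp(oracle, m):
    # A_BFP: draw m labelled points and return the tightest
    # axis-aligned rectangle (x1, y1, x2, y2) around the positives.
    positives = [x for x, label in (oracle() for _ in range(m)) if label]
    if not positives:                # no positive example seen:
        return (0.0, 0.0, 0.0, 0.0)  # degenerate empty rectangle
    xs, ys = zip(*positives)
    return (min(xs), min(ys), max(xs), max(ys))

# Usage with target T = [0.2, 0.7] x [0.3, 0.9] and D uniform on [0, 1]^2:
T = (0.2, 0.3, 0.7, 0.9)
def ex():
    p = (random.random(), random.random())
    return p, (T[0] <= p[0] <= T[2] and T[1] <= p[1] <= T[3])

R = bfp(ex, 100)   # R is always contained in T and tightens as m grows
\end{verbatim}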
    \begin{theorem}[PAC learnability of axis-aligned rectangles in $\mathbb{R}^2_{[0, 1]}$]
        It holds that:
        \begin{itemize}
            \item for every distribution $\mathcal{D}$,
            \item for every error $0 < \varepsilon < \frac{1}{2}$,
            \item for every confidence $0 < \delta < \frac{1}{2}$,
        \end{itemize}
        if $m \geq \frac{4}{\varepsilon}\ln\left( \frac{4}{\delta} \right)$, then:
        \[
            \mathcal{P}_{D \sim \mathcal{D}^m}
            \left[ \text{error}_{\mathcal{D}, T}\Big( \mathcal{A}_\text{BFP}\big(T(D)\big) \Big) < \varepsilon \right] > 1 - \delta
        \]
        where $D \sim \mathcal{D}^m$ is a sample of $m$ data points (i.e. the training data)
        and $T(\cdot)$ labels the input data with respect to the target rectangle $T$.

        \begin{proof}
            By definition, the error of $\mathcal{A}_\text{BFP}$ is:
            \[ \text{error}_{\mathcal{D}, T}(R) = \mathcal{P}_{x \sim \mathcal{D}} [ x \in (T \smallsetminus R) ] \]

            Consider the space defined by $(T \smallsetminus R)$ divided into four sections $E_1 \cup \dots \cup E_4 = (T \smallsetminus R)$:
            \begin{figure}[H]
                \centering
                \includegraphics[width=0.4\linewidth]{./img/_rectangle_space.pdf}
            \end{figure}

            Consider the probabilistic event ``$x \in E_i$''.
            No training point $x \sim \mathcal{D}$ can satisfy it: if a training point had fallen in $E_i$, $\mathcal{A}_\text{BFP}$ would have returned a bigger rectangle $R$ including it, and $E_i$ would be smaller.

            Now consider four other regions $F_1, \dots, F_4$ of the plane, related to the $E_i$ but defined in such a way that $\mathcal{P}_{x \sim \mathcal{D}}[x \in F_i] = \frac{\varepsilon}{4}$.
            This can be achieved by expanding each region $E_i$ to take some area of the rectangle $R$.
            \begin{figure}[H]
                \centering
                \includegraphics[width=0.4\linewidth]{./img/_rectangle_space2.pdf}
            \end{figure}

            Then, as each $E_i$ is smaller than the corresponding $F_i$, by a union bound over the four regions it holds that:
            \[
                \begin{split}
                    \forall i.\ \mathcal{P}_{x \sim \mathcal{D}}[x \in E_i] < \frac{\varepsilon}{4} &\Rightarrow \mathcal{P}_{x \sim \mathcal{D}}[x \in (T \smallsetminus R)] < \varepsilon \\
                    &\Rightarrow \text{error}_{\mathcal{D}, T}(R) < \varepsilon
                \end{split}
            \]

            \textit{To be continued\dots}
        \end{proof}
    \end{theorem}
\end{example}
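The sample-size bound can also be checked empirically (a sanity check, not a substitute for the proof). The following self-contained sketch assumes $\mathcal{D}$ uniform on $[0,1]^2$ and an arbitrarily chosen target rectangle, and estimates how often the error of $\mathcal{A}_\text{BFP}$ stays below $\varepsilon$:
\begin{verbatim}
import math, random

T = (0.2, 0.3, 0.7, 0.9)   # arbitrarily chosen target rectangle
in_T = lambda p: T[0] <= p[0] <= T[2] and T[1] <= p[1] <= T[3]

def trial(m, n_test=20000):
    # Train A_BFP on m uniform samples, then estimate
    # error(R) = P[x in T minus R] on fresh test points.
    pos = [p for p in ((random.random(), random.random())
                       for _ in range(m)) if in_T(p)]
    if not pos:                      # degenerate R: error = P[x in T]
        return sum(in_T((random.random(), random.random()))
                   for _ in range(n_test)) / n_test
    xs, ys = zip(*pos)
    R = (min(xs), min(ys), max(xs), max(ys))
    in_R = lambda p: R[0] <= p[0] <= R[2] and R[1] <= p[1] <= R[3]
    test = ((random.random(), random.random()) for _ in range(n_test))
    return sum(in_T(p) and not in_R(p) for p in test) / n_test

eps = delta = 0.1
m = math.ceil(4 / eps * math.log(4 / delta))    # theorem's sample size
good = sum(trial(m) < eps for _ in range(200)) / 200
print(f"m = {m}, estimated P[error < eps] = {good}")  # expect > 1 - delta
\end{verbatim}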