diff --git a/src/ainotes.cls b/src/ainotes.cls
index 7763c7f..166a2dd 100644
--- a/src/ainotes.cls
+++ b/src/ainotes.cls
@@ -21,6 +21,10 @@
 \usepackage{eurosym}
 \usepackage{bussproofs} % Deductive tree
 \usepackage{varwidth}
+\usepackage[most]{tcolorbox}
+\usepackage{tikz}
+\tcbuselibrary{breakable}
+\usetikzlibrary{decorations.pathmorphing,calc}
 \geometry{ margin=3cm, lmargin=1.5cm, rmargin=4.5cm, marginparwidth=3cm }
 \hypersetup{ colorlinks, citecolor=black, filecolor=black, linkcolor=black, urlcolor=black, linktoc=all }
@@ -48,6 +52,7 @@
 \lstset{style=mystyle}
 \lstset{language=Python}
+
 \NewDocumentEnvironment{descriptionlist}{}{%
     \begin{description}[labelindent=1em]
 }{
@@ -57,15 +62,38 @@
 \renewcommand*{\marginfont}{\color{gray}\footnotesize}
 \renewcommand*\chapterpagestyle{scrheadings} % Header in chapter pages
+
 \theoremstyle{definition}
 \newtheorem{theorem}{Theorem}[section]
 \newtheorem{corollary}{Corollary}[theorem]
 \newtheorem{lemma}[theorem]{Lemma}
-\newtheorem*{example}{Example}
+\newtheorem*{privateexample}{Example}
 \theoremstyle{definition}
 \newtheorem*{definition}{Def}
 \newtheorem*{remark}{Remark}
+\newtcolorbox{marginbar}[3]{ % #1: color | #2: (number of lines - 1) | #3: line thickness
+    enhanced, blank, breakable,
+    overlay = {
+        \foreach \t in {0,...,#2}{
+            \draw[decorate, #3, #1]
+                ([xshift=-3-\t mm]frame.north west)
+                --
+                ([xshift=-3-\t mm]frame.south west);
+        },
+    }
+}
+
+\newenvironment{example}{%
+    \begin{marginbar}{lightgray}{0}{thick}
+    \begin{privateexample}
+}{%
+    \end{privateexample}
+    \end{marginbar}
+}
+
+
 \newcommand{\ubar}[1]{\text{\b{$#1$}}}
 \renewcommand{\vec}[1]{{\bm{\mathbf{#1}}}}
 \newcommand{\nullvec}[0]{\bar{\vec{0}}}
diff --git a/src/year1/cognition-and-neuroscience/module1/cn1.tex b/src/year1/cognition-and-neuroscience/module1/cn1.tex
index ba922dd..a39c692 100644
--- a/src/year1/cognition-and-neuroscience/module1/cn1.tex
+++ b/src/year1/cognition-and-neuroscience/module1/cn1.tex
@@ -18,7 +18,14 @@
 \DeclareAcronym{cs}{short=CS, long=conditioned stimulus}
 \DeclareAcronym{cr}{short=CR, long=conditioned response}
-\newtheorem*{casestudy}{Case study}
+\newtheorem*{privatecasestudy}{Case study}
+\newenvironment{casestudy}{%
+    \begin{marginbar}{olive}{0}{thick}
+    \begin{privatecasestudy}
+}{%
+    \end{privatecasestudy}
+    \end{marginbar}
+}
 \begin{document}
diff --git a/src/year1/cognition-and-neuroscience/module1/sections/_nervous_system.tex b/src/year1/cognition-and-neuroscience/module1/sections/_nervous_system.tex
index d996611..d30d2e0 100644
--- a/src/year1/cognition-and-neuroscience/module1/sections/_nervous_system.tex
+++ b/src/year1/cognition-and-neuroscience/module1/sections/_nervous_system.tex
@@ -108,7 +108,7 @@ Generally, a neuron does the following:
     \item[Cell body/soma] Metabolic center of the cell.
 \end{description}
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.5\textwidth]{img/neuron_eukaryotic.png}
         \caption{Neuron as an eukaryotic cell}
@@ -151,7 +151,7 @@ There are three types of synapses:
     \item[Axoaxonic] \marginnote{Axoaxonic}
         Synapses that a neuron makes onto the synapses of another neuron.
         In this case, the transmitting neuron can be seen as a signal modulator of the receiving neuron.
-    \begin{figure}[h]
+    \begin{figure}[H]
         \begin{subfigure}{.3\textwidth}
             \centering
             \includegraphics[width=\linewidth]{./img/axosomatic.png}
@@ -215,7 +215,7 @@ In a neuron, there are four regions that handle signals:
     \item[Electrical synapses] The \ac{ap} is directly transmitted to the next neurons.
 \end{description}
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.8\textwidth]{./img/neuron_transmission.png}
         \caption{Transmitting regions of different types of neurons}
@@ -286,7 +286,7 @@ In a neuron, there are four regions that handle signals:
         \end{remark}
 \end{enumerate}
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.8\textwidth]{./img/neuron_transmission2.png}
         \caption{
diff --git a/src/year1/cognition-and-neuroscience/module2/cn2.tex b/src/year1/cognition-and-neuroscience/module2/cn2.tex
index 875361a..ed93baf 100644
--- a/src/year1/cognition-and-neuroscience/module2/cn2.tex
+++ b/src/year1/cognition-and-neuroscience/module2/cn2.tex
@@ -7,7 +7,14 @@
 \def\lastupdate{{PLACEHOLDER-LAST-UPDATE}}
 \def\giturl{{PLACEHOLDER-GIT-URL}}
-\newtheorem*{casestudy}{Case study}
+\newtheorem*{privatecasestudy}{Case study}
+\newenvironment{casestudy}{%
+    \begin{marginbar}{olive}{0}{thick}
+    \begin{privatecasestudy}
+}{%
+    \end{privatecasestudy}
+    \end{marginbar}
+}
 \begin{document}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_games.tex b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_games.tex
index 161d297..2c49a91 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_games.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_games.tex
@@ -12,7 +12,7 @@ It considers the player as the entity that maximizes (\textsc{Max}) its utility
 the opponent as the entity that (optimally) minimizes (\textsc{Min}) the utility of the player.
-\begin{figure}[h]
+\begin{figure}[H]
     \centering
     \includegraphics[width=0.5\textwidth]{img/_minmax.pdf}
     \caption{Example of game tree with propagated scores}
@@ -121,7 +121,7 @@ In the average case of a random distribution, the reduction is of order $O(b^{3d
     \end{lstlisting}
 \end{algorithm}
-\begin{figure}[h]
+\begin{figure}[H]
     \begin{subfigure}{.3\textwidth}
         \centering
         \includegraphics[width=\linewidth]{img/alphabeta_algo_example1.png}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_intro.tex b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_intro.tex
index d48028b..a31c893 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_intro.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_intro.tex
@@ -113,7 +113,7 @@ Intelligence is defined as the ability to perceive or infer information and to r
 \marginnote{Perceptron}
 A neuron (\textbf{perceptron}) computes a weighted sum of its inputs and passes the result to an activation function to produce the output.
-\begin{figure}[h]
+\begin{figure}[H]
     \centering
     \includegraphics[width=0.40\textwidth]{img/neuron.png}
     \caption{Representation of an artificial neuron}
@@ -128,21 +128,21 @@ The expressivity of a neural network increases when more neurons are used:
 \begin{descriptionlist}
     \item[Single perceptron] Able to compute a linear separation.
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.25\textwidth]{img/1perceptron.png}
         \caption{Separation performed by one perceptron}
     \end{figure}
     \item[Three-layer network] Able to separate a convex region ($n_\text{edges} \leq n_\text{hidden neurons}$)
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.90\textwidth]{img/3layer.png}
         \caption{Separation performed by a three-layer network}
     \end{figure}
     \item[Four-layer network] Able to separate regions of arbitrary shape.
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.40\textwidth]{img/4layer.png}
         \caption{Separation performed by a four-layer network}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_local_search.tex b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_local_search.tex
index d9ea07f..9e8f128 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_local_search.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_local_search.tex
@@ -15,7 +15,7 @@ Problem: find a Hamiltonian tour of minimum cost in an undirected graph.
     A possible neighborhood of a state applies the $k$-exchange that guarantees to maintain a Hamiltonian tour.
-    \begin{figure}[ht]
+    \begin{figure}[H]
         \begin{subfigure}{.5\textwidth}
             \centering
             \includegraphics[width=.70\linewidth]{img/tsp_2-exchange.png}
@@ -78,7 +78,7 @@ Can be seen as a search process over graphs:
     \item[Neighborhood graph] The search space topology.
     \item[Search graph] The explored space.
 \end{descriptionlist}
-\begin{figure}[ht]
+\begin{figure}[H]
     \begin{subfigure}{.5\textwidth}
         \centering
         \includegraphics[width=.55\linewidth]{img/_local_search_neigh_graph.pdf}
@@ -197,7 +197,7 @@ Population based meta heuristics are built on the following concepts:
     \item[Natural selection] Fit organisms have many offspring while others become extinct.
 \end{descriptionlist}
-\begin{table}[ht]
+\begin{table}[H]
     \centering
     \begin{tabular}{c | c}
         \textbf{Biology} & \textbf{Artificial intelligence} \\
@@ -224,7 +224,7 @@ The following terminology will be used:
     \item[Alleles] Domain of values of a gene.
 \end{descriptionlist}
-\begin{figure}[ht]
+\begin{figure}[H]
     \centering
     \includegraphics[width=0.5\textwidth]{img/_genetic_terminology.pdf}
     \caption{}
@@ -270,7 +270,7 @@ Genetic operators are:
     \end{descriptionlist}
 \end{example}
-\begin{figure}[ht]
+\begin{figure}[H]
     \centering
     \includegraphics[width=0.4\textwidth]{img/_genetic_cycle.pdf}
     \caption{Evolutionary cycle}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_planning.tex b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_planning.tex
index 58b018a..5758f04 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_planning.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_planning.tex
@@ -423,7 +423,7 @@ At each step, one of the following refinement operations can be applied until th
     \item Add a causal link to the set of causal links.
 \end{itemize}
-\begin{figure}[h]
+\begin{figure}[H]
     \centering
     \includegraphics[width=0.45\textwidth]{img/_nonlinear_plan_example.pdf}
     \caption{Example of search tree in non-linear planning}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_search.tex b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_search.tex
index df8409d..269181d 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module1/sections/_search.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module1/sections/_search.tex
@@ -30,7 +30,7 @@ A leaf can be a state to expand, a solution or a dead-end.
     \Cref{alg:search_tree_search} describes a generic tree search algorithm.
-    \begin{figure}[h]
+    \begin{figure}[H]
         \centering
         \includegraphics[width=0.25\textwidth]{img/_search_tree.pdf}
         \caption{Search tree}
@@ -122,7 +122,7 @@ Always expands the least deep node. The fringe is implemented as a queue (FIFO).
 The exponential space complexity makes BFS impractical for large problems.
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.30\textwidth]{img/_bfs.pdf}
    \caption{BFS visit order}
@@ -147,7 +147,7 @@ Same as BFS, but always expands the node with the lowest cumulative cost.
    \end{tabular}
 \end{center}
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.50\textwidth]{img/_ucs.pdf}
    \caption{Uniform-cost search visit order. $(n)$ is the cumulative cost}
@@ -175,7 +175,7 @@ Always expands the deepest node. The fringe is implemented as a stack (LIFO).
    \end{tabular}
 \end{center}
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.30\textwidth]{img/_dfs.pdf}
    \caption{DFS visit order}
@@ -261,7 +261,7 @@ The fringe is ordered according to the estimated scores.
    \end{center}
    % The complexity can be reduced depending on the heuristic.
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.65\textwidth]{img/_greedy_best_first_example.pdf}
        \caption{Hill climbing visit order}
@@ -337,7 +337,7 @@ The fringe is ordered according to the estimated scores.
    In general, it is better to use heuristics with large values (i.e. heuristics that don't underestimate too much).
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.65\textwidth]{img/_a_start_example.pdf}
        \caption{A$^*$ visit order}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module2/sections/_ontologies.tex b/src/year1/fundamentals-of-ai-and-kr/module2/sections/_ontologies.tex
index f909a22..fbe91e6 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module2/sections/_ontologies.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module2/sections/_ontologies.tex
@@ -160,7 +160,7 @@ A property of objects.
 \marginnote{Semantic networks}
 Graphical representation of objects and categories connected through labeled links.
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.4\textwidth]{img/semantic_network.png}
    \caption{Example of semantic network}
diff --git a/src/year1/fundamentals-of-ai-and-kr/module2/sections/_semantic_web.tex b/src/year1/fundamentals-of-ai-and-kr/module2/sections/_semantic_web.tex
index 800e3a5..fb51274 100644
--- a/src/year1/fundamentals-of-ai-and-kr/module2/sections/_semantic_web.tex
+++ b/src/year1/fundamentals-of-ai-and-kr/module2/sections/_semantic_web.tex
@@ -55,7 +55,7 @@
    \texttt{http://www.example.org/index.html} has a \texttt{creator} with staff id \texttt{85740}.
    \end{example}
-    \item[XML]
+    \item[XML] \phantom{}
    \begin{example}
        \phantom{}
        \begin{lstlisting}[mathescape=true, language=xml]
 1.
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex
index 9ce0e39..31583f3 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_gradient_methods.tex
@@ -157,7 +157,7 @@ A generic gradient-like method can then be defined as:
    \item[Flat regions and local optima] \marginnote{Flat regions and local optima}
        Flat regions slow down the learning speed, while a local optima causes the method to converge at a poor solution.
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.9\textwidth]{img/_descent_local_flat.pdf}
        \caption{Flat regions and local minima}
@@ -194,7 +194,7 @@ A generic gradient-like method can then be defined as:
        A valley in the objective function causes a gradient method to bounce between the sides to a point where no significant progress can be made.
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.30\linewidth]{img/cliff.png}
@@ -217,7 +217,7 @@ A generic gradient-like method can then be defined as:
        Informally, a set is convex if, for any two points of the set, the points laying on the segment connecting them are also part of the set.
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \begin{subfigure}{.5\textwidth}
            \centering
            \includegraphics[width=.25\linewidth]{img/convex_set.png}
@@ -239,7 +239,7 @@ A generic gradient-like method can then be defined as:
    \]
    In other words, the segment connecting two points of the function lays above the graph.
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.55\textwidth]{img/convex_function.png}
        \caption{Convex function}
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
index b4b0442..72e6bef 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_linear_algebra.tex
@@ -232,7 +232,7 @@ Common norms are:
    The vector $\vec{w} \in U^\perp$ s.t. $\Vert \vec{w} \Vert = 1$ is the \textbf{normal vector} of $U$. \marginnote{Normal vector}
    %
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.4\textwidth]{img/_orthogonal_complement.pdf}
        \caption{Orthogonal complement of a subspace $U \subseteq \mathbb{R}^3$}
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
index c67e561..27cf356 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_machine_learning.tex
@@ -160,7 +160,7 @@ The parameters are determined as the most likely to predict the correct label gi
        which corresponds to the least squares problem.
    \end{description}
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \begin{subfigure}{.45\textwidth}
            \centering
            \includegraphics[width=.75\linewidth]{img/gaussian_mle_good.png}
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex
index 93cda94..ef2cb08 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_matrix_decomp.tex
@@ -146,7 +146,7 @@ Therefore, the compression factor is given by: \marginnote{Compression factor}
    c_k = 1 - \frac{k(1 + m + n)}{mn}
 \]
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.60\textwidth]{img/_rank_k_approx.pdf}
    \caption{Approximation of an image}
@@ -197,7 +197,7 @@ We can formulate this as a linear system:
 that can be solved as a linear least squares problem:
 \[ \min_{\vec{c} \in \mathbb{R}^n} \Vert \vec{y} - \matr{A}\vec{c} \Vert_2^2 \]
-\begin{figure}[h]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.40\textwidth]{img/linear_regression.png}
    \caption{Interpolation using a polynomial of degree 1}
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_probability.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_probability.tex
index a000c38..4512326 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_probability.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_probability.tex
@@ -504,7 +504,7 @@ Moreover, we have that:
        $\bm{\mu} = \nullvec$ and $\matr{\Sigma} = \matr{I}$ (multivariate).
    \end{description}
-    \begin{figure}[ht]
+    \begin{figure}[H]
        \centering
        \includegraphics[width=0.40\textwidth]{img/normal_distribution.png}
        \caption{Normal distributions and standard normal distribution}
diff --git a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex
index a1eb5df..588a1a9 100644
--- a/src/year1/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex
+++ b/src/year1/statistical-and-mathematical-methods-for-ai/sections/_vector_calculus.tex
@@ -197,7 +197,7 @@ Each $\vec{f}_i$ takes as input the output of the previous layer $\vec{x}_{i-1}$
 where $\sigma_i$ is an activation function\footnote{\url{https://en.wikipedia.org/wiki/Activation_function}} (a function to add nonlinearity),
 while $\matr{A}_{i-1}$ (linear mapping) and $\vec{b}_{i-1}$ (biases) are the parameters of $\vec{f}_i$.
-\begin{figure}[ht]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\textwidth]{img/_forward_pass.pdf}
    \caption{Forward pass}
@@ -231,7 +231,7 @@ This can be done by using the chain rule to compute the partial derivatives of $
    \end{split}
 \]
-\begin{figure}[ht]
+\begin{figure}[H]
    \centering
    \includegraphics[width=0.7\textwidth]{img/_backward_pass.pdf}
    \caption{Backward pass}
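
For reference, a minimal usage sketch of the environments this patch introduces. The body text below is invented for illustration; only the environment names and the marginbar arguments come from the diff itself.

% Illustration only (not part of the patch): how the new environments are meant to be used.
\begin{example}
    Example text, typeset by the starred theorem `privateexample' and flanked by a
    single thick lightgray bar drawn in the left margin by the `marginbar' box.
\end{example}

\begin{casestudy}
    Case-study text, marked with an olive margin bar
    (the environment is defined locally in cn1.tex and cn2.tex).
\end{casestudy}

% The underlying box can also be used directly;
% arguments: #1 bar color, #2 number of bars minus one, #3 line style.
\begin{marginbar}{gray}{1}{thin}
    Content flanked by two thin gray bars; the box is breakable across pages.
\end{marginbar}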
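
One assumption worth double-checking: the [H] placement specifier that replaces [h]/[ht] throughout is provided by the float package, which this diff does not add to ainotes.cls. If the class does not already load it (not visible in these hunks), a one-line preamble addition along these lines would be needed:

% Sketch; only needed if ainotes.cls does not already load it.
\usepackage{float} % provides the [H] "place exactly here" float specifier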