mirror of
https://github.com/NotXia/unibo-ai-notes.git
synced 2025-12-14 18:51:52 +01:00
Add ML4CV Viola-Jones + object localization
This commit is contained in:
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,199 @@
|
||||
%PDF-1.6
|
||||
%<25><><EFBFBD><EFBFBD>
|
||||
1 0 obj
|
||||
<< /Metadata 3 0 R /PageLayout /SinglePage /Pages 4 0 R /Type /Catalog /ViewerPreferences << /PageDirection /L2R >> >>
|
||||
endobj
|
||||
2 0 obj
|
||||
<< /Author () /CreationDate (D:20241017145008+00'00) /Creator (Scribus 1.6.2) /Keywords () /ModDate (D:20241017145008+00'00) /Producer (Scribus PDF Library 1.6.2) /Trapped /False >>
|
||||
endobj
|
||||
3 0 obj
|
||||
<< /Subtype /XML /Type /Metadata /Length 1384 >>
|
||||
stream
|
||||
<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>
|
||||
<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Scribus PDF Library 1.6.2">
|
||||
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
|
||||
<rdf:Description xmlns:xmp="http://ns.adobe.com/xap/1.0/" xmp:CreatorTool="Scribus 1.6.2" xmp:CreateDate="2024-10-17T14:50:08+00:00" rdf:about="" xmp:ModifyDate="2024-10-17T14:50:08+00:00" xmp:MetadataDate="2024-10-17T14:50:08Z"/>
|
||||
<rdf:Description xmlns:pdf="http://ns.adobe.com/pdf/1.3/" pdf:Producer="Scribus PDF Library 1.6.2" pdf:Keywords="" rdf:about="" pdf:Trapped="False"/>
|
||||
<rdf:Description xmlns:dc="http://purl.org/dc/elements/1.1/" dc:format="application/pdf" rdf:about="">
|
||||
<dc:title>
|
||||
<rdf:Alt>
|
||||
<rdf:li xml:lang="x-default"/>
|
||||
</rdf:Alt>
|
||||
</dc:title>
|
||||
<dc:creator>
|
||||
<rdf:Seq><rdf:li></rdf:li></rdf:Seq></dc:creator>
|
||||
<dc:description>
|
||||
<rdf:Alt>
|
||||
<rdf:li xml:lang="x-default"/>
|
||||
</rdf:Alt>
|
||||
</dc:description>
|
||||
</rdf:Description>
|
||||
<rdf:Description xmlns:xmpMM="http://ns.adobe.com/xap/1.0/mm/" xmpMM:DocumentID="uuid:298f474b-b5f5-4aee-b1d8-0ec52fb54043" xmpMM:VersionID="1" xmpMM:RenditionClass="default" rdf:about=""/>
|
||||
</rdf:RDF>
|
||||
</x:xmpmeta>
|
||||
|
||||
<?xpacket end="w"?>
|
||||
endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<< /Count 1 /Kids [ 5 0 R ] /Type /Pages >>
|
||||
endobj
|
||||
5 0 obj
|
||||
<< /ArtBox [ 0.00000 0.00000 960.00000 540.00000 ] /BleedBox [ 0.00000 0.00000 960.00000 540.00000 ] /Contents 6 0 R /Group 7 0 R /MediaBox [ 42 49 311 163 ] /Parent 4 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE10 10 0 R /RE12 11 0 R /RE13 12 0 R /RE15 13 0 R /RE16 14 0 R /RE18 15 0 R /RE19 16 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R /RE9 21 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << /RE11 22 0 R /RE14 23 0 R /RE17 24 0 R /RE2 25 0 R /RE20 26 0 R /RE5 27 0 R /RE8 28 0 R >> >> /Rotate 0 /Type /Page >>
|
||||
endobj
|
||||
6 0 obj
|
||||
<< /Filter /FlateDecode /Length 429 >>
|
||||
stream
|
||||
x<EFBFBD><EFBFBD><EFBFBD>MO<EFBFBD>0<0C><><EFBFBD>;#<23>؎<EFBFBD>$w<12><>qBL8<><38>q<EFBFBD><71><EFBFBD><EFBFBD>C<0B><>JM<4A><4D><EFBFBD>u<EFBFBD>GN<47><4E>h<EFBFBD><68><EFBFBD><11>l<EFBFBD>x<EFBFBD>ton[l<><6C>ǽ<EFBFBD><C7BD>.<01>2<EFBFBD>t7<74><37><EFBFBD>{<7B><>K<EFBFBD>y(g<><67>"0<>ʷ<EFBFBD><CAB7><EFBFBD>9<EFBFBD>?v<1E>t>)<07><><EFBFBD>^<5E>f<>><3E><>ڞ<14><><EFBFBD><EFBFBD>É[Lbtm>}<7D><1B><>z(Ac<41>l<EFBFBD>x<EFBFBD>ImքB{<7B><><EFBFBD><EFBFBD>du<64>E<EFBFBD>eI%Kyv<79>jE,<2C>Bh<42>v<EFBFBD><76>4<11><>$H<>H<>u[I<17>>ҪF#<23>&#<23><><01>FTVD<56>:<3A>?<3F>R@&N
|
||||
ѐ<EFBFBD><EFBFBD>e诛4*<2A><06>)<29>i<EFBFBD><03>F
|
||||
P<>!*<2A>#O4<4F><34><EFBFBD> <09><><EFBFBD>/<2F><><EFBFBD> <20><>7iF<<3C><><EFBFBD><EFBFBD>:i2'<27><>Ah#<23>ֶ<19><>#Q<><51>
|
||||
AL<1C><>5W@<40>&Xu)+P<><50> J<>/<2F>RY<17>o<EFBFBD><6F>MZ<4D>2<EFBFBD>hH1<48>P澒<50>[<5B>!<21>f<EFBFBD><66>UAz<41><7A>h<EFBFBD>)<29><>hR<68>>:<3A><><EFBFBD>D<EFBFBD>MZ<4D>]<5D><>2D4<44>K<EFBFBD><4B>VMZ<4D>hn<68>u<EFBFBD>'<27>p_<>endstream
|
||||
endobj
|
||||
7 0 obj
|
||||
<< /CS /DeviceRGB /S /Transparency >>
|
||||
endobj
|
||||
8 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
9 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
10 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
11 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
12 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
13 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
14 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
15 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
16 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
17 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
18 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
19 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
20 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 1.00000 >>
|
||||
endobj
|
||||
21 0 obj
|
||||
<< /AIS false /BM /Normal /OPM 1 /SMask /None /Type /ExtGState /ca 0.80000 >>
|
||||
endobj
|
||||
22 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 29 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE10 10 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R /RE9 21 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 160 >>
|
||||
stream
|
||||
xڥ<EFBFBD>;<0E>0<0C>w<EFBFBD>"<17><15>+<2B>1#<23>"@(<19><><EFBFBD>q*<2A>(<1D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>3<EFBFBD><33>b<EFBFBD><62>.O$c<>VK<56><0C><><EFBFBD><EFBFBD>u<EFBFBD>Z<EFBFBD><5A><EFBFBD>E\A<><41>{J<>EG%5R<><12>V<EFBFBD>Pl<50><6C><EFBFBD><EFBFBD><EFBFBD>dh{<7B>
|
||||
N<EFBFBD><EFBFBD>v{<7B>ٳ;(<28><><1F>P<EFBFBD>]W<>a <1D><><EFBFBD>80<38><30><EFBFBD><EFBFBD>h<EFBFBD><68><EFBFBD><EFBFBD>"<22>䪦@%<25>U֕{2.<2E><>?<3F><>S<EFBFBD>
|
||||
endstream
|
||||
endobj
|
||||
23 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 30 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE10 10 0 R /RE12 11 0 R /RE13 12 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R /RE9 21 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 191 >>
|
||||
stream
|
||||
xڥ<EFBFBD>=<0E>0<0C>w<EFBFBD>"<17><>S'<27>ؘ+& <20>e<><65>$<24><>C[&<1A><><EFBFBD><EFBFBD>9~o6V<>#<23>ϰ w<>p<EFBFBD>`<60>L<EFBFBD><4C><EFBFBD>^<5E><>0<EFBFBD>C<EFBFBD>+<2B>9<EFBFBD>nϠ<6E><CFA0><EFBFBD>:2<07><>c<EFBFBD>̨Kyac<61><<3C>F<EFBFBD>X<EFBFBD>;<3B>-<2D>MR<4D><52>;<3B><><EFBFBD>4ϸ<34><CFB8>E<>E<EFBFBD><45>N<EFBFBD>%<25><>78-k.<2E><>@<40>I<EFBFBD><49><15>F<EFBFBD>_ɒ$߹<><04>^R><3E><>
|
||||
C<15>6ޡ<36><DEA1><EFBFBD><EFBFBD><EFBFBD>x<EFBFBD>k<EFBFBD><6B><EFBFBD>-i<1F><>?<3F><>r
|
||||
endstream
|
||||
endobj
|
||||
24 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 31 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE10 10 0 R /RE12 11 0 R /RE13 12 0 R /RE15 13 0 R /RE16 14 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R /RE9 21 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 160 >>
|
||||
stream
|
||||
xڥ<EFBFBD>;<0E>0<0C>w<EFBFBD>"<17><15>+<2B>1#<23>"@(<19><><EFBFBD>q*<2A>(<1D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>3<EFBFBD><33>b<EFBFBD><62>.O$c<>VK<56><0C><><EFBFBD><EFBFBD>u<EFBFBD>Z<EFBFBD><5A><EFBFBD>E\A<><41>{J<>EG%5R<><12>V<EFBFBD>Pl<50><6C><EFBFBD><EFBFBD><EFBFBD>dh{<7B>
|
||||
N<EFBFBD><EFBFBD>v{<7B>ٳ;(<28><><1F>P<EFBFBD>]W<>a <1D><><EFBFBD>80<38><30><EFBFBD><EFBFBD>h<EFBFBD><68><EFBFBD><EFBFBD>"<22>䪦@%<25>U֕{2.<2E><>?<3F><>S<EFBFBD>
|
||||
endstream
|
||||
endobj
|
||||
25 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 32 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] >> /Subtype /Form /Type /XObject /Length 160 >>
|
||||
stream
|
||||
xڥ<EFBFBD>;<0E>0<0C>w<EFBFBD>"<17><15>+<2B>1#<23>"@(<19><><EFBFBD>q*<2A>(<1D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>3<EFBFBD><33>b<EFBFBD><62>.O$c<>VK<56><0C><><EFBFBD><EFBFBD>u<EFBFBD>Z<EFBFBD><5A><EFBFBD>E\A<><41>{J<>EG%5R<><12>V<EFBFBD>Pl<50><6C><EFBFBD><EFBFBD><EFBFBD>dh{<7B>
|
||||
N<EFBFBD><EFBFBD>v{<7B>ٳ;(<28><><1F>P<EFBFBD>]W<>a <1D><><EFBFBD>80<38><30><EFBFBD><EFBFBD>h<EFBFBD><68><EFBFBD><EFBFBD>"<22>䪦@%<25>U֕{2.<2E><>?<3F><>S<EFBFBD>
|
||||
endstream
|
||||
endobj
|
||||
26 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 33 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE10 10 0 R /RE12 11 0 R /RE13 12 0 R /RE15 13 0 R /RE16 14 0 R /RE18 15 0 R /RE19 16 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R /RE9 21 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 191 >>
|
||||
stream
|
||||
xڥ<EFBFBD>=<0E>0<0C>w<EFBFBD>"<17><>S'<27>ؘ+& <20>e<><65>$<24><>C[&<1A><><EFBFBD><EFBFBD>9~o6V<>#<23>ϰ w<>p<EFBFBD>`<60>L<EFBFBD><4C><EFBFBD>^<5E><>0<EFBFBD>C<EFBFBD>+<2B>9<EFBFBD>nϠ<6E><CFA0><EFBFBD>:2<07><>c<EFBFBD>̨Kyac<61><<3C>F<EFBFBD>X<EFBFBD>;<3B>-<2D>MR<4D><52>;<3B><><EFBFBD>4ϸ<34><CFB8>E<>E<EFBFBD><45>N<EFBFBD>%<25><>78-k.<2E><>@<40>I<EFBFBD><49><15>F<EFBFBD>_ɒ$߹<><04>^R><3E><>
|
||||
C<15>6ޡ<36><DEA1><EFBFBD><EFBFBD><EFBFBD>x<EFBFBD>k<EFBFBD><6B><EFBFBD>-i<1F><>?<3F><>r
|
||||
endstream
|
||||
endobj
|
||||
27 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 34 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE3 17 0 R /RE4 18 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 191 >>
|
||||
stream
|
||||
xڥ<EFBFBD>=<0E>0<0C>w<EFBFBD>"<17><>S'<27>ؘ+& <20>e<><65>$<24><>C[&<1A><><EFBFBD><EFBFBD>9~o6V<>#<23>ϰ w<>p<EFBFBD>`<60>L<EFBFBD><4C><EFBFBD>^<5E><>0<EFBFBD>C<EFBFBD>+<2B>9<EFBFBD>nϠ<6E><CFA0><EFBFBD>:2<07><>c<EFBFBD>̨Kyac<61><<3C>F<EFBFBD>X<EFBFBD>;<3B>-<2D>MR<4D><52>;<3B><><EFBFBD>4ϸ<34><CFB8>E<>E<EFBFBD><45>N<EFBFBD>%<25><>78-k.<2E><>@<40>I<EFBFBD><49><15>F<EFBFBD>_ɒ$߹<><04>^R><3E><>
|
||||
C<15>6ޡ<36><DEA1><EFBFBD><EFBFBD><EFBFBD>x<EFBFBD>k<EFBFBD><6B><EFBFBD>-i<1F><>?<3F><>r
|
||||
endstream
|
||||
endobj
|
||||
28 0 obj
|
||||
<< /BBox [ 0.00000 -17.12250 960.00000 17.12250 ] /Filter /FlateDecode /FormType 1 /Group 35 0 R /Resources << /ExtGState << /RE0 8 0 R /RE1 9 0 R /RE3 17 0 R /RE4 18 0 R /RE6 19 0 R /RE7 20 0 R >> /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << >> >> /Subtype /Form /Type /XObject /Length 160 >>
|
||||
stream
|
||||
xڥ<EFBFBD>;<0E>0<0C>w<EFBFBD>"<17><15>+<2B>1#<23>"@(<19><><EFBFBD>q*<2A>(<1D><><EFBFBD><EFBFBD><EFBFBD><EFBFBD>3<EFBFBD><33>b<EFBFBD><62>.O$c<>VK<56><0C><><EFBFBD><EFBFBD>u<EFBFBD>Z<EFBFBD><5A><EFBFBD>E\A<><41>{J<>EG%5R<><12>V<EFBFBD>Pl<50><6C><EFBFBD><EFBFBD><EFBFBD>dh{<7B>
|
||||
N<EFBFBD><EFBFBD>v{<7B>ٳ;(<28><><1F>P<EFBFBD>]W<>a <1D><><EFBFBD>80<38><30><EFBFBD><EFBFBD>h<EFBFBD><68><EFBFBD><EFBFBD>"<22>䪦@%<25>U֕{2.<2E><>?<3F><>S<EFBFBD>
|
||||
endstream
|
||||
endobj
|
||||
29 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
30 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
31 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
32 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
33 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
34 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
35 0 obj
|
||||
<< /I true /K false /S /Transparency /Type /Group >>
|
||||
endobj
|
||||
xref
|
||||
0 36
|
||||
0000000000 65535 f
|
||||
0000000015 00000 n
|
||||
0000000149 00000 n
|
||||
0000000346 00000 n
|
||||
0000001811 00000 n
|
||||
0000001870 00000 n
|
||||
0000002440 00000 n
|
||||
0000002940 00000 n
|
||||
0000002993 00000 n
|
||||
0000003086 00000 n
|
||||
0000003179 00000 n
|
||||
0000003273 00000 n
|
||||
0000003367 00000 n
|
||||
0000003461 00000 n
|
||||
0000003555 00000 n
|
||||
0000003649 00000 n
|
||||
0000003743 00000 n
|
||||
0000003837 00000 n
|
||||
0000003931 00000 n
|
||||
0000004025 00000 n
|
||||
0000004119 00000 n
|
||||
0000004213 00000 n
|
||||
0000004307 00000 n
|
||||
0000004834 00000 n
|
||||
0000005418 00000 n
|
||||
0000005997 00000 n
|
||||
0000006436 00000 n
|
||||
0000007072 00000 n
|
||||
0000007581 00000 n
|
||||
0000008083 00000 n
|
||||
0000008152 00000 n
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -37,8 +37,8 @@
|
||||
|
||||
\begin{description}
|
||||
\item[True/false positive criteria]
|
||||
Given a threshold $\rho_\texttt{IoU}$, a detection $BB_i$ is a true positive (\texttt{TP}) w.r.t. a ground truth $\hat{BB_j}$ if it is classified with the same class and:
|
||||
\[ \texttt{IoU}(BB_i, \hat{BB_j}) > \rho_\texttt{IoU} \]
|
||||
Given a threshold $\rho_\texttt{IoU}$, a detection $BB_i$ is a true positive (\texttt{TP}) w.r.t. a ground-truth $\widehat{BB_j}$ if it is classified with the same class and:
|
||||
\[ \texttt{IoU}(BB_i, \widehat{BB_j}) > \rho_\texttt{IoU} \]
|
||||
|
||||
\begin{remark}
|
||||
Confidence can also be considered when determining a match through a threshold $\rho_\text{min}$.
|
||||
@ -46,8 +46,8 @@
|
||||
\end{description}
|
||||
|
||||
\item[Recall]
|
||||
Measures the number of ground truth objects that have been found:
|
||||
\[ \texttt{recall} = \frac{\vert \texttt{TP} \vert}{\vert \text{ground truth boxes} \vert} \]
|
||||
Measures the number of ground-truth objects that have been found:
|
||||
\[ \texttt{recall} = \frac{\vert \texttt{TP} \vert}{\vert \text{ground-truth boxes} \vert} \]
|
||||
|
||||
\item[Precision]
|
||||
Measures the number of correct detections among all the predictions:
|
||||
@ -62,7 +62,7 @@
|
||||
\end{figure}
|
||||
|
||||
\item[Precision-recall curve]
|
||||
Plot that relates precision and recall.
|
||||
Plot that relates all possible precisions and recalls of a detector.
|
||||
|
||||
\begin{example}
|
||||
Consider the following image and the bounding boxes found by a detector:
|
||||
@ -71,7 +71,7 @@
|
||||
\includegraphics[width=0.4\linewidth]{./img/_example_precision_recall_curve1.pdf}
|
||||
\caption{
|
||||
\parbox[t]{0.6\linewidth}{
|
||||
Ground truth (yellow boxes) and predictions (orange boxes) with their confidence score
|
||||
Ground-truth (yellow boxes) and predictions (orange boxes) with their confidence score
|
||||
}
|
||||
}
|
||||
\end{figure}
|
||||
@ -87,4 +87,267 @@
|
||||
Recall is monotonically decreasing, while precision can both decrease and increase.
|
||||
\end{remark}
|
||||
\end{example}
|
||||
|
||||
\begin{description}
|
||||
\item[Average precision (AP)] \marginnote{Average precision (AP)}
|
||||
Area under the precision-recall curve.
|
||||
|
||||
\item[Mean average precision (mAP)] \marginnote{Mean AP (mAP)}
|
||||
Mean AP over the possible classes.
|
||||
|
||||
\item[COCO mean average precision] \marginnote{COCO mAP}
|
||||
Compute for each class the average AP over varying $\rho_\texttt{IoU}$ (e.g., in the original paper, $\rho_\texttt{IoU} \in [0.5, 0.95]$ with $0.05$ steps) and further average them over the possible classes.
|
||||
|
||||
\begin{remark}
|
||||
Higher COCO mAP indicates a detector with good localization capabilities.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{Viola-Jones}
|
||||
|
||||
\begin{description}
|
||||
\item[Viola-Jones] \marginnote{Viola-Jones object detection}
|
||||
General framework for object detection, mainly applied to faces.
|
||||
|
||||
It is one of the first successful applications of machine learning in computer vision and has the following basis:
|
||||
\begin{itemize}
|
||||
\item Use AdaBoost to learn an ensemble of features.
|
||||
\item Use multi-scale rectangular features computed efficiently using integral images.
|
||||
\item Cascade to obtain real-time speed.
|
||||
\end{itemize}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Boosting}
|
||||
|
||||
\begin{description}
|
||||
\item[Weak learner] \marginnote{Weak learner}
|
||||
Classifier with an error rate slightly lower than a random classifier (i.e., in a balanced binary task, accuracy slightly higher than $50\%$).
|
||||
|
||||
\begin{description}
|
||||
\item[Decision stump] \marginnote{Decision stump}
|
||||
Classifier that learns a threshold for a single feature (i.e., decision tree with depth 1).
|
||||
\end{description}
|
||||
|
||||
\item[Strong learner] \marginnote{Strong learner}
|
||||
Classifier with an accuracy strongly correlated with the ground-truth.
|
||||
|
||||
\item[Adaptive boosting (AdaBoost)] \marginnote{Adaptive boosting (AdaBoost)}
|
||||
Ensemble of $M$ weak learners $\texttt{WL}_i$ that creates a strong learner $\texttt{SL}$ as the linear combination of their predictions (i.e., weighted majority vote):
|
||||
\[ \texttt{SL}(x) = \left( \sum_{i=1}^{M} \alpha_i \texttt{WL}_i(x) > 0 \right) \]
|
||||
|
||||
\item[Training] \marginnote{Boosting training}
|
||||
Given $N$ training samples $(x^{(i)}, y^{(i)})$ and $M$ untrained weak learners $\texttt{WL}_i$, training is done sequentially by tuning one learner at a time:
|
||||
\begin{enumerate}
|
||||
\item Uniformly weigh each sample: $w^{(i)} = \frac{1}{N}$.
|
||||
\item For each weak learner $\texttt{WL}_j$ ($j=1, \dots, M$):
|
||||
\begin{enumerate}
|
||||
\item Fit the weak learner on the weighted training data.
|
||||
\item Compute its error rate:
|
||||
\[ \varepsilon_j = \sum_{i: x^{(i)} \text{ misclassified}} w^{(i)} \]
|
||||
\item Compute the reweigh factor:
|
||||
\[ \beta_j = \frac{1 - \varepsilon_j}{\varepsilon_j} \]
|
||||
\item Increase the weight of misclassified samples:
|
||||
\[ w^{(i)} = w^{(i)} \beta_j \]
|
||||
and re-normalize all samples so that their weights sum to $1$.
|
||||
\end{enumerate}
|
||||
\item Define the strong classifier as:
|
||||
\[ \texttt{SL}(x) = \left( \sum_{j} \ln(\beta_j) \texttt{WL}_j(x) > 0 \right) \]
|
||||
\end{enumerate}
|
||||
|
||||
\begin{example}
|
||||
\small
|
||||
Consider the problem of spam detection with two features $x_1$ and $x_2$ (number of URL and capitalized words, respectively).
|
||||
The training samples and their initial weights are the following:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.3\linewidth]{./img/_adaboost_example1.pdf}
|
||||
\end{figure}
|
||||
We want to train an ensemble of $3$ decision stumps $\texttt{WL}_{j}$.
|
||||
|
||||
Let's say that the first weak classifier learns to detect spam using the criteria $x_1 > 3$. The error rate and reweigh factor are:
|
||||
\[
|
||||
\varepsilon_1 = \frac{1}{8} + \frac{1}{8} \qquad
|
||||
\beta_1 = \frac{1 - \varepsilon_1}{\varepsilon_1} = 3
|
||||
\]
|
||||
The new reweighed and normalized samples are:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{./img/_adaboost_example2.pdf}
|
||||
\end{figure}
|
||||
|
||||
Now, assume that the second classifier learns $x_1 > 10$. The error rate and reweigh factor are:
|
||||
\[ \varepsilon_2 = \frac{1}{12} + \frac{1}{12} \qquad
|
||||
\beta_2 = \frac{1 - \varepsilon_2}{\varepsilon_2} = 5 \]
|
||||
The new reweighed and normalized samples are:
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/_adaboost_example3.pdf}
|
||||
\end{figure}
|
||||
|
||||
Finally, the third classifier learns $x_2 > 20$. The error rate and reweigh factor are:
|
||||
\[ \varepsilon_3 = \frac{1}{20} + \frac{1}{20} + \frac{3}{20} \qquad
|
||||
\beta_3 = \frac{1 - \varepsilon_3}{\varepsilon_3} = 3 \]
|
||||
|
||||
The strong classifier is defined as:
|
||||
\[ \texttt{SL}(x) = \begin{cases}
|
||||
1 & \text{if $\big( \ln(3)\texttt{WL}_1(x) + \ln(5)\texttt{WL}_2(x) + \ln(3)\texttt{WL}_3(x) \big) \geq 0$} \\
|
||||
-1 & \text{otherwise}
|
||||
\end{cases} \]
|
||||
\end{example}
|
||||
|
||||
\item[Haar-like features] \marginnote{Haar-like features}
|
||||
For face detection, a $24 \times 24$ patch of the image is considered (for now) and the weak classifiers define rectangular filters composed of 2 to 4 subsections applied at fixed positions of the patch.
|
||||
|
||||
Given a patch $x$, a weak learner $\texttt{WL}_j$ classifies it as:
|
||||
\[
|
||||
\texttt{WL}_j(x) = \begin{cases}
|
||||
1 & \text{if $s_j f_j \geq s_j \rho_j$} \\
|
||||
-1 & \text{otherwise}
|
||||
\end{cases}
|
||||
\]
|
||||
where the learned parameters are:
|
||||
\begin{itemize}
|
||||
\item The size and position of the filter ($f_j$ is the result of applying the filter).
|
||||
\item The polarity $s_j$.
|
||||
\item The threshold $\rho_j$.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.6\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/_haar_like_example.pdf}
|
||||
\caption{Filter applied on a patch}
|
||||
\end{subfigure}
|
||||
\hfill
|
||||
\begin{subfigure}{0.35\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.65\linewidth]{./img/_haar_like_filters_example.pdf}
|
||||
\caption{Other possible filters}
|
||||
\end{subfigure}
|
||||
\caption{Example of filters}
|
||||
\end{figure}
|
||||
|
||||
\begin{remark}
|
||||
AdaBoost is used to select a subset of the most effective filters.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Integral images}
|
||||
|
||||
\begin{description}
|
||||
\item[Integral image] \marginnote{Integral image}
|
||||
Given an image $I$, its corresponding integral image $II$ is defined as:
|
||||
\[ II(i, j) = \sum_{i' \leq i, j' \leq j} I(i', j') \]
|
||||
In other words, the value at coordinates $(i, j)$ in the integral image is the sum of all the pixels of the original image in an area that starts from the top-left corner and has as bottom-right corner the pixel at $(i, j)$.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.45\linewidth]{./img/_integral_image.pdf}
|
||||
\caption{Example of integral image}
|
||||
\end{figure}
|
||||
|
||||
\begin{remark}
|
||||
In practice, the integral image can be computed recursively as:
|
||||
\[ II(i, j) = II(i, j-1) + II(i-1, j) - II(i-1, j-1) + I(i, j) \]
|
||||
\end{remark}
|
||||
|
||||
\item[Fast feature computation] \marginnote{Fast feature computation}
|
||||
Given an image $I$ and its integral image $II$, the sum of the pixels in a rectangular area of $I$ can be computed in constant time as:
|
||||
\[ II(A) - II(B) - II(C) + II(D) \]
|
||||
where $A$, $B$, $C$, and $D$ are coordinates defined as in \Cref{fig:integral_image_features}.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/_integral_image_feature.pdf}
|
||||
\caption{Summation of the pixels in the blue area}
|
||||
\label{fig:integral_image_features}
|
||||
\end{figure}
|
||||
|
||||
\item[Multi-scale sliding window] \marginnote{Multi-scale sliding window}
|
||||
During inference, Viola-Jones is a sliding window detector that scans the image considering patches of fixed size.
|
||||
|
||||
To achieve scale-invariance, patches of different size are used, scaling the rectangular filters accordingly.
|
||||
|
||||
\begin{remark}
|
||||
The integral image allows the features to be computed in constant time, independently of the patch size.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Cascade}
|
||||
|
||||
\begin{description}
|
||||
\item[Cascade] \marginnote{Cascade}
|
||||
To obtain real-time predictions, a hierarchy of classifiers is used to quickly reject background patches. The first classifier considers a few features while the following ones use more.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.85\linewidth]{./img/_viola_jones_cascade.pdf}
|
||||
\end{figure}
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{Non-maximum suppression}
|
||||
|
||||
\begin{description}
|
||||
\item[Non-maximum suppression (NMS)] \marginnote{Non-maximum suppression (NMS)}
|
||||
Algorithm to obtain a single bounding box from several overlapping ones. Given the set of all the bounding boxes with their confidence that a detector found, NMS works as follows:
|
||||
\begin{enumerate}
|
||||
\item While there are unchecked boxes:
|
||||
\begin{enumerate}
|
||||
\item Consider the bounding box with the highest confidence.
|
||||
\item Eliminate all boxes with overlap higher than a chosen threshold (e.g., $\texttt{IoU} > 0.5$).
|
||||
\end{enumerate}
|
||||
\end{enumerate}
|
||||
|
||||
\begin{remark}
|
||||
If two objects are close, NMS might detect them as a single instance.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||
|
||||
\section{CNN object localization}
|
||||
|
||||
\begin{description}
|
||||
\item[Object localization] \marginnote{Object localization}
|
||||
Subset of object detection problems where it is assumed that there is only a single object to detect.
|
||||
|
||||
\item[CNN for object localization] \marginnote{CNN for object localization}
|
||||
A pre-trained CNN can be used as a feature extractor with two heads:
|
||||
\begin{descriptionlist}
|
||||
\item[Classification head] Used to determine the class.
|
||||
\item[Regression head] Used to determine the bounding box.
|
||||
\end{descriptionlist}
|
||||
|
||||
Given:
|
||||
\begin{itemize}
|
||||
\item The ground-truth class $c^{(i)}$ and bounding box $BB^{(i)}$,
|
||||
\item The predicted class logits $\texttt{scores}^{(i)}$ and bounding box $\widehat{BB}^{(i)}$,
|
||||
\end{itemize}
|
||||
training is a multi-task learning problem with two losses:
|
||||
\[ \mathcal{L}^{(i)} = \mathcal{L}_\text{CE}\left( \texttt{softmax}(\texttt{scores}^{(i)}), \mathbbm{1}[c^{(i)}] \right) + \lambda \mathcal{L}_\text{MSE}\left(\widehat{BB}^{(i)}, BB^{(i)} \right) \]
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.95\linewidth]{./img/_cnn_object_localization.pdf}
|
||||
\caption{Localizer with AlexNet as feature extractor and 1000 classes}
|
||||
\end{figure}
|
||||
|
||||
\begin{remark}
|
||||
A localization CNN can be used as a sliding window detector to detect multiple objects.
|
||||
|
||||
An additional background class (\texttt{bg}) has to be added to mark patches without an object. Moreover, when a patch belongs to the background, the loss related to the bounding box should be ignored. Therefore, the loss becomes:
|
||||
\[ \mathcal{L}^{(i)} = \mathcal{L}_\text{CE}\left( \texttt{softmax}(\texttt{scores}^{(i)}), \mathbbm{1}[c^{(i)}] \right) + \lambda \mathbbm{1}[c^{(i)} \neq \texttt{bg}] \mathcal{L}_\text{MSE}\left(\widehat{BB}^{(i)}, BB^{(i)} \right) \]
|
||||
where $\mathbbm{1}[c^{(i)} \neq \texttt{bg}]$ is $1$ iff the ground-truth class $c^{(i)}$ is not the background class.
|
||||
|
||||
This approach has two main problems:
|
||||
\begin{itemize}
|
||||
\item Background patches are usually more frequent, requiring additional work to balance the dataset or mini-batch.
|
||||
\item There are too many patches to check.
|
||||
\end{itemize}
|
||||
\end{remark}
|
||||
\end{description}
|
||||
Reference in New Issue
Block a user