Add CN2 neural network vision
@ -14,6 +14,7 @@
|
||||
\newpage
|
||||
|
||||
\input{./sections/_object_recognition.tex}
|
||||
\input{./sections/_nn_recognition.tex}
|
||||
|
||||
\printbibliography[heading=bibintoc]
|
||||
|
||||
|
||||
BIN
src/year1/cognition-and-neuroscience/module2/img/cornet.png
Normal file
|
After Width: | Height: | Size: 113 KiB |
|
After Width: | Height: | Size: 83 KiB |
|
After Width: | Height: | Size: 54 KiB |
|
After Width: | Height: | Size: 388 KiB |
|
After Width: | Height: | Size: 180 KiB |
|
After Width: | Height: | Size: 443 KiB |
|
After Width: | Height: | Size: 101 KiB |
|
After Width: | Height: | Size: 267 KiB |
|
After Width: | Height: | Size: 885 KiB |
|
After Width: | Height: | Size: 209 KiB |
|
After Width: | Height: | Size: 165 KiB |
|
After Width: | Height: | Size: 94 KiB |
|
After Width: | Height: | Size: 51 KiB |
|
After Width: | Height: | Size: 44 KiB |
|
After Width: | Height: | Size: 76 KiB |
|
After Width: | Height: | Size: 60 KiB |
|
After Width: | Height: | Size: 176 KiB |
|
After Width: | Height: | Size: 50 KiB |
|
After Width: | Height: | Size: 29 KiB |
|
After Width: | Height: | Size: 307 KiB |
|
After Width: | Height: | Size: 81 KiB |
|
After Width: | Height: | Size: 178 KiB |
|
After Width: | Height: | Size: 61 KiB |
|
After Width: | Height: | Size: 43 KiB |
|
After Width: | Height: | Size: 41 KiB |
|
After Width: | Height: | Size: 77 KiB |
|
After Width: | Height: | Size: 158 KiB |
|
After Width: | Height: | Size: 67 KiB |
|
After Width: | Height: | Size: 129 KiB |
|
After Width: | Height: | Size: 154 KiB |
@ -20,3 +20,65 @@
|
||||
year = {2014},
|
||||
doi = {10.1073/pnas.1403112111},
|
||||
}
|
||||
|
||||
|
||||
@article{human_monkey_confusion,
|
||||
title = {Comparison of Object Recognition Behavior in Human and Monkey},
|
||||
author = {Rajalingham, Rishi and Schmidt, Kailyn and DiCarlo, James J.},
|
||||
volume = {35},
|
||||
number = {35},
|
||||
pages = {12127--12136},
|
||||
year = {2015},
|
||||
publisher = {Society for Neuroscience},
|
||||
journal = {Journal of Neuroscience},
|
||||
doi = {10.1523/JNEUROSCI.0573-15.2015},
|
||||
}
|
||||
|
||||
|
||||
@article{human_dcnn_divergence,
|
||||
title = {Large-Scale, High-Resolution Comparison of the Core Visual Object Recognition Behavior of Humans, Monkeys, and State-of-the-Art Deep Artificial Neural Networks},
|
||||
author = {Rajalingham, Rishi and Issa, Elias B. and Bashivan, Pouya and Kar, Kohitij and Schmidt, Kailyn and DiCarlo, James J.},
|
||||
volume = {38},
|
||||
number = {33},
|
||||
pages = {7255--7269},
|
||||
year = {2018},
|
||||
publisher = {Society for Neuroscience},
|
||||
journal = {Journal of Neuroscience},
|
||||
doi = {10.1523/JNEUROSCI.0388-18.2018},
|
||||
}
|
||||
|
||||
|
||||
@article{recognition_reaction,
|
||||
title = {Evidence that recurrent circuits are critical to the ventral stream's execution of core object recognition behavior},
|
||||
author = {Kar, Kohitij and Kubilius, Jonas and Schmidt, Kailyn and Issa, Elias B. and DiCarlo, James J.},
|
||||
journal = {Nature Neuroscience},
|
||||
year = {2019},
|
||||
volume = {22},
|
||||
number = {6},
|
||||
pages = {974--983},
|
||||
doi = {10.1038/s41593-019-0392-5},
|
||||
}
|
||||
|
||||
|
||||
@article{pattern_completion,
|
||||
title = {Recurrent computations for visual pattern completion},
|
||||
author = {Hanlin Tang and Martin Schrimpf and William Lotter and Charlotte Moerman and Ana Paredes and Josue Ortega Caro and Walter Hardesty and David Cox and Gabriel Kreiman },
|
||||
journal = {Proceedings of the National Academy of Sciences},
|
||||
volume = {115},
|
||||
number = {35},
|
||||
pages = {8835--8840},
|
||||
year = {2018},
|
||||
doi = {10.1073/pnas.1719397115},
|
||||
}
|
||||
|
||||
|
||||
@article{unsupervised_embedding,
|
||||
title = {Unsupervised neural network models of the ventral visual stream},
|
||||
author = {Chengxu Zhuang and Siming Yan and Aran Nayebi and Martin Schrimpf and Michael C. Frank and James J. DiCarlo and Daniel L. K. Yamins },
|
||||
journal = {Proceedings of the National Academy of Sciences},
|
||||
volume = {118},
|
||||
number = {3},
|
||||
pages = {e2014196118},
|
||||
year = {2021},
|
||||
doi = {10.1073/pnas.2014196118},
|
||||
}
|
||||
@ -0,0 +1,431 @@
|
||||
\chapter{Object recognition emulation through neural networks}
|
||||
|
||||
|
||||
\section{Convolutional neural networks}
|
||||
\marginnote{Convolutional neural networks}
|
||||
|
||||
Deep convolutional neural networks (DCNNs) show an internal feature representation similar to the representation of the ventral pathway (primate ventral visual stream).
|
||||
Moreover, object confusion in DCNNs is similar to the behavioral patterns in primates.
|
||||
|
||||
However, on a higher resolution level (i.e. not object but image level), the performance of DCNNs diverges drastically from human behavior.
|
||||
|
||||
\begin{remark}
|
||||
Studies using HCNN have also been presented in the previous chapter.
|
||||
\end{remark}
|
||||
|
||||
\begin{casestudy}[Humans and monkeys object confusion \cite{human_monkey_confusion}]
|
||||
It has been seen that monkeys show a confusion pattern correlated to that of humans on the task of object recognition.
|
||||
Convolutional neural networks also show this correlation while low-level visual representations (V1 or pixels, a baseline computed from the pixels of the image)
|
||||
correlate poorly.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{./img/human_monkey_confusion.png}
|
||||
\end{figure}
|
||||
\end{casestudy}
|
||||
|
||||
\begin{casestudy}[Primates and DCNNs object recognition divergence \cite{human_dcnn_divergence}]
|
||||
Humans, monkeys and DCNNs are trained for the task of object recognition.
|
||||
|
||||
To enforce an invariant recognition behavior, each image has an object with a random transformation (position, rotation, size)
|
||||
and has a random natural background.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.65\linewidth]{./img/human_dcnn_divergence3.png}
|
||||
\end{figure}
|
||||
|
||||
\begin{itemize}
|
||||
\item For humans, a trial starts with fixation. Then, an image is displayed for 100 ms followed by a binary choice.
|
||||
The human has to make its choice in 1000 ms.
|
||||
|
||||
\item For monkeys, a trial starts with fixation. Then, an image is displayed for 100 ms followed by a binary choice.
|
||||
The monkey has up to 1500 ms to freely view the response images and has to maintain fixation on its choice for 700 ms.
|
||||
|
||||
\item DCNNs are trained as usual.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.85\linewidth]{./img/human_dcnn_divergence1.png}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/human_dcnn_divergence2.png}
|
||||
\caption{Steps of a trial}
|
||||
\end{figure}
|
||||
|
||||
Performance is measured using behavioral metrics.
|
||||
Results show that:
|
||||
\begin{descriptionlist}
|
||||
\item[Object-level]
|
||||
Object-level measurements are obtained as an average across all images of that object.
|
||||
|
||||
The recognition confusion patterns of primates and DCNNs are mostly correlated.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.65\linewidth]{./img/human_dcnn_divergence4.png}
|
||||
\caption{
|
||||
\parbox[t]{0.6\linewidth}{
|
||||
Object-level results. In the first part, warmer colors indicate a better classification.
|
||||
}
|
||||
}
|
||||
\end{figure}
|
||||
|
||||
\item[Image-level]
|
||||
Image-level measurements are obtained by normalizing the raw classification results.
|
||||
|
||||
All DCNNs fail to replicate the behavioral signatures of primates.
|
||||
This hints at the fact that the architecture and/or the training process is limiting the capability of the models.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.75\linewidth]{./img/human_dcnn_divergence5.png}
|
||||
\end{figure}
|
||||
\end{descriptionlist}
|
||||
\end{casestudy}
|
||||
|
||||
|
||||
|
||||
\section{Recurrent neural networks}
|
||||
\marginnote{Recurrent neural networks}
|
||||
|
||||
|
||||
\subsection{Object recognition}
|
||||
|
||||
The short duration for which participants in the previous experiments were exposed to an image
|
||||
suggests that recurrent computation is not relevant for core object recognition.
|
||||
However, the following points are in contrast with this hypothesis:
|
||||
\begin{itemize}
|
||||
\item DCNNs fail to predict primate behavior in many cases.
|
||||
\item Specific image instances (e.g. blurred, cluttered, occluded) are easy for primates but difficult for DCNNs.
|
||||
\end{itemize}
|
||||
This hints at the fact that recurrent computation might be involved, maybe at later stages of the recognition process.
|
||||
|
||||
\begin{casestudy}[Primates recognition reaction time \cite{recognition_reaction}]
|
||||
\phantom{}
|
||||
\begin{descriptionlist}
|
||||
\item[Recognition training and evaluation]
|
||||
Humans, macaques and DCNNs are trained for the task of object recognition on
|
||||
images with two levels of difficulty:
|
||||
\begin{descriptionlist}
|
||||
\item[Control images] Easier to recognize.
|
||||
\item[Challenge images] Harder to recognize.
|
||||
\end{descriptionlist}
|
||||
Results show that primates outperform DCNNs on challenge images.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.6\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/recognition_reaction1.png}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.25\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/recognition_reaction2.png}
|
||||
\end{subfigure}
|
||||
\caption{
|
||||
\parbox[t]{0.7\linewidth}{
|
||||
Trial steps, example images and behavioral comparison between monkeys and DCNNs.
|
||||
Red and blue points in the graph are challenge and control images, respectively.
|
||||
}
|
||||
}
|
||||
\end{figure}
|
||||
|
||||
\item[Reaction time]
|
||||
It has also been observed that the reaction time of both humans and monkeys for challenge images is significantly higher than that for control images
|
||||
($\Delta\text{RT} = 11.9 \text{ ms}$ for monkeys and $\Delta\text{RT} = 25 \text{ ms}$ for humans).
|
||||
|
||||
To determine the time at which the identity of an object is formed in the IT cortex,
|
||||
the neural activity is measured every 10 ms after the stimulus onset and a linear classifier (decoder) is trained to determine the \textbf{neural decode accuracy (NDA)}
|
||||
(i.e. the best accuracy that the classifier can achieve with the information in that time slice).
|
||||
The \textbf{object solution time (OST)} is the time at which the NDA reaches the primate accuracy (i.e. is high enough).
|
||||
|
||||
It has been observed that challenge images have a slightly higher OST ($\sim 30 \text{ ms}$)
|
||||
whether the animal was actively performing the task or passively viewing the image.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/recognition_reaction3.png}
|
||||
\end{figure}
|
||||
|
||||
\item[DCNN IT prediction]
|
||||
The IT neuronal response for a subset of challenge and control images has been measured across 10 ms bins
|
||||
to obtain two sets $R^\text{train}$ and $R^\text{test}$ (50/50).
|
||||
|
||||
During training, the activation $F^\text{train}$ of a layer of the DCNN is used to predict $R^\text{train}$
|
||||
through partial least square regression (i.e. a linear combination of $F^\text{train}$).
|
||||
|
||||
During testing, the activation of the same layer of the DCNN is transformed using the found parameters and compared to $R^\text{test}$.
|
||||
|
||||
Results show a higher predictivity for early responses (which are mainly feed-forward) and a significant drop over time.
|
||||
The drop coincides with the OST of challenge images, hinting at the fact that later phases of the IT might involve recurrence.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.35\linewidth]{./img/recognition_reaction4.png}
|
||||
\end{figure}
|
||||
|
||||
\item[CORnet IT prediction]
|
||||
The previous experiment has also been done using deeper CNNs that showed better predictivity.
|
||||
This can be explained by the fact that deeper networks simulate the unrolling of a recurrent network and are therefore an approximation of them.
|
||||
|
||||
Deeper networks are also able to solve some of the challenge images but those that remained unsolved
|
||||
are those with the longest OSTs among the challenge images.
|
||||
|
||||
CORnet, a four-layer recurrent neural network, has also been tested.
|
||||
Results show that the first layers of CORnet are good predictors of the early IT phases while the last layers are good at predicting the late phases of IT.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/cornet.png}
|
||||
\caption{Architecture of CORnet}
|
||||
\end{figure}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.48\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.6\linewidth]{./img/recognition_reaction5.png}
|
||||
\caption{
|
||||
\parbox[t]{0.9\linewidth}{
|
||||
Predictivity for deep, deeper and recurrent CNNs
|
||||
}
|
||||
}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.48\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.6\linewidth]{./img/recognition_reaction6.png}
|
||||
\caption{
|
||||
\parbox[t]{0.9\linewidth}{
|
||||
Number (on top) and median OST (bars) of the unsolved images for each model
|
||||
}
|
||||
}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\begin{remark}
|
||||
Recurrence can be seen as performing further non-linear transformations on top of those of the feed-forward phase.
|
||||
\end{remark}
|
||||
\end{descriptionlist}
|
||||
\end{casestudy}
|
||||
|
||||
|
||||
\subsection{Visual pattern completion}
|
||||
|
||||
|
||||
\begin{description}
|
||||
\item[Pattern completion] \marginnote{Pattern completion}
|
||||
Ability to recognize poorly visible or occluded objects.
|
||||
|
||||
\begin{remark}
|
||||
The visual system is able to infer an object even if only 10-20\% of it is visible.
|
||||
|
||||
It is hypothesized that recurrent computation is involved.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
\begin{casestudy}[Human and RNN pattern completion \cite{pattern_completion}]
|
||||
\phantom{}
|
||||
\begin{descriptionlist}
|
||||
\item[Trial structure]
|
||||
Whole and partial images are presented to humans through two types of trials:
|
||||
\begin{descriptionlist}
|
||||
\item[Unmasked]
|
||||
After fixation, an image is displayed for a short time followed by a blank screen. Then, a response is required from the participant.
|
||||
|
||||
\item[Backward masking]
|
||||
After fixation, an image is displayed for a short time followed by another image. Then, a response is required from the participant.
|
||||
The second image aims to interrupt the processing of the first one (i.e. interrupt recurrent processing).
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.55\linewidth]{./img/pattern_completion1.png}
|
||||
\end{figure}
|
||||
|
||||
\item[Human results]
|
||||
Results show that subjects are able to robustly recognize whole and partial objects in the unmasked case.
|
||||
In the masked case, performances are instead worse.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/pattern_completion2.png}
|
||||
\end{figure}
|
||||
|
||||
Moreover, measurements show that the neural response to partially visible objects is delayed compared to whole images,
|
||||
hinting at the fact that additional computation is needed.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.5\linewidth]{./img/pattern_completion3.png}
|
||||
\caption{
|
||||
Activity (IFP) of a neuron that responds to faces
|
||||
}
|
||||
\end{figure}
|
||||
|
||||
\item[CNN results]
|
||||
Feed-forward CNNs have also been trained on the task of object recognition.
|
||||
\begin{itemize}
|
||||
\item Performances are comparable to humans for whole images but decline for partial images.
|
||||
\item There is a slight correlation between the latency of humans' neural response and
|
||||
the distance of the internal representation in the CNNs of each partial object to its whole image.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.33\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.9\linewidth]{./img/pattern_completion4.png}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.65\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=0.6\linewidth]{./img/pattern_completion5.png}
|
||||
\caption{Representation and latency correlation. The color of the dots depends on the electrode that measured the latency.}
|
||||
\end{subfigure}
|
||||
\end{figure}
|
||||
|
||||
\item[RNN results]
|
||||
Recurrent neural networks have also been tested by using existing CNNs enhanced through attractor networks\footnote{
|
||||
Recurrent network with multiple attractor points each representing a whole image.
|
||||
By processing the same partial image for multiple time steps, its representation should converge to an attractor point.
|
||||
} (Hopfield network, RNNh).
|
||||
Results show that RNNh has higher performance in pattern completion.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.45\linewidth]{./img/pattern_completion6.png}
|
||||
\end{figure}
|
||||
|
||||
Moreover, by plotting the temporal evolution of the internal representation of partial objects,
|
||||
it can be seen that, at the beginning, partial images are more similar to each other than to their corresponding attractor points,
|
||||
but, over time, their representation approaches the correct cluster.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.7\linewidth]{./img/pattern_completion7.png}
|
||||
\end{figure}
|
||||
|
||||
Time-wise, RNNh performance and correlation with humans increase over the time steps and saturate at around 10--20 steps.
|
||||
This is consistent with the physiological delays of the human ventral visual stream.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.55\linewidth]{./img/pattern_completion8.png}
|
||||
\end{figure}
|
||||
|
||||
By backward masking the input of the RNNh (i.e. present the image for a few time steps and then change it), performance drops from $58 \pm 2\%$ to $37 \pm 2\%$.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.25\linewidth]{./img/pattern_completion9.png}
|
||||
\end{figure}
|
||||
\end{descriptionlist}
|
||||
\end{casestudy}
|
||||
|
||||
|
||||
|
||||
\section{Unsupervised neural networks}
|
||||
\marginnote{Unsupervised neural networks}
|
||||
|
||||
Most of the models to simulate the visual cortex are trained on supervised datasets of millions of images.
|
||||
Such supervision is not able to explain how primates learn to recognize objects, as processing such a huge number of category labels during development is highly improbable.
|
||||
Possible hypotheses are:
|
||||
\begin{itemize}
|
||||
\item Humans might rely on different inductive biases for a more efficient learning.
|
||||
\item Humans might augment their initial dataset by combining known instances.
|
||||
\end{itemize}
|
||||
|
||||
Unsupervised learning might explain what happens in between
|
||||
the representations at low-level visual areas (i.e. the retina), which are mostly hardcoded from evolution,
|
||||
and the representations learned at higher levels.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.55\linewidth]{./img/vision_learning_method.png}
|
||||
\end{figure}
|
||||
|
||||
\begin{casestudy}[Unsupervised embedding \cite{unsupervised_embedding}]
|
||||
Different unsupervised embedding methods are used to create a representation for a dataset of images that are then assessed on various tasks.
|
||||
|
||||
\begin{descriptionlist}
|
||||
\item[Contrastive embedding]
|
||||
Unsupervised embedding method that uses a DCNN (which simulates low-level visual areas) to create the representation of an image in a low dimensional space and
|
||||
then optimizes it by pulling each embedding closer to its close neighbors and pushing it away from its background neighbors.
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\begin{subfigure}{0.75\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/local_aggregation1.png}
|
||||
\end{subfigure}
|
||||
\begin{subfigure}{0.55\linewidth}
|
||||
\centering
|
||||
\includegraphics[width=\linewidth]{./img/local_aggregation2.png}
|
||||
\end{subfigure}
|
||||
\caption{Workflow and visualization of the local aggregation algorithm}
|
||||
\end{figure}
|
||||
|
||||
\item[Results on object recognition tasks]
|
||||
To solve the tasks, unsupervised embeddings are used in conjunction with a linear classifier.
|
||||
A supervised DCNN is also used as a baseline.
|
||||
|
||||
Results show that:
|
||||
\begin{itemize}
|
||||
\item Among all the unsupervised methods, contrastive embeddings have the best performances.
|
||||
\item Unsupervised methods equaled or outperformed the DCNN on tasks such as object position and size estimation.
|
||||
\item The DCNN outperforms unsupervised models on categorization tasks.
|
||||
\end{itemize}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.95\linewidth]{./img/unsupervised_embedding1.png}
|
||||
\caption{
|
||||
\parbox[t]{0.7\linewidth}{
|
||||
Evaluation accuracy of an untrained model (brown), predictive encoding methods (orange),
|
||||
self-supervised methods (blue), contrastive embeddings (red) and a supervised DCNN (black).
|
||||
}
|
||||
}
|
||||
\end{figure}
|
||||
|
||||
\item[Results on neural data]
|
||||
Techniques to map the responses of an artificial network to real neural responses have been used to evaluate unsupervised methods.
|
||||
|
||||
Results show that:
|
||||
\begin{descriptionlist}
|
||||
\item[Area V1] None of the unsupervised methods are statistically better than the DCNN.
|
||||
\item[Area V4] A subset of methods equaled the DCNN.
|
||||
\item[Area IT] Only contrastive embeddings equaled the DCNN.
|
||||
\end{descriptionlist}
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.8\linewidth]{./img/unsupervised_embedding2.png}
|
||||
\end{figure}
|
||||
|
||||
\item[Results on video data]
|
||||
As training on single distinct images (ImageNet) is significantly different from real biological data streams,
|
||||
a dataset containing videos (SAYCam) has been experimented with.
|
||||
A contrastive embedding, the VIE algorithm, has been employed to predict neural activity.
|
||||
|
||||
Results show that embeddings learned from videos are comparable to those learned from only images.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.6\linewidth]{./img/unsupervised_embedding3.png}
|
||||
\end{figure}
|
||||
|
||||
\item[Semi-supervised learning]
|
||||
Semi-supervised embedding aims to find a representation using a small subset of labeled data points and a large amount of unlabeled data.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.75\linewidth]{./img/local_label_propagation.png}
|
||||
\caption{Workflow of the local label propagation algorithm}
|
||||
\end{figure}
|
||||
|
||||
Results show that semi-supervised embeddings with only a $3\%$ of supervision are substantially more consistent than purely unsupervised methods.
|
||||
However, the gap between them and the DCNN still remains.
|
||||
|
||||
Nevertheless, a significant gap is also present between the results of all the models and the noise ceiling of the data,
|
||||
indicating that there still are inconsistencies between artificial networks and the human visual system.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
\includegraphics[width=0.40\linewidth]{./img/unsupervised_embedding4.png}
|
||||
\end{figure}
|
||||
\end{descriptionlist}
|
||||
\end{casestudy}
|
||||
@ -91,7 +91,7 @@
|
||||
|
||||
|
||||
|
||||
\subsection{Pathways}
|
||||
\section{Pathways}
|
||||
|
||||
\begin{description}
|
||||
\item[Retino-geniculo-striate pathway] \marginnote{Retino-geniculo-striate pathway}
|
||||
@ -485,6 +485,7 @@ Object recognition requires both the following competing properties:
|
||||
|
||||
\begin{remark}
|
||||
200 ms is the time required to move the eyes. Experiments on core object recognition don't want candidates to move their eyes.
|
||||
Moreover, it prevents feed-back processing from starting.
|
||||
\end{remark}
|
||||
\end{description}
|
||||
|
||||
|
||||