State-of-the-art deep convolutional networks (DCNs) such as squeeze-and-
excitation (SE) residual networks implement a form of attention, also known as
contextual guidance, which is derived from global image features. Here, we
explore a complementary form of attention, known as visual saliency, which is
derived from local image features. We extend the SE module with a novel
global-and-local attention (GALA) module which combines both forms of attention
-- resulting in state-of-the-art accuracy on ILSVRC. We further describe
ClickMe.ai, a large-scale online experiment designed for human participants to
identify diagnostic image regions to co-train a GALA network. Adding
humans-in-the-loop is shown to significantly improve network accuracy, while
also yielding visual features that are more interpretable and more similar to
those used by human observers.
%0 Generic
%1 citeulike:14609092
%A xxx
%D 2018
%K arch attention regularization
%T Global-and-local attention networks for visual recognition
%U http://arxiv.org/abs/1805.08819
%X State-of-the-art deep convolutional networks (DCNs) such as squeeze-and-
excitation (SE) residual networks implement a form of attention, also known as
contextual guidance, which is derived from global image features. Here, we
explore a complementary form of attention, known as visual saliency, which is
derived from local image features. We extend the SE module with a novel
global-and-local attention (GALA) module which combines both forms of attention
-- resulting in state-of-the-art accuracy on ILSVRC. We further describe
ClickMe.ai, a large-scale online experiment designed for human participants to
identify diagnostic image regions to co-train a GALA network. Adding
humans-in-the-loop is shown to significantly improve network accuracy, while
also yielding visual features that are more interpretable and more similar to
those used by human observers.
@misc{citeulike:14609092,
  abstract             = {State-of-the-art deep convolutional networks (DCNs) such as
squeeze-and-excitation (SE) residual networks implement a form of attention, also
known as contextual guidance, which is derived from global image features. Here, we
explore a complementary form of attention, known as visual saliency, which is
derived from local image features. We extend the SE module with a novel
global-and-local attention (GALA) module which combines both forms of attention
-- resulting in state-of-the-art accuracy on ILSVRC. We further describe
ClickMe.ai, a large-scale online experiment designed for human participants to
identify diagnostic image regions to co-train a GALA network. Adding
humans-in-the-loop is shown to significantly improve network accuracy, while
also yielding visual features that are more interpretable and more similar to
those used by human observers.},
  added-at             = {2019-02-27T22:23:29.000+0100},
  archiveprefix        = {arXiv},
  author               = {xxx},
  internal-note        = {author field is an export placeholder -- fill in the real author list in "Last, First and Last, First" form},
  biburl               = {https://www.bibsonomy.org/bibtex/22416bcf7344923d04dff4f8a941fab7e/nmatsuk},
  citeulike-article-id = {14609092},
  citeulike-linkout-0  = {http://arxiv.org/abs/1805.08819},
  citeulike-linkout-1  = {http://arxiv.org/pdf/1805.08819},
  day                  = 25,
  eprint               = {1805.08819},
  interhash            = {408aa6cbb79db8332eac2555210515a2},
  intrahash            = {2416bcf7344923d04dff4f8a941fab7e},
  keywords             = {arch attention regularization},
  month                = may,
  posted-at            = {2018-06-28 15:14:33},
  priority             = {2},
  timestamp            = {2019-02-27T22:23:29.000+0100},
  title                = {Global-and-Local Attention Networks for Visual Recognition},
  url                  = {http://arxiv.org/abs/1805.08819},
  year                 = 2018,
}