Real-world image recognition systems need to recognize tens of thousands of
classes that constitute a plethora of visual concepts. The traditional approach
of annotating thousands of images per class for training is infeasible in such
a scenario, prompting the use of webly supervised data. This paper explores the
training of image recognition systems on large numbers of images and associated
user comments. In particular, we develop visual n-gram models that can predict
arbitrary phrases that are relevant to the content of an image. Our visual
n-gram models are feed-forward convolutional networks trained using new loss
functions that are inspired by n-gram models commonly used in language
modeling. We demonstrate the merits of our models in phrase prediction,
phrase-based image retrieval, relating images and captions, and zero-shot
transfer.
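
The training objective the abstract alludes to, a convolutional network whose image embedding scores every phrase in a fixed n-gram dictionary, can be made concrete with a small sketch. The PyTorch code below is a minimal, hypothetical rendering of such a "naive" visual n-gram loss under assumed shapes; the function name naive_ngram_loss and the per-n-gram embedding matrix are illustrative choices, not the paper's exact formulation.

```python
# Minimal sketch (assumed shapes and names) of a softmax-style visual
# n-gram loss: an image embedding scores every n-gram in a fixed
# dictionary, and the loss is the negative log-likelihood of the
# n-grams actually observed in the image's user comment.
import torch
import torch.nn.functional as F

def naive_ngram_loss(image_emb: torch.Tensor,
                     ngram_emb: torch.Tensor,
                     observed: list[torch.Tensor]) -> torch.Tensor:
    """
    image_emb: (B, E) image embeddings from a convolutional network.
    ngram_emb: (K, E) one output embedding per dictionary n-gram.
    observed:  length-B list of index tensors; entry i holds the
               dictionary indices of the n-grams in comment i.
    """
    logits = image_emb @ ngram_emb.t()    # (B, K) score per n-gram
    log_p = F.log_softmax(logits, dim=1)  # normalize over the dictionary
    per_image = [-log_p[i, idx].sum() for i, idx in enumerate(observed)]
    return torch.stack(per_image).mean()

# Toy usage with random data: batch of 2 images, 8-d embeddings,
# a 100-entry n-gram dictionary.
if __name__ == "__main__":
    torch.manual_seed(0)
    img = torch.randn(2, 8)
    W = torch.randn(100, 8, requires_grad=True)
    obs = [torch.tensor([3, 17, 42]), torch.tensor([5])]
    loss = naive_ngram_loss(img, W, obs)
    loss.backward()  # gradients flow into the n-gram embeddings
    print(float(loss))
```

The paper additionally discusses a smoothed variant of this objective, inspired by smoothing in n-gram language models, so that phrases outside the dictionary can still be scored; that refinement is omitted from this sketch.
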
@misc{li2016learning,
author = {Li, Ang and Jabri, Allan and Joulin, Armand and van der Maaten, Laurens},
keywords = {learning network neural ngram phrase supervised vision weakly},
note = {arXiv:1612.09161},
title = {Learning Visual N-Grams from Web Data},
url = {http://arxiv.org/abs/1612.09161},
year = 2016
}