Books are a rich source of both fine-grained information, how a character, an
object or a scene looks like, as well as high-level semantics, what someone is
thinking, feeling and how these states evolve through a story. This paper aims
to align books to their movie releases in order to provide rich descriptive
explanations for visual content that go semantically far beyond the captions
available in current datasets. To align movies and books we exploit a neural
sentence embedding that is trained in an unsupervised way from a large corpus
of books, as well as a video-text neural embedding for computing similarities
between movie clips and sentences in the book. We propose a context-aware CNN
to combine information from multiple sources. We demonstrate good quantitative
performance for movie/book alignment and show several qualitative examples that
showcase the diversity of tasks our model can be used for.
Description
Aligning Books and Movies: Towards Story-like Visual Explanations by
Watching Movies and Reading Books
%0 Generic
%1 zhu2015aligning
%A Zhu, Yukun
%A Kiros, Ryan
%A Zemel, Richard
%A Salakhutdinov, Ruslan
%A Urtasun, Raquel
%A Torralba, Antonio
%A Fidler, Sanja
%D 2015
%K reading
%T Aligning Books and Movies: Towards Story-like Visual Explanations by
Watching Movies and Reading Books
%U http://arxiv.org/abs/1506.06724
%X Books are a rich source of both fine-grained information, how a character, an
object or a scene looks like, as well as high-level semantics, what someone is
thinking, feeling and how these states evolve through a story. This paper aims
to align books to their movie releases in order to provide rich descriptive
explanations for visual content that go semantically far beyond the captions
available in current datasets. To align movies and books we exploit a neural
sentence embedding that is trained in an unsupervised way from a large corpus
of books, as well as a video-text neural embedding for computing similarities
between movie clips and sentences in the book. We propose a context-aware CNN
to combine information from multiple sources. We demonstrate good quantitative
performance for movie/book alignment and show several qualitative examples that
showcase the diversity of tasks our model can be used for.
@misc{zhu2015aligning,
  abstract      = {Books are a rich source of both fine-grained information, how a character, an
                   object or a scene looks like, as well as high-level semantics, what someone is
                   thinking, feeling and how these states evolve through a story. This paper aims
                   to align books to their movie releases in order to provide rich descriptive
                   explanations for visual content that go semantically far beyond the captions
                   available in current datasets. To align movies and books we exploit a neural
                   sentence embedding that is trained in an unsupervised way from a large corpus
                   of books, as well as a video-text neural embedding for computing similarities
                   between movie clips and sentences in the book. We propose a context-aware CNN
                   to combine information from multiple sources. We demonstrate good quantitative
                   performance for movie/book alignment and show several qualitative examples that
                   showcase the diversity of tasks our model can be used for.},
  added-at      = {2016-02-03T09:41:54.000+0100},
  archiveprefix = {arXiv},
  author        = {Zhu, Yukun and Kiros, Ryan and Zemel, Richard and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},
  biburl        = {https://www.bibsonomy.org/bibtex/23a24deb5169c1bc3c11a735b8cd573af/lrieger},
  description   = {Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books},
  eprint        = {1506.06724},
  interhash     = {1bb084882551339644367a49ffd905fd},
  intrahash     = {3a24deb5169c1bc3c11a735b8cd573af},
  keywords      = {reading},
  note          = {cite arxiv:1506.06724},
  primaryclass  = {cs.CV},
  timestamp     = {2016-02-03T09:41:54.000+0100},
  title         = {Aligning Books and Movies: Towards Story-like Visual Explanations by Watching Movies and Reading Books},
  url           = {http://arxiv.org/abs/1506.06724},
  year          = {2015},
}