Standard image captioning tasks such as COCO and Flickr30k are factual,
neutral in tone and (to a human) state the obvious (e.g., "a man playing a
guitar"). While such tasks are useful to verify that a machine understands the
content of an image, they are not engaging to humans as captions. With this in
mind we define a new task, Personality-Captions, where the goal is to be as
engaging to humans as possible by incorporating controllable style and
personality traits. We collect and release a large dataset of 201,858 of such
captions conditioned over 215 possible traits. We build models that combine
existing work from (i) sentence representations (Mazare et al., 2018) with
Transformers trained on 1.7 billion dialogue examples; and (ii) image
representations (Mahajan et al., 2018) with ResNets trained on 3.5 billion
social media images. We obtain state-of-the-art performance on Flickr30k and
COCO, and strong performance on our new task. Finally, online evaluations
validate that our task and models are engaging to humans, with our best model
close to human performance.
%0 Generic
%1 citeulike:14649646
%A Shuster, Kurt
%A Humeau, Samuel
%A Hu, Hexiang
%A Bordes, Antoine
%A Weston, Jason
%D 2018
%K arch tags
%T Engaging Image Captioning Via Personality
%U http://arxiv.org/abs/1810.10665
%X Standard image captioning tasks such as COCO and Flickr30k are factual,
neutral in tone and (to a human) state the obvious (e.g., "a man playing a
guitar"). While such tasks are useful to verify that a machine understands the
content of an image, they are not engaging to humans as captions. With this in
mind we define a new task, Personality-Captions, where the goal is to be as
engaging to humans as possible by incorporating controllable style and
personality traits. We collect and release a large dataset of 201,858 of such
captions conditioned over 215 possible traits. We build models that combine
existing work from (i) sentence representations (Mazare et al., 2018) with
Transformers trained on 1.7 billion dialogue examples; and (ii) image
representations (Mahajan et al., 2018) with ResNets trained on 3.5 billion
social media images. We obtain state-of-the-art performance on Flickr30k and
COCO, and strong performance on our new task. Finally, online evaluations
validate that our task and models are engaging to humans, with our best model
close to human performance.
@misc{citeulike:14649646,
  abstract                = {Standard image captioning tasks such as COCO and Flickr30k are factual,
neutral in tone and (to a human) state the obvious (e.g., ``a man playing a
guitar''). While such tasks are useful to verify that a machine understands the
content of an image, they are not engaging to humans as captions. With this in
mind we define a new task, Personality-Captions, where the goal is to be as
engaging to humans as possible by incorporating controllable style and
personality traits. We collect and release a large dataset of 201,858 of such
captions conditioned over 215 possible traits. We build models that combine
existing work from (i) sentence representations (Mazare et al., 2018) with
Transformers trained on 1.7 billion dialogue examples; and (ii) image
representations (Mahajan et al., 2018) with ResNets trained on 3.5 billion
social media images. We obtain state-of-the-art performance on Flickr30k and
COCO, and strong performance on our new task. Finally, online evaluations
validate that our task and models are engaging to humans, with our best model
close to human performance.},
  added-at                = {2019-02-27T22:23:29.000+0100},
  archiveprefix           = {arXiv},
  author                  = {Shuster, Kurt and Humeau, Samuel and Hu, Hexiang and Bordes, Antoine and Weston, Jason},
  biburl                  = {https://www.bibsonomy.org/bibtex/20b2a615b016e264aaaf85a2b7cd20fa1/nmatsuk},
  citeulike-article-id    = {14649646},
  citeulike-linkout-0     = {http://arxiv.org/abs/1810.10665},
  citeulike-linkout-1     = {http://arxiv.org/pdf/1810.10665},
  day                     = 25,
  eprint                  = {1810.10665},
  interhash               = {76279c1d110bf5d8062948b79d29de9b},
  intrahash               = {0b2a615b016e264aaaf85a2b7cd20fa1},
  internal-note           = {author names taken from the arXiv abstract page for 1810.10665; original export had a placeholder},
  keywords                = {arch tags},
  month                   = oct,
  posted-at               = {2018-10-31 10:43:29},
  primaryclass            = {cs.CL},
  priority                = {4},
  timestamp               = {2019-02-27T22:23:29.000+0100},
  title                   = {Engaging Image Captioning Via Personality},
  url                     = {http://arxiv.org/abs/1810.10665},
  year                    = 2018
}