Deep learning yields great results across many fields, from speech recognition
and image classification to translation. But for each problem, getting a deep
model to work well involves research into the architecture and a long period of
tuning. We present a single model that yields good results on a number of
problems spanning multiple domains. In particular, this single model is trained
concurrently on ImageNet, multiple translation tasks, image captioning (COCO
dataset), a speech recognition corpus, and an English parsing task. Our model
architecture incorporates building blocks from multiple domains: convolutional
layers, an attention mechanism, and sparsely-gated layers. Each of these
computational blocks is crucial for a subset of the tasks we train on.
Interestingly, even when a block is not crucial for a task, we observe that
adding it never hurts performance and in most cases improves it on all tasks.
We also show that tasks with less data benefit greatly from joint training with
other tasks, while performance on large tasks degrades only slightly, if at
all.
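For orientation only, below is a minimal, hypothetical sketch of how the three
building blocks named in the abstract (convolutional layers, an attention
mechanism, and a sparsely-gated mixture-of-experts layer) might be composed
into a single block. It is not the authors' implementation; the module names,
layer sizes, and the choice of PyTorch are all assumptions made for
illustration.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class SparselyGatedMoE(nn.Module):
        # Top-k gated mixture of small feed-forward experts (sizes are made up).
        def __init__(self, dim, num_experts=4, k=2):
            super().__init__()
            self.experts = nn.ModuleList([
                nn.Sequential(nn.Linear(dim, 4 * dim), nn.ReLU(),
                              nn.Linear(4 * dim, dim))
                for _ in range(num_experts)])
            self.gate = nn.Linear(dim, num_experts)
            self.k = k

        def forward(self, x):                          # x: (batch, time, dim)
            scores = self.gate(x)                      # gating logits per position
            topk, idx = scores.topk(self.k, dim=-1)    # keep only the k best experts
            weights = F.softmax(topk, dim=-1)          # renormalise over those k
            out = torch.zeros_like(x)
            for e, expert in enumerate(self.experts):  # dense loop for clarity only
                expert_out = expert(x)
                for slot in range(self.k):
                    mask = (idx[..., slot] == e).unsqueeze(-1).to(x.dtype)
                    out = out + mask * weights[..., slot:slot + 1] * expert_out
            return out

    class ToyMultiDomainBlock(nn.Module):
        # Convolution -> self-attention -> sparse MoE, each wrapped in a residual.
        def __init__(self, dim=128, heads=4):
            super().__init__()
            self.conv = nn.Conv1d(dim, dim, kernel_size=3, padding=1)
            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)
            self.moe = SparselyGatedMoE(dim)

        def forward(self, x):                          # x: (batch, time, dim)
            x = x + self.conv(x.transpose(1, 2)).transpose(1, 2)
            x = x + self.attn(x, x, x, need_weights=False)[0]
            return x + self.moe(x)

    tokens = torch.randn(2, 16, 128)                   # fake batch of 16-step inputs
    print(ToyMultiDomainBlock()(tokens).shape)         # -> torch.Size([2, 16, 128])

In this sketch every expert is evaluated densely for readability; the point of
sparse gating in practice is to route each position to only its top-k experts,
so model capacity can grow without a proportional increase in compute.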
@misc{KaiGom17,
archiveprefix = {arXiv},
author = {Kaiser, Lukasz and Gomez, Aidan N. and Shazeer, Noam and Vaswani, Ashish and Parmar, Niki and Jones, Llion and Uszkoreit, Jakob},
day = 16,
eprint = {1706.05137},
keywords = {deep_learning multiple_domains multimodel topology},
month = jun,
title = {{One Model To Learn Them All}},
url = {http://arxiv.org/abs/1706.05137},
year = 2017
}