Large Transformer models routinely achieve state-of-the-art results on a
number of tasks but training these models can be prohibitively costly,
especially on long sequences. We introduce two techniques to improve the
efficiency of Transformers. For one, we replace dot-product attention by one
that uses locality-sensitive hashing, changing its complexity from O($L^2$) to
O($L\log L$), where $L$ is the length of the sequence. Furthermore, we use
reversible residual layers instead of the standard residuals, which allows
storing activations only once in the training process instead of $N$ times,
where $N$ is the number of layers. The resulting model, the Reformer, performs
on par with Transformer models while being much more memory-efficient and much
faster on long sequences.
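
The locality-sensitive-hashing idea lends itself to a compact illustration. Below is a minimal NumPy sketch of angular-LSH bucketing followed by chunked attention; it is not the authors' Trax implementation, and it omits the paper's multi-round hashing, causal masking, and attention across neighbouring chunks. All names (`lsh_buckets`, `chunked_lsh_attention`, `n_buckets`, `chunk_size`) are illustrative assumptions, not the paper's API.

```python
# Sketch only: angular LSH bucketing + chunked attention. Vectors with high
# dot-product tend to hash to the same bucket, so attending within
# bucket-sorted chunks approximates full attention; the O(L log L) cost
# comes from the sort, since each chunk is constant-sized.
import numpy as np

def lsh_buckets(x, n_buckets, rng):
    """Assign each row of x (shape [L, d]) to one of n_buckets angular buckets."""
    proj = x @ rng.standard_normal((x.shape[-1], n_buckets // 2))
    # Argmax over [proj, -proj] is the standard angular-LSH bucket rule.
    return np.argmax(np.concatenate([proj, -proj], axis=-1), axis=-1)

def chunked_lsh_attention(qk, v, n_buckets=16, chunk_size=8, seed=0):
    """Shared-QK attention restricted to chunks of bucket-sorted positions."""
    rng = np.random.default_rng(seed)
    L, d = qk.shape
    order = np.argsort(lsh_buckets(qk, n_buckets, rng), kind="stable")
    inv = np.argsort(order)              # permutation that undoes the sort
    qk_s, v_s = qk[order], v[order]
    out = np.zeros_like(v_s)
    for start in range(0, L, chunk_size):
        sl = slice(start, start + chunk_size)
        scores = qk_s[sl] @ qk_s[sl].T / np.sqrt(d)
        w = np.exp(scores - scores.max(axis=-1, keepdims=True))
        out[sl] = (w / w.sum(axis=-1, keepdims=True)) @ v_s[sl]
    return out[inv]                      # back to the original ordering

qk = np.random.default_rng(1).standard_normal((64, 16))
out = chunked_lsh_attention(qk, qk.copy())  # shared QK, as in the paper
```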
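The reversible-residual claim can likewise be made concrete. The sketch below shows the RevNet-style coupling the paper adopts: because each block is exactly invertible, the backward pass can recompute layer inputs from layer outputs instead of storing activations for all $N$ layers. The `ReversibleBlock` class and the toy `f`/`g` sublayers (standing in for attention and feed-forward) are hypothetical, not the paper's API.

```python
# Sketch only: a RevNet-style reversible residual block. The input is split
# into two halves; each half's update can be undone exactly, so no per-layer
# activation storage is needed during backpropagation.
import numpy as np

class ReversibleBlock:
    def __init__(self, f, g):
        self.f, self.g = f, g  # f, g play the roles of attention / feed-forward

    def forward(self, x1, x2):
        y1 = x1 + self.f(x2)
        y2 = x2 + self.g(y1)
        return y1, y2

    def inverse(self, y1, y2):
        # Exact reconstruction of the inputs from the outputs.
        x2 = y2 - self.g(y1)
        x1 = y1 - self.f(x2)
        return x1, x2

# Round-trip check with toy deterministic sublayers.
rng = np.random.default_rng(0)
w_f, w_g = rng.standard_normal((4, 4)), rng.standard_normal((4, 4))
block = ReversibleBlock(f=lambda h: np.tanh(h @ w_f), g=lambda h: np.tanh(h @ w_g))
x1, x2 = rng.standard_normal((3, 4)), rng.standard_normal((3, 4))
y1, y2 = block.forward(x1, x2)
r1, r2 = block.inverse(y1, y2)
assert np.allclose(x1, r1) and np.allclose(x2, r2)
```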
@misc{kitaev2020reformer,
abstract = {Large Transformer models routinely achieve state-of-the-art results on a
number of tasks but training these models can be prohibitively costly,
especially on long sequences. We introduce two techniques to improve the
efficiency of Transformers. For one, we replace dot-product attention by one
that uses locality-sensitive hashing, changing its complexity from O($L^2$) to
O($L\log L$), where $L$ is the length of the sequence. Furthermore, we use
reversible residual layers instead of the standard residuals, which allows
storing activations only once in the training process instead of $N$ times,
where $N$ is the number of layers. The resulting model, the Reformer, performs
on par with Transformer models while being much more memory-efficient and much
faster on long sequences.},
author = {Kitaev, Nikita and Kaiser, Łukasz and Levskaya, Anselm},
keywords = {reformer toread transformer},
note = {cite arxiv:2001.04451. Comment: ICLR 2020},
title = {Reformer: The Efficient Transformer},
url = {http://arxiv.org/abs/2001.04451},
year = 2020
}