Large transformer models have shown extraordinary success in achieving
state-of-the-art results in many natural language processing applications.
However, training and deploying these models can be prohibitively costly for
long sequences, as the standard self-attention mechanism of the Transformer
uses $O(n^2)$ time and space with respect to sequence length. In this paper, we
demonstrate that the self-attention mechanism can be approximated by a low-rank
matrix. We further exploit this finding to propose a new self-attention
mechanism, which reduces the overall self-attention complexity from $O(n^2)$ to
$O(n)$ in both time and space. The resulting linear transformer, the
Linformer, performs on par with standard Transformer models, while
being much more memory- and time-efficient.
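To make the O(n) claim concrete, the sketch below illustrates the low-rank idea for a single attention head: learned projections E and F (the paper's notation for the (k x n) length-axis projections) compress the keys and values from sequence length n down to a fixed k before attention is computed, so the score matrix is n x k rather than n x n. This is a minimal NumPy sketch under assumed shapes and names, not the authors' implementation; the function name linformer_attention and the toy sizes are illustrative.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)  # stabilize before exponentiating
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def linformer_attention(Q, K, V, E, F):
    # Q, K, V: (n, d) query/key/value matrices for one head.
    # E, F:    (k, n) learned projections that compress the sequence axis;
    #          with k fixed and k << n, the cost is O(n k d), i.e. linear
    #          in n, instead of the O(n^2 d) of full self-attention.
    d = Q.shape[-1]
    K_low = E @ K                            # (k, d): compressed keys
    V_low = F @ V                            # (k, d): compressed values
    scores = Q @ K_low.T / np.sqrt(d)        # (n, k) instead of (n, n)
    return softmax(scores, axis=-1) @ V_low  # (n, d): context per query

# Toy usage: n = 512 tokens, head dimension d = 64, projected length k = 32.
rng = np.random.default_rng(0)
n, d, k = 512, 64, 32
Q, K, V = (rng.standard_normal((n, d)) for _ in range(3))
E, F = (rng.standard_normal((k, n)) / np.sqrt(n) for _ in range(2))
print(linformer_attention(Q, K, V, E, F).shape)  # (512, 64)

Because k is chosen independently of n, the score matrix and the compressed key/value matrices occupy O(nk) and O(kd) memory respectively, which is where the saving over the full (n x n) attention matrix comes from.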
%X Large transformer models have shown extraordinary success in achieving
state-of-the-art results in many natural language processing applications.
However, training and deploying these models can be prohibitively costly for
long sequences, as the standard self-attention mechanism of the Transformer
uses $O(n^2)$ time and space with respect to sequence length. In this paper, we
demonstrate that the self-attention mechanism can be approximated by a low-rank
matrix. We further exploit this finding to propose a new self-attention
mechanism, which reduces the overall self-attention complexity from $O(n^2)$ to
$O(n)$ in both time and space. The resulting linear transformer, the
Linformer, performs on par with standard Transformer models, while
being much more memory- and time-efficient.
@misc{wang2020linformer,
author = {Wang, Sinong and Li, Belinda and Khabsa, Madian and Fang, Han and Ma, Hao},
note = {cite arxiv:2006.04768},
title = {Linformer: Self-Attention with Linear Complexity},
url = {http://arxiv.org/abs/2006.04768},
year = 2020
}