Attention-based architectures have become ubiquitous in machine learning, yet
our understanding of the reasons for their effectiveness remains limited. This
work proposes a new way to understand self-attention networks: we show that
their output can be decomposed into a sum of smaller terms, each involving the
operation of a sequence of attention heads across layers. Using this
decomposition, we prove that self-attention possesses a strong inductive bias
towards "token uniformity". Specifically, without skip connections or
multi-layer perceptrons (MLPs), the output converges doubly exponentially to a
rank-1 matrix. On the other hand, skip connections and MLPs prevent the output
from degenerating. Our experiments verify the identified convergence phenomena
on different variants of standard transformer architectures.
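A minimal numerical sketch of the rank-collapse claim (not the authors' code): it stacks single-head softmax self-attention layers with random weights, deliberately omitting skip connections and MLPs, and tracks how far the token matrix is from its best rank-1 approximation. The token count, widths, weight scaling, and the SVD-based residual measure are illustrative assumptions; the paper uses its own residual definition.

import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def pure_attention_layer(X, rng, d_k=16):
    # One single-head self-attention layer with random weights,
    # deliberately WITHOUT a skip connection or an MLP.
    d = X.shape[1]
    Wq = rng.normal(scale=d ** -0.5, size=(d, d_k))
    Wk = rng.normal(scale=d ** -0.5, size=(d, d_k))
    Wv = rng.normal(scale=d ** -0.5, size=(d, d))
    A = softmax((X @ Wq) @ (X @ Wk).T / np.sqrt(d_k))  # row-stochastic attention matrix
    return A @ (X @ Wv)

def rank1_residual(X):
    # Relative error of the best rank-1 approximation (via SVD);
    # this approaches 0 as the token representations become collinear.
    s = np.linalg.svd(X, compute_uv=False)
    return np.sqrt((s[1:] ** 2).sum() / (s ** 2).sum())

rng = np.random.default_rng(0)
X = rng.normal(size=(32, 64))  # 32 tokens, embedding width 64
for depth in range(1, 11):
    X = pure_attention_layer(X, rng)
    print(f"depth {depth:2d}: relative rank-1 residual = {rank1_residual(X):.3e}")

Under this setup the residual shrinks rapidly with depth, illustrating the convergence towards a rank-1 (token-uniform) matrix described in the abstract; adding a skip connection to each layer would counteract the collapse.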
@misc{dong2021attention,
author = {Dong, Yihe and Cordonnier, Jean-Baptiste and Loukas, Andreas},
keywords = {2021 attention deep-learning},
note = {arXiv:2103.03404},
title = {Attention is Not All You Need: Pure Attention Loses Rank Doubly Exponentially with Depth},
url = {http://arxiv.org/abs/2103.03404},
year = 2021
}