Vision transformers have delivered tremendous success in representation
learning. This is primarily due to effective token mixing through self
attention. However, this scales quadratically with the number of pixels, which
becomes infeasible for high-resolution inputs. To cope with this challenge, we
propose Adaptive Fourier Neural Operator (AFNO) as an efficient token mixer
that learns to mix in the Fourier domain. AFNO is based on a principled
foundation of operator learning which allows us to frame token mixing as a
continuous global convolution without any dependence on the input resolution.
This principle was previously used to design FNO, which solves global
convolution efficiently in the Fourier domain and has shown promise in learning
challenging PDEs. To handle challenges in visual representation learning such
as discontinuities in images and high resolution inputs, we propose principled
architectural modifications to FNO which result in memory and computational
efficiency. This includes imposing a block-diagonal structure on the channel
mixing weights, adaptively sharing weights across tokens, and sparsifying the
frequency modes via soft-thresholding and shrinkage. The resulting model is
highly parallel with a quasi-linear complexity and has linear memory in the
sequence size. AFNO outperforms self-attention mechanisms for few-shot
segmentation in terms of both efficiency and accuracy. For Cityscapes
segmentation with the Segformer-B3 backbone, AFNO can handle a sequence size of
65k and outperforms other efficient self-attention mechanisms.
Description
[2111.13587] Adaptive Fourier Neural Operators: Efficient Token Mixers for Transformers
%0 Generic
%1 guibas2021adaptive
%A Guibas, John
%A Mardani, Morteza
%A Li, Zongyi
%A Tao, Andrew
%A Anandkumar, Anima
%A Catanzaro, Bryan
%D 2021
%K climate deeplearning fourieroperator idea:big_data_geo_2 todo:read transformer
%T Adaptive Fourier Neural Operators: Efficient Token Mixers for
Transformers
%U http://arxiv.org/abs/2111.13587
%X Vision transformers have delivered tremendous success in representation
learning. This is primarily due to effective token mixing through self
attention. However, this scales quadratically with the number of pixels, which
becomes infeasible for high-resolution inputs. To cope with this challenge, we
propose Adaptive Fourier Neural Operator (AFNO) as an efficient token mixer
that learns to mix in the Fourier domain. AFNO is based on a principled
foundation of operator learning which allows us to frame token mixing as a
continuous global convolution without any dependence on the input resolution.
This principle was previously used to design FNO, which solves global
convolution efficiently in the Fourier domain and has shown promise in learning
challenging PDEs. To handle challenges in visual representation learning such
as discontinuities in images and high resolution inputs, we propose principled
architectural modifications to FNO which result in memory and computational
efficiency. This includes imposing a block-diagonal structure on the channel
mixing weights, adaptively sharing weights across tokens, and sparsifying the
frequency modes via soft-thresholding and shrinkage. The resulting model is
highly parallel with a quasi-linear complexity and has linear memory in the
sequence size. AFNO outperforms self-attention mechanisms for few-shot
segmentation in terms of both efficiency and accuracy. For Cityscapes
segmentation with the Segformer-B3 backbone, AFNO can handle a sequence size of
65k and outperforms other efficient self-attention mechanisms.
@misc{guibas2021adaptive,
  abstract    = {Vision transformers have delivered tremendous success in representation
learning. This is primarily due to effective token mixing through self
attention. However, this scales quadratically with the number of pixels, which
becomes infeasible for high-resolution inputs. To cope with this challenge, we
propose Adaptive Fourier Neural Operator (AFNO) as an efficient token mixer
that learns to mix in the Fourier domain. AFNO is based on a principled
foundation of operator learning which allows us to frame token mixing as a
continuous global convolution without any dependence on the input resolution.
This principle was previously used to design FNO, which solves global
convolution efficiently in the Fourier domain and has shown promise in learning
challenging PDEs. To handle challenges in visual representation learning such
as discontinuities in images and high resolution inputs, we propose principled
architectural modifications to FNO which results in memory and computational
efficiency. This includes imposing a block-diagonal structure on the channel
mixing weights, adaptively sharing weights across tokens, and sparsifying the
frequency modes via soft-thresholding and shrinkage. The resulting model is
highly parallel with a quasi-linear complexity and has linear memory in the
sequence size. AFNO outperforms self-attention mechanisms for few-shot
segmentation in terms of both efficiency and accuracy. For Cityscapes
segmentation with the Segformer-B3 backbone, AFNO can handle a sequence size of
65k and outperforms other efficient self-attention mechanisms.},
  added-at    = {2023-08-11T14:30:02.000+0200},
  author      = {Guibas, John and Mardani, Morteza and Li, Zongyi and Tao, Andrew and Anandkumar, Anima and Catanzaro, Bryan},
  biburl      = {https://www.bibsonomy.org/bibtex/27c108dbe973e37c28df646ac4377e262/annakrause},
  description = {[2111.13587] Adaptive Fourier Neural Operators: Efficient Token Mixers for Transformers},
  doi         = {10.48550/arXiv.2111.13587},
  eprint      = {2111.13587},
  eprinttype  = {arXiv},
  interhash   = {5b95669257f8f06fda7a9b93d0fcce05},
  intrahash   = {7c108dbe973e37c28df646ac4377e262},
  keywords    = {climate deeplearning fourieroperator idea:big_data_geo_2 todo:read transformer},
  note        = {cite arxiv:2111.13587},
  timestamp   = {2023-08-11T14:30:02.000+0200},
  title       = {Adaptive {Fourier} Neural Operators: Efficient Token Mixers for Transformers},
  url         = {http://arxiv.org/abs/2111.13587},
  year        = {2021}
}