Learning useful representations without supervision remains a key challenge
in machine learning. In this paper, we propose a simple yet powerful generative
model that learns such discrete representations. Our model, the Vector
Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways:
the encoder network outputs discrete, rather than continuous, codes; and the
prior is learnt rather than static. In order to learn a discrete latent
representation, we incorporate ideas from vector quantisation (VQ). Using the
VQ method allows the model to circumvent issues of "posterior collapse" --
where the latents are ignored when they are paired with a powerful
autoregressive decoder -- typically observed in the VAE framework. Pairing
these representations with an autoregressive prior, the model can generate high
quality images, videos, and speech as well as doing high quality speaker
conversion and unsupervised learning of phonemes, providing further evidence of
the utility of the learnt representations.
%0 Generic
%1 oord2017neural
%A van den Oord, Aaron
%A Vinyals, Oriol
%A Kavukcuoglu, Koray
%D 2017
%K learning representation vq-vae
%T Neural Discrete Representation Learning
%U http://arxiv.org/abs/1711.00937
%X Learning useful representations without supervision remains a key challenge
in machine learning. In this paper, we propose a simple yet powerful generative
model that learns such discrete representations. Our model, the Vector
Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways:
the encoder network outputs discrete, rather than continuous, codes; and the
prior is learnt rather than static. In order to learn a discrete latent
representation, we incorporate ideas from vector quantisation (VQ). Using the
VQ method allows the model to circumvent issues of "posterior collapse" --
where the latents are ignored when they are paired with a powerful
autoregressive decoder -- typically observed in the VAE framework. Pairing
these representations with an autoregressive prior, the model can generate high
quality images, videos, and speech as well as doing high quality speaker
conversion and unsupervised learning of phonemes, providing further evidence of
the utility of the learnt representations.
@misc{oord2017neural,
  abstract      = {Learning useful representations without supervision remains a key challenge
in machine learning. In this paper, we propose a simple yet powerful generative
model that learns such discrete representations. Our model, the Vector
Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways:
the encoder network outputs discrete, rather than continuous, codes; and the
prior is learnt rather than static. In order to learn a discrete latent
representation, we incorporate ideas from vector quantisation (VQ). Using the
VQ method allows the model to circumvent issues of ``posterior collapse'' --
where the latents are ignored when they are paired with a powerful
autoregressive decoder -- typically observed in the VAE framework. Pairing
these representations with an autoregressive prior, the model can generate high
quality images, videos, and speech as well as doing high quality speaker
conversion and unsupervised learning of phonemes, providing further evidence of
the utility of the learnt representations.},
  added-at      = {2020-03-02T10:46:43.000+0100},
  author        = {van den Oord, Aaron and Vinyals, Oriol and Kavukcuoglu, Koray},
  biburl        = {https://www.bibsonomy.org/bibtex/215309437fa347f07d5fa2441f805d7c1/nosebrain},
  eprint        = {1711.00937},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  interhash     = {b8563b6d31c62a23db005cbcf4a9bfbe},
  intrahash     = {15309437fa347f07d5fa2441f805d7c1},
  keywords      = {learning representation vq-vae},
  note          = {arXiv:1711.00937},
  timestamp     = {2020-03-02T10:46:43.000+0100},
  title         = {Neural Discrete Representation Learning},
  url           = {http://arxiv.org/abs/1711.00937},
  year          = {2017},
}