Optimal transport offers an alternative to maximum likelihood for learning
generative autoencoding models. We show that minimizing the p-Wasserstein
distance between the generator and the true data distribution is equivalent to
the unconstrained min-min optimization of the p-Wasserstein distance between
the encoder's aggregated posterior and the prior in latent space, plus a
reconstruction error. We also identify the role of its trade-off hyperparameter
as the capacity of the generator: its Lipschitz constant. Moreover, we prove
that optimizing the encoder over any class of universal approximators, such as
deterministic neural networks, is enough to come arbitrarily close to the
optimum. We therefore advertise this framework, which holds for any metric
space and prior, as a sweet spot of current generative autoencoding objectives.
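
For intuition, the equivalence stated above can be read as a triangle-inequality bound; the notation below is schematic and mine, not the paper's exact statement. With data distribution $P_X$, prior $P_Z$, a deterministic encoder $e$ and a $\gamma$-Lipschitz generator $g$,
\[
W_p\big(P_X,\, g_\# P_Z\big)
\;\le\;
\underbrace{\Big(\mathbb{E}_{x \sim P_X}\, d\big(x,\, g(e(x))\big)^p\Big)^{1/p}}_{\text{reconstruction error}}
\;+\;
\gamma\,\underbrace{W_p\big(e_\# P_X,\, P_Z\big)}_{\text{latent-space Wasserstein term}},
\]
where $\#$ denotes the pushforward of a distribution and $e_\# P_X$ is the aggregated posterior; minimizing the right-hand side jointly over encoder and generator is the unconstrained min-min problem, with $\gamma$ playing the role of the trade-off hyperparameter.
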
We then introduce the Sinkhorn auto-encoder (SAE), which approximates and
minimizes the p-Wasserstein distance in latent space via backpropagation through
the Sinkhorn algorithm. SAE works directly on samples, i.e., it models the
aggregated posterior as an implicit distribution, with no need for a
reparameterization trick for gradient estimation. SAE is thus able to work
with different metric spaces and priors with minimal adaptations. We
demonstrate the flexibility of SAE on latent spaces with different geometries
and priors and compare with other methods on benchmark data sets.
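
To make the latent-space part of this objective concrete, below is a minimal, self-contained sketch of an entropy-regularized p-Wasserstein estimate between encoder outputs and prior samples, computed with Sinkhorn iterations that gradients can be backpropagated through. It is an illustration under simple assumptions (uniform sample weights, Euclidean cost, fixed regularization eps and iteration count, made-up function and variable names), not the authors' reference implementation; the full SAE loss would add the reconstruction term.

```python
import torch

def sinkhorn_wasserstein(x, z, p=2, eps=0.1, n_iters=50):
    """Entropy-regularized approximation of the p-Wasserstein distance between
    two equally weighted point clouds (e.g. latent codes and prior samples).
    Every operation is differentiable, so the loss can be backpropagated
    through the Sinkhorn iterations to an encoder network."""
    n, m = x.shape[0], z.shape[0]
    C = torch.cdist(x, z, p=2) ** p                  # pairwise cost ||x_i - z_j||^p
    K = torch.exp(-C / eps)                          # Gibbs kernel
    a = torch.full((n,), 1.0 / n, device=x.device)   # uniform weights on x
    b = torch.full((m,), 1.0 / m, device=x.device)   # uniform weights on z
    u, v = torch.ones_like(a), torch.ones_like(b)
    for _ in range(n_iters):                         # alternating marginal scaling
        u = a / (K @ v)
        v = b / (K.T @ u)
    P = u[:, None] * K * v[None, :]                  # approximate transport plan
    return ((P * C).sum()) ** (1.0 / p)              # approximate W_p

# Toy usage: latent codes from a hypothetical encoder vs. a standard Gaussian prior.
codes = torch.randn(128, 8, requires_grad=True)      # stand-in for encoder outputs
prior = torch.randn(128, 8)
loss = sinkhorn_wasserstein(codes, prior)
loss.backward()                                      # gradients reach the encoder side
print(loss.item())
```

In practice, eps trades the bias of the entropic approximation against numerical stability; a log-domain implementation is usually preferred for small eps.
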
@article{patrini2018sinkhorn,
author = {Patrini, Giorgio and Berg, Rianne van den and Forré, Patrick and Carioni, Marcello and Bhargav, Samarth and Welling, Max and Genewein, Tim and Nielsen, Frank},
keywords = {divergences generative-models optimal-transport},
note = {arXiv:1810.01118. Accepted for oral presentation at UAI 2019},
title = {Sinkhorn AutoEncoders},
url = {http://arxiv.org/abs/1810.01118},
year = 2018
}