The reparameterization trick enables optimizing large scale stochastic
computation graphs via gradient descent. The essence of the trick is to
refactor each stochastic node into a differentiable function of its parameters
and a random variable with fixed distribution. After refactoring, the gradients
of the loss propagated by the chain rule through the graph are low variance
unbiased estimators of the gradients of the expected loss. While many
continuous random variables have such reparameterizations, discrete random
variables lack useful reparameterizations due to the discontinuous nature of
discrete states. In this work we introduce Concrete random
variables---continuous relaxations of discrete random variables. The Concrete
distribution is a new family of distributions with closed form densities and a
simple reparameterization. Whenever a discrete stochastic node of a computation
graph can be refactored into a one-hot bit representation that is treated
continuously, Concrete stochastic nodes can be used with automatic
differentiation to produce low-variance biased gradients of objectives
(including objectives that depend on the log-probability of latent stochastic
nodes) on the corresponding discrete graph. We demonstrate the effectiveness of
Concrete relaxations on density estimation and structured prediction tasks
using neural networks.
Description
[1611.00712] The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables
%0 Journal Article
%1 maddison2016concrete
%A Maddison, Chris J.
%A Mnih, Andriy
%A Teh, Yee Whye
%D 2016
%K relaxation stats
%T The Concrete Distribution: A Continuous Relaxation of Discrete Random
Variables
%U http://arxiv.org/abs/1611.00712
%X The reparameterization trick enables optimizing large scale stochastic
computation graphs via gradient descent. The essence of the trick is to
refactor each stochastic node into a differentiable function of its parameters
and a random variable with fixed distribution. After refactoring, the gradients
of the loss propagated by the chain rule through the graph are low variance
unbiased estimators of the gradients of the expected loss. While many
continuous random variables have such reparameterizations, discrete random
variables lack useful reparameterizations due to the discontinuous nature of
discrete states. In this work we introduce Concrete random
variables---continuous relaxations of discrete random variables. The Concrete
distribution is a new family of distributions with closed form densities and a
simple reparameterization. Whenever a discrete stochastic node of a computation
graph can be refactored into a one-hot bit representation that is treated
continuously, Concrete stochastic nodes can be used with automatic
differentiation to produce low-variance biased gradients of objectives
(including objectives that depend on the log-probability of latent stochastic
nodes) on the corresponding discrete graph. We demonstrate the effectiveness of
Concrete relaxations on density estimation and structured prediction tasks
using neural networks.
@comment{maddison2016concrete: arXiv preprint. The arXiv identifier is stored
in eprint/archiveprefix (not in note) so that styles can format and link it.
added-at, biburl, interhash, intrahash and timestamp are BibSonomy export
metadata; standard styles do not use them.}
@misc{maddison2016concrete,
  author        = {Maddison, Chris J. and Mnih, Andriy and Teh, Yee Whye},
  title         = {The {Concrete} Distribution: A Continuous Relaxation of Discrete Random Variables},
  year          = {2016},
  eprint        = {1611.00712},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1611.00712},
  abstract      = {The reparameterization trick enables optimizing large scale stochastic
computation graphs via gradient descent. The essence of the trick is to
refactor each stochastic node into a differentiable function of its parameters
and a random variable with fixed distribution. After refactoring, the gradients
of the loss propagated by the chain rule through the graph are low variance
unbiased estimators of the gradients of the expected loss. While many
continuous random variables have such reparameterizations, discrete random
variables lack useful reparameterizations due to the discontinuous nature of
discrete states. In this work we introduce Concrete random
variables---continuous relaxations of discrete random variables. The Concrete
distribution is a new family of distributions with closed form densities and a
simple reparameterization. Whenever a discrete stochastic node of a computation
graph can be refactored into a one-hot bit representation that is treated
continuously, Concrete stochastic nodes can be used with automatic
differentiation to produce low-variance biased gradients of objectives
(including objectives that depend on the log-probability of latent stochastic
nodes) on the corresponding discrete graph. We demonstrate the effectiveness of
Concrete relaxations on density estimation and structured prediction tasks
using neural networks.},
  keywords      = {relaxation stats},
  description   = {[1611.00712] The Concrete Distribution: A Continuous Relaxation of Discrete Random Variables},
  added-at      = {2019-03-22T13:31:46.000+0100},
  timestamp     = {2019-03-22T13:31:46.000+0100},
  biburl        = {https://www.bibsonomy.org/bibtex/2bdac6b274c0ea2f0f2cfcc361c0f118d/kirk86},
  interhash     = {fc80279dd4ef93cba525f83bac95a40c},
  intrahash     = {bdac6b274c0ea2f0f2cfcc361c0f118d},
}