We consider the problem of universal approximation of functions by two-layer
neural nets with random weights that are "nearly Gaussian" in the sense of
Kullback-Leibler divergence. This problem is motivated by recent works on lazy
training, where the weight updates generated by stochastic gradient descent do
not move appreciably from the i.i.d. Gaussian initialization. We first consider
the mean-field limit, where the finite population of neurons in the hidden
layer is replaced by a continuous ensemble, and show that our problem can be
phrased as global minimization of a free-energy functional on the space of
probability measures over the weights. This functional trades off the $L^2$
approximation risk against the KL divergence with respect to a centered
Gaussian prior. We characterize the unique global minimizer and then construct
a controlled nonlinear dynamics in the space of probability measures over
weights that solves a McKean--Vlasov optimal control problem. This control
problem is closely related to the Schrödinger bridge (or entropic optimal
transport) problem, and its value is proportional to the minimum of the free
energy. Finally, we show that SGD in the lazy training regime (which can be
ensured by jointly tuning the variance of the Gaussian prior and the entropic
regularization parameter) serves as a greedy approximation to the optimal
McKean--Vlasov distributional dynamics and provide quantitative guarantees on
the $L^2$ approximation error.
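
For orientation, the free-energy objective described in the abstract can be sketched as follows; the notation ($f$, $\varphi$, $\mu$, $\lambda$, $\sigma^2$) is chosen here for illustration and need not match the paper's exact formulation:

$$F(\mu) \;=\; \mathbb{E}_X\Big[\big(f(X) - \textstyle\int \varphi(X, w)\,\mu(\mathrm{d}w)\big)^2\Big] \;+\; \lambda\, D_{\mathrm{KL}}\big(\mu \,\big\|\, \mathcal{N}(0, \sigma^2 I)\big),$$

where $f$ is the target function, $\varphi(x, w)$ is the output of a single hidden neuron with weights $w$, and $\mu$ is the weight distribution in the mean-field limit. Minimizing $F$ over probability measures $\mu$ trades off fitting $f$ with the mean-field network $\int \varphi(\cdot, w)\,\mu(\mathrm{d}w)$ against staying close, in KL divergence, to the centered Gaussian prior, which is the regime in which lazy training operates.
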
@article{tzen2020meanfield,
author = {Tzen, Belinda and Raginsky, Maxim},
keywords = {bayesian mean-field optimization readings theory uncertainty},
note = {cite arxiv:2002.01987},
title = {A mean-field theory of lazy training in two-layer neural nets: entropic
regularization and controlled McKean-Vlasov dynamics},
url = {http://arxiv.org/abs/2002.01987},
year = 2020
}