The gradient noise (GN) in the stochastic gradient descent (SGD) algorithm is
often considered to be Gaussian in the large data regime by assuming that the
classical central limit theorem (CLT) kicks in. This assumption is often made
for mathematical convenience, since it enables SGD to be analyzed as a
stochastic differential equation (SDE) driven by a Brownian motion. We argue
that the Gaussianity assumption might fail to hold in deep learning settings
and hence render the Brownian motion-based analyses inappropriate. Inspired by
non-Gaussian natural phenomena, we consider the GN in a more general context
and invoke the generalized CLT (GCLT), which suggests that the GN converges to
a heavy-tailed $\alpha$-stable random variable. Accordingly, we propose to
analyze SGD as an SDE driven by a Lévy motion. Such SDEs can incur 'jumps',
which force the SDE to transition from narrow minima to wider minima, as proven by
existing metastability theory. To validate the $\alpha$-stable assumption, we
conduct extensive experiments on common deep learning architectures and show
that in all settings, the GN is highly non-Gaussian and admits heavy tails. We
further investigate the tail behavior in varying network architectures and
sizes, loss functions, and datasets. Our results open up a different
perspective and shed more light on the belief that SGD prefers wide minima.
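In concrete terms, the tail-index analysis in the title amounts to estimating the
stability parameter $\alpha$ from observed gradient noise, i.e. from samples of the
minibatch gradient minus the full-batch gradient: $\alpha$ close to 2 corresponds to
Gaussian-like noise and a Brownian-driven SDE, while smaller $\alpha$ means heavier
tails and a Lévy-driven SDE with jumps. As a rough, self-contained illustration (the
paper's own estimator is not reproduced here, and all names below are ours), the
following sketch applies a Hill-type estimator to synthetic symmetric $\alpha$-stable
and Gaussian samples; with real gradient noise one would pass in the per-coordinate
noise values collected during training.

import numpy as np

def hill_tail_index(samples, k_frac=0.01):
    """Hill-type estimate of the tail index from the largest |samples|.

    For alpha-stable data with alpha < 2 the upper tail decays like a power
    law of index alpha, so the estimate should sit near alpha; for Gaussian
    data it comes out noticeably larger. Hill is biased for stable laws, so
    this is indicative only."""
    x = np.sort(np.abs(np.asarray(samples, dtype=float)))[::-1]  # descending order statistics
    k = max(int(k_frac * len(x)), 10)                            # number of upper order statistics used
    return k / np.sum(np.log(x[:k] / x[k]))

# Synthetic check: symmetric alpha-stable noise (alpha = 1.5), generated with
# the Chambers-Mallows-Stuck method, versus Gaussian noise.
rng = np.random.default_rng(0)
n, alpha = 100_000, 1.5
u = rng.uniform(-np.pi / 2, np.pi / 2, n)
w = rng.exponential(1.0, n)
stable = (np.sin(alpha * u) / np.cos(u) ** (1 / alpha)
          * (np.cos((1 - alpha) * u) / w) ** ((1 - alpha) / alpha))
gaussian = rng.standard_normal(n)

# The alpha-stable estimate should come out well below the Gaussian one,
# signaling markedly heavier tails.
print("alpha-stable (alpha=1.5):", hill_tail_index(stable))
print("Gaussian:                ", hill_tail_index(gaussian))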
@article{simsekli2019tailindex,
author = {Simsekli, Umut and Sagun, Levent and Gurbuzbalaban, Mert},
keywords = {optimization stable stochastic},
note = {cite arxiv:1901.06053},
title = {A Tail-Index Analysis of Stochastic Gradient Noise in Deep Neural Networks},
url = {http://arxiv.org/abs/1901.06053},
year = 2019
}