Sign-based optimization methods have become popular in machine learning due
to their favorable communication cost in distributed optimization and their
surprisingly good performance in neural network training. Furthermore, they are
closely connected to so-called adaptive gradient methods like Adam. Recent
works on signSGD have used a non-standard "separable smoothness" assumption,
whereas some older works study sign gradient descent as steepest descent with
respect to the $\ell_\infty$-norm. In this work, we unify these existing
results by showing a close connection between separable smoothness and
$\ell_\infty$-smoothness and argue that the latter is the weaker and more
natural assumption. We then proceed to study the smoothness constant with
respect to the $\ell_\infty$-norm and thereby isolate geometric properties of
the objective function which affect the performance of sign-based methods. In
short, we find sign-based methods to be preferable over gradient descent if (i)
the Hessian is to some degree concentrated on its diagonal, and (ii) its
maximal eigenvalue is much larger than the average eigenvalue. Both properties
are common in deep networks.
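
For orientation, here is a minimal sketch (our notation, not taken from the paper) of the two objects the abstract contrasts: the sign gradient descent update, which is the steepest descent direction with respect to the $\ell_\infty$-norm, and one common definition of smoothness with respect to that norm (the dual norm on gradients is $\ell_1$):

\[
  x_{t+1} = x_t - \alpha \,\operatorname{sign}\bigl(\nabla f(x_t)\bigr),
  \qquad \alpha > 0,
\]
\[
  \lVert \nabla f(x) - \nabla f(y) \rVert_1 \;\le\; L_\infty \,\lVert x - y \rVert_\infty
  \quad \text{for all } x, y.
\]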
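As a rough numerical illustration of the abstract's two criteria (a hypothetical sketch of our own, not code from the paper), one can inspect a Hessian for (i) concentration on its diagonal and (ii) a maximal eigenvalue far above the average eigenvalue:

import numpy as np

def sign_gd_diagnostics(H):
    # Heuristic diagnostics (our sketch, not the paper's exact measures) for
    # when sign-based steps may beat gradient descent on a quadratic with
    # symmetric Hessian H.
    eigvals = np.linalg.eigvalsh(H)                          # eigenvalues, ascending
    spread = eigvals.max() / eigvals.mean()                  # (ii) lambda_max / lambda_avg
    diag_mass = np.abs(np.diag(H)).sum() / np.abs(H).sum()   # (i) share of mass on the diagonal
    return diag_mass, spread

# Example: an ill-conditioned, nearly diagonal Hessian; both criteria favor sign-based steps.
rng = np.random.default_rng(0)
d = 100
H = np.diag(np.logspace(0.0, 3.0, d)) + 1e-2 * rng.standard_normal((d, d))
H = (H + H.T) / 2.0                                          # symmetrize
print(sign_gd_diagnostics(H))                                # diag_mass near 1, spread well above 1
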
@article{balles2020geometry,
abstract = {Sign-based optimization methods have become popular in machine learning due
to their favorable communication cost in distributed optimization and their
surprisingly good performance in neural network training. Furthermore, they are
closely connected to so-called adaptive gradient methods like Adam. Recent
works on signSGD have used a non-standard "separable smoothness" assumption,
whereas some older works study sign gradient descent as steepest descent with
respect to the $\ell_\infty$-norm. In this work, we unify these existing
results by showing a close connection between separable smoothness and
$\ell_\infty$-smoothness and argue that the latter is the weaker and more
natural assumption. We then proceed to study the smoothness constant with
respect to the $\ell_\infty$-norm and thereby isolate geometric properties of
the objective function which affect the performance of sign-based methods. In
short, we find sign-based methods to be preferable over gradient descent if (i)
the Hessian is to some degree concentrated on its diagonal, and (ii) its
maximal eigenvalue is much larger than the average eigenvalue. Both properties
are common in deep networks.},
author = {Balles, Lukas and Pedregosa, Fabian and Le Roux, Nicolas},
keywords = {bias geometry optimization readings},
note = {arXiv:2002.08056},
title = {The Geometry of Sign Gradient Descent},
url = {http://arxiv.org/abs/2002.08056},
year = 2020
}