We establish a data-dependent notion of algorithmic stability for Stochastic
Gradient Descent (SGD) and employ it to develop novel generalization bounds.
This is in contrast to previous distribution-free algorithmic stability results
for SGD, which depend on worst-case constants. By virtue of the data-dependent
argument, our bounds provide new insights into learning with SGD on convex and
non-convex problems. In the convex case, we show that the bound on the
generalization error depends on the risk at the initialization point. In the
non-convex case, we prove that the expected curvature of the objective function
around the initialization point has a crucial influence on the generalization
error. In both cases, our results suggest a simple data-driven strategy to
stabilize SGD by pre-screening its initialization. As a corollary, our results
allow us to show optimistic generalization bounds that exhibit fast convergence
rates for SGD, subject to a vanishing empirical risk and low noise of the
stochastic gradient.
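
The pre-screening strategy mentioned at the end of the abstract can be illustrated with a small sketch. The Python snippet below is only one possible reading of that suggestion, not the authors' algorithm: the toy least-squares data, the random candidate set, and the plain SGD loop are all assumptions made for this example. It evaluates the empirical risk at a few candidate initializations and starts SGD from the one with the lowest risk, in the spirit of the convex-case bound that depends on the risk at the initialization point.

# Illustrative sketch only: one reading of "pre-screening the initialization"
# before running SGD. Data, candidates, and the SGD loop are assumptions for
# a toy least-squares problem, not the authors' method.
import numpy as np

rng = np.random.default_rng(0)

# Toy data: linear regression with squared loss.
n, d = 200, 5
X = rng.normal(size=(n, d))
w_true = rng.normal(size=d)
y = X @ w_true + 0.1 * rng.normal(size=n)

def empirical_risk(w):
    """Average squared loss on the training sample."""
    return 0.5 * np.mean((X @ w - y) ** 2)

def sgd(w0, steps=2000, lr=0.01):
    """Plain SGD, one uniformly sampled example per step, starting from w0."""
    w = w0.copy()
    for _ in range(steps):
        i = rng.integers(n)
        grad = (X[i] @ w - y[i]) * X[i]  # gradient of 0.5*(x_i.w - y_i)^2
        w -= lr * grad
    return w

# Pre-screening: draw a few candidate initializations and keep the one with
# the lowest empirical risk (a convex-case proxy; in the non-convex case one
# might instead probe the local curvature around each candidate).
candidates = [rng.normal(scale=s, size=d) for s in (0.1, 0.5, 1.0, 2.0)]
w0 = min(candidates, key=empirical_risk)

w_final = sgd(w0)
print(f"risk at chosen init: {empirical_risk(w0):.4f}")
print(f"risk after SGD:      {empirical_risk(w_final):.4f}")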
Description
Data-Dependent Stability of Stochastic Gradient Descent
@misc{kuzborskij2017datadependent,
author = {Kuzborskij, Ilja and Lampert, Christoph H.},
keywords = {SGD optimization theory to_read},
  note = {arXiv:1703.01678},
title = {Data-Dependent Stability of Stochastic Gradient Descent},
url = {http://arxiv.org/abs/1703.01678},
year = 2017
}