Recent work has revealed that overparameterized networks trained by gradient
descent achieve arbitrarily low training error, and sometimes even low test
error. The required width, however, is always polynomial in at least one of the
sample size $n$, the (inverse) training error $1/\epsilon$, and the (inverse)
failure probability $1/\delta$. This work shows that
$\widetilde{O}(1/\epsilon)$ iterations of gradient descent on two-layer
networks of any width exceeding $\mathrm{polylog}(n,1/\epsilon,1/\delta)$ and
$\widetilde{\Omega}(1/\epsilon^2)$ training examples suffice to achieve a test
error of $\epsilon$. The analysis further relies upon a margin property of the
limiting kernel, which is guaranteed positive, and can distinguish between true
labels and random labels.
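The abstract concerns gradient descent on a shallow (two-layer) ReLU network in the near-initialization regime. As a minimal illustrative sketch of that setting, and not the authors' construction, the snippet below trains only the hidden layer of a width-$m$ ReLU network with fixed random $\pm 1/\sqrt{m}$ output weights under logistic loss; the loss choice, which layer is trained, the $1/\sqrt{m}$ scaling, and all numeric values ($n$, $d$, $m$, step size, iteration budget) are assumptions made for the demo, not taken from the paper's statement.

import numpy as np

rng = np.random.default_rng(0)
n, d, m = 200, 10, 256                            # samples, input dim, hidden width (illustrative)
X = rng.standard_normal((n, d))
X /= np.linalg.norm(X, axis=1, keepdims=True)     # unit-norm inputs
y = np.where(X[:, 0] > 0, 1.0, -1.0)              # a simple +/-1 labeling for the demo

W = rng.standard_normal((m, d))                   # hidden-layer weights (trained)
a = rng.choice([-1.0, 1.0], size=m) / np.sqrt(m)  # fixed random output signs, 1/sqrt(m) scaling

def forward(W):
    return np.maximum(X @ W.T, 0.0) @ a           # ReLU features -> scalar output per sample

eta = 1.0
for _ in range(500):                              # fixed iteration budget for the demo
    out = forward(W)
    p = 1.0 / (1.0 + np.exp(y * out))             # sigmoid(-y * out), logistic-loss residual
    gate = (X @ W.T > 0.0).astype(float)          # ReLU activation pattern, shape (n, m)
    grad = ((-y * p)[:, None] * gate).T @ X * (a[:, None] / n)  # dL/dW for mean logistic loss
    W -= eta * grad

print("training error:", np.mean(np.sign(forward(W)) != y))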
%0 Journal Article
%1 ji2019polylogarithmic
%A Ji, Ziwei
%A Telgarsky, Matus
%D 2019
%K generalization optimization readings theory
%T Polylogarithmic width suffices for gradient descent to achieve
arbitrarily small test error with shallow ReLU networks
%U http://arxiv.org/abs/1909.12292
%X Recent work has revealed that overparameterized networks trained by gradient
descent achieve arbitrarily low training error, and sometimes even low test
error. The required width, however, is always polynomial in at least one of the
sample size $n$, the (inverse) training error $1/\epsilon$, and the (inverse)
failure probability $1/\delta$. This work shows that
$\widetilde{O}(1/\epsilon)$ iterations of gradient descent on two-layer
networks of any width exceeding $\mathrm{polylog}(n,1/\epsilon,1/\delta)$ and
$\widetilde{\Omega}(1/\epsilon^2)$ training examples suffice to achieve a test
error of $\epsilon$. The analysis further relies upon a margin property of the
limiting kernel, which is guaranteed positive, and can distinguish between true
labels and random labels.
@article{ji2019polylogarithmic,
abstract = {Recent work has revealed that overparameterized networks trained by gradient
descent achieve arbitrarily low training error, and sometimes even low test
error. The required width, however, is always polynomial in at least one of the
sample size $n$, the (inverse) training error $1/\epsilon$, and the (inverse)
failure probability $1/\delta$. This work shows that
$\widetilde{O}(1/\epsilon)$ iterations of gradient descent on two-layer
networks of any width exceeding $\mathrm{polylog}(n,1/\epsilon,1/\delta)$ and
$\widetilde{\Omega}(1/\epsilon^2)$ training examples suffice to achieve a test
error of $\epsilon$. The analysis further relies upon a margin property of the
limiting kernel, which is guaranteed positive, and can distinguish between true
labels and random labels.},
added-at = {2019-09-27T13:47:59.000+0200},
author = {Ji, Ziwei and Telgarsky, Matus},
biburl = {https://www.bibsonomy.org/bibtex/20a6dcd0ca0abc95dbdd78b137875981d/kirk86},
description = {[1909.12292] Polylogarithmic width suffices for gradient descent to achieve arbitrarily small test error with shallow ReLU networks},
interhash = {4791d3a5dd09e08d7c2f762625e6a1ad},
intrahash = {0a6dcd0ca0abc95dbdd78b137875981d},
keywords = {generalization optimization readings theory},
note = {cite arxiv:1909.12292},
timestamp = {2019-09-27T13:47:59.000+0200},
title = {Polylogarithmic width suffices for gradient descent to achieve
arbitrarily small test error with shallow ReLU networks},
url = {http://arxiv.org/abs/1909.12292},
year = 2019
}