We study the supervised learning problem under either of the following two
models: (1) Feature vectors $x_i$ are $d$-dimensional Gaussians
and responses are $y_i = f_*(x_i)$ for $f_*$ an unknown quadratic
function; (2) Feature vectors $x_i$ are distributed as a mixture
of two $d$-dimensional centered Gaussians, and $y_i$'s are the corresponding
class labels. We use two-layer neural networks with quadratic activations, and
compare three different learning regimes: the random features (RF) regime in
which we only train the second-layer weights; the neural tangent (NT) regime in
which we train a linearization of the neural network around its initialization;
the fully trained neural network (NN) regime in which we train all the weights
in the network. We prove that, even for the simple quadratic model of point
(1), there is a potentially unbounded gap between the prediction risk achieved
in these three training regimes, when the number of neurons is smaller than the
ambient dimension. When the number of neurons is larger than the number of
dimensions, the problem is significantly easier and both NT and NN learning
achieve zero risk.
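For reference, the objects named in the abstract can be written explicitly; the notation below ($N$ neurons, second-layer weights $a_i$, first-layer weights $w_i$) is ours and may differ from the paper's. The two-layer network with quadratic activation is $f(x; a, W) = \sum_{i=1}^{N} a_i \langle w_i, x \rangle^2$. In the RF regime the first layer is frozen at its random initialization $W^0$ and only $a$ is trained. In the NT regime the network is replaced by its first-order expansion around the initialization $(a^0, W^0)$,
$$ f_{NT}(x; a, W) = f(x; a^0, W^0) + \sum_{i=1}^{N} (a_i - a_i^0)\,\langle w_i^0, x\rangle^2 + \sum_{i=1}^{N} 2\,a_i^0\,\langle w_i^0, x\rangle\,\langle w_i - w_i^0, x\rangle, $$
which is linear in the trainable parameters $(a, W)$; in the NN regime both $a$ and $W$ are trained directly.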
Description
[1906.08899] Limitations of Lazy Training of Two-layers Neural Networks
%0 Journal Article
%1 ghorbani2019limitations
%A Ghorbani, Behrooz
%A Mei, Song
%A Misiakiewicz, Theodor
%A Montanari, Andrea
%D 2019
%K deep-learning generalization readings
%T Limitations of Lazy Training of Two-layers Neural Networks
%U http://arxiv.org/abs/1906.08899
%X We study the supervised learning problem under either of the following two
models: (1) Feature vectors $x_i$ are $d$-dimensional Gaussians
and responses are $y_i = f_*(x_i)$ for $f_*$ an unknown quadratic
function; (2) Feature vectors $x_i$ are distributed as a mixture
of two $d$-dimensional centered Gaussians, and $y_i$'s are the corresponding
class labels. We use two-layer neural networks with quadratic activations, and
compare three different learning regimes: the random features (RF) regime in
which we only train the second-layer weights; the neural tangent (NT) regime in
which we train a linearization of the neural network around its initialization;
the fully trained neural network (NN) regime in which we train all the weights
in the network. We prove that, even for the simple quadratic model of point
(1), there is a potentially unbounded gap between the prediction risk achieved
in these three training regimes, when the number of neurons is smaller than the
ambient dimension. When the number of neurons is larger than the number of
dimensions, the problem is significantly easier and both NT and NN learning
achieve zero risk.
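As a complement to the abstract, the following minimal NumPy sketch illustrates model (1) and contrasts the RF regime (only the second-layer weights are fit) with full training of all weights when $N < d$. It is not the authors' implementation; the quadratic target, network sizes, and optimizer settings are assumptions chosen for readability.

import numpy as np

rng = np.random.default_rng(0)
d, n, N = 20, 2000, 10           # ambient dimension, training samples, neurons (N < d, matching the abstract's regime)

# Model (1): x_i ~ N(0, I_d), y_i = f_*(x_i) with f_* an unknown quadratic function.
B = rng.normal(size=(d, d)) / d                      # hypothetical ground-truth quadratic form
S = (B + B.T) / 2
f_star = lambda X: np.einsum("ni,ij,nj->n", X, S, X)
X, X_test = rng.normal(size=(n, d)), rng.normal(size=(n, d))
y, y_test = f_star(X), f_star(X_test)

# Two-layer network with quadratic activation: f(x) = sum_k a_k * <w_k, x>^2.
def predict(X, W, a):
    return ((X @ W.T) ** 2) @ a

# RF regime: freeze the first layer at its random initialization, solve for a by least squares.
W0 = rng.normal(size=(N, d)) / np.sqrt(d)
a_rf, *_ = np.linalg.lstsq((X @ W0.T) ** 2, y, rcond=None)

# NN regime: train both layers with plain gradient descent on the squared loss
# (learning rate and iteration count are arbitrary illustrative choices).
W, a, lr = W0.copy(), np.zeros(N), 1e-3
for _ in range(5000):
    pre = X @ W.T                                    # pre-activations <w_k, x_i>, shape (n, N)
    resid = (pre ** 2) @ a - y                       # prediction residuals
    a -= lr * (pre ** 2).T @ resid / n               # gradient step on second-layer weights
    W -= lr * 2 * (resid[:, None] * pre * a).T @ X / n   # gradient step on first-layer weights

risk = lambda W_, a_: np.mean((predict(X_test, W_, a_) - y_test) ** 2)
print("test risk -- RF:", risk(W0, a_rf), "  NN:", risk(W, a))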
@article{ghorbani2019limitations,
abstract = {We study the supervised learning problem under either of the following two
models: (1) Feature vectors ${\boldsymbol x}_i$ are $d$-dimensional Gaussians
and responses are $y_i = f_*({\boldsymbol x}_i)$ for $f_*$ an unknown quadratic
function; (2) Feature vectors ${\boldsymbol x}_i$ are distributed as a mixture
of two $d$-dimensional centered Gaussians, and $y_i$'s are the corresponding
class labels. We use two-layer neural networks with quadratic activations, and
compare three different learning regimes: the random features (RF) regime in
which we only train the second-layer weights; the neural tangent (NT) regime in
which we train a linearization of the neural network around its initialization;
the fully trained neural network (NN) regime in which we train all the weights
in the network. We prove that, even for the simple quadratic model of point
(1), there is a potentially unbounded gap between the prediction risk achieved
in these three training regimes, when the number of neurons is smaller than the
ambient dimension. When the number of neurons is larger than the number of
dimensions, the problem is significantly easier and both NT and NN learning
achieve zero risk.},
added-at = {2019-09-26T15:29:43.000+0200},
author = {Ghorbani, Behrooz and Mei, Song and Misiakiewicz, Theodor and Montanari, Andrea},
biburl = {https://www.bibsonomy.org/bibtex/2b6aba55d71425fdd34728e7fbc929e55/kirk86},
description = {[1906.08899] Limitations of Lazy Training of Two-layers Neural Networks},
interhash = {b7c42353dd2314185d6b9e17f2d360c0},
intrahash = {b6aba55d71425fdd34728e7fbc929e55},
keywords = {deep-learning generalization readings},
  note = {cite arxiv:1906.08899. Comment: 39 pages; 2 pdf figures},
timestamp = {2019-09-26T15:29:43.000+0200},
title = {Limitations of Lazy Training of Two-layers Neural Networks},
url = {http://arxiv.org/abs/1906.08899},
year = 2019
}