We study the role of depth in training randomly initialized overparameterized
neural networks. We give the first general result showing that depth improves
trainability of neural networks by improving the conditioning of certain
kernel matrices of the input data. This result holds for arbitrary non-linear
activation functions, and we provide a characterization of the improvement in
conditioning as a function of the degree of non-linearity and the depth of the
network. We provide versions of the result that hold for training just the top
layer of the neural network, as well as for training all layers, via the neural
tangent kernel. As applications of these general results, we provide a
generalization of the results of Das et al. (2019) showing that learnability of
deep random neural networks with arbitrary non-linear activations (under mild
assumptions) degrades exponentially with depth. Additionally, we show how
benign overfitting can occur in deep neural networks via the results of
Bartlett et al. (2019b).
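A minimal, self-contained numerical sketch (not code from the paper) may help make the conditioning claim concrete. The Python snippet below pushes a small dataset through a wide, randomly initialized network and prints the condition number of the empirical kernel (Gram) matrix of the hidden-layer features at each depth. The activation tanh, the normalization constant c, and all sizes (n, d, width, depth) are illustrative assumptions, and the finite-width Gram matrix is only a Monte Carlo proxy for the limiting kernel the paper analyzes.

import numpy as np

rng = np.random.default_rng(0)

# Illustrative choices (not from the paper): tanh as the non-linearity, and
# arbitrary sizes for the data, width, and depth.
sigma = np.tanh
n, d, width, depth = 20, 10, 4096, 16

# Normalize sigma so that E[sigma(g)^2] = 1 for g ~ N(0, 1); this keeps the
# kernel diagonal near 1 at every layer.
g = rng.standard_normal(1_000_000)
c = 1.0 / np.sqrt(np.mean(sigma(g) ** 2))

# n inputs on the unit sphere. With n > d their Gram matrix is rank-deficient,
# i.e. infinitely badly conditioned at depth 0.
X = rng.standard_normal((n, d))
X /= np.linalg.norm(X, axis=1, keepdims=True)

# First layer: H = X W with W_ij ~ N(0, 1), so each hidden unit is a Gaussian
# with covariance <x, x'> across the inputs.
H = X @ rng.standard_normal((d, width))

for layer in range(1, depth + 1):
    F = c * sigma(H)                  # normalized post-activation features
    K = F @ F.T / width               # empirical kernel (Gram) matrix of the data
    off = np.max(np.abs(K - np.diag(np.diag(K))))
    print(f"layer {layer:2d}  cond(K) ~ {np.linalg.cond(K):9.2f}  max |off-diag| ~ {off:.3f}")
    # Next layer: 1/sqrt(width) scaling keeps pre-activation variances near 1.
    # Each non-linear layer shrinks the off-diagonal kernel entries; a more
    # non-linear sigma shrinks them faster.
    H = F @ rng.standard_normal((width, width)) / np.sqrt(width)

Run as-is, the layer-1 kernel should still be poorly conditioned, and the condition number should drift toward 1 as depth grows while the off-diagonal entries shrink toward zero. That shrinkage is, roughly, the flip side exploited in the Das et al. (2019)-style result above: with enough depth the kernel retains very little information about the inputs.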
%0 Generic
%1 agarwal2020conditioning
%A Agarwal, Naman
%A Awasthi, Pranjal
%A Kale, Satyen
%D 2020
%K 2020 arxiv deep-learning
%T A Deep Conditioning Treatment of Neural Networks
%U http://arxiv.org/abs/2002.01523
@misc{agarwal2020conditioning,
author = {Agarwal, Naman and Awasthi, Pranjal and Kale, Satyen},
keywords = {2020 arxiv deep-learning},
note = {cite arxiv:2002.01523},
title = {A Deep Conditioning Treatment of Neural Networks},
url = {http://arxiv.org/abs/2002.01523},
year = 2020
}