Batch normalization (BatchNorm) has become an indispensable tool for training
deep neural networks, yet it is still poorly understood. Although previous work
has typically focused on its normalization component, BatchNorm also adds two
per-feature trainable parameters: a coefficient and a bias. However, the role
and expressive power of these parameters remain unclear. To study this
question, we investigate the performance achieved when training only these
parameters and freezing all others at their random initializations. We find
that doing so leads to surprisingly high performance. For example, a
sufficiently deep ResNet reaches 83% accuracy on CIFAR-10 in this
configuration. Interestingly, BatchNorm achieves this performance in part by
naturally learning to disable around a third of the random features without any
changes to the training objective. Not only do these results highlight the
under-appreciated role of the affine parameters in BatchNorm, but - in a
broader sense - they characterize the expressive power of neural networks
constructed simply by shifting and rescaling random features.
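
As a concrete illustration of the setup described above: in PyTorch terms, the BatchNorm coefficient and bias are the affine parameters stored as .weight (gamma) and .bias (beta) on each normalization layer, and "training only BatchNorm" amounts to freezing every other parameter at its random initialization and optimizing just these. The sketch below shows one way to do this and to count the features whose learned coefficient ends up near zero (the features BatchNorm has effectively disabled). The model choice, hyperparameters, and the near-zero threshold are illustrative assumptions, not the authors' released code.

import torch
import torch.nn as nn
from torchvision.models import resnet18  # stand-in model; the paper uses deeper CIFAR-10 ResNets

model = resnet18(num_classes=10)

# Freeze every parameter at its random initialization...
for p in model.parameters():
    p.requires_grad = False

# ...then re-enable only the BatchNorm affine parameters (gamma = weight, beta = bias).
bn_params = []
for m in model.modules():
    if isinstance(m, nn.BatchNorm2d):
        m.weight.requires_grad = True
        m.bias.requires_grad = True
        bn_params += [m.weight, m.bias]

# Only the BatchNorm parameters are handed to the optimizer.
optimizer = torch.optim.SGD(bn_params, lr=0.1, momentum=0.9)  # illustrative hyperparameters

# ... standard CIFAR-10 training loop using `optimizer` goes here ...

# After training, measure how many random features were effectively disabled,
# i.e. how many per-feature coefficients were driven toward zero.
with torch.no_grad():
    gammas = torch.cat([m.weight.abs().flatten()
                        for m in model.modules() if isinstance(m, nn.BatchNorm2d)])
    frac_disabled = (gammas < 1e-2).float().mean().item()  # threshold of 0.01 is an assumption
print(f"fraction of features with |gamma| < 0.01: {frac_disabled:.2f}")
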
@article{frankle2020training,
added-at = {2020-03-05T23:02:09.000+0100},
author = {Frankle, Jonathan and Schwab, David J. and Morcos, Ari S.},
biburl = {https://www.bibsonomy.org/bibtex/2d0106f4d8bdb5a121701ca6f0f685533/kirk86},
description = {[2003.00152] Training BatchNorm and Only BatchNorm: On the Expressive Power of Random Features in CNNs},
interhash = {2d4c9caede252cd261ad2879a10ef526},
intrahash = {d0106f4d8bdb5a121701ca6f0f685533},
keywords = {feature-selection randomized readings},
note = {cite arxiv:2003.00152},
timestamp = {2020-03-05T23:02:09.000+0100},
title = {Training BatchNorm and Only BatchNorm: On the Expressive Power of Random Features in CNNs},
url = {http://arxiv.org/abs/2003.00152},
year = 2020
}