In 1988, Eric B. Baum showed that two-layers neural networks with threshold
activation function can perfectly memorize the binary labels of $n$ points in
general position in $\mathbb{R}^d$ using only $\ulcorner n/d \urcorner$
neurons. We observe that with ReLU networks, using four times as many neurons
one can fit arbitrary real labels. Moreover, for approximate memorization up to
error $\epsilon$, the neural tangent kernel can also memorize with only
$O\left(\frac{n}{d} \cdot \log(1/\epsilon) \right)$ neurons (assuming that the
data is well dispersed too). We show however that these constructions give rise
to networks where the magnitude of the neurons' weights are far from optimal.
In contrast we propose a new training procedure for ReLU networks, based on
complex (as opposed to real) recombination of the neurons, for which we show
approximate memorization with both $O\left(\frac{n}{d} \cdot
\frac{\log(1/\epsilon)}{\epsilon}\right)$ neurons, as well as nearly-optimal
size of the weights.
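For a rough sense of scale, the short sketch below (plain Python, purely illustrative) plugs hypothetical values of $n$, $d$ and $\epsilon$ into the four neuron counts quoted in the abstract; the $O(\cdot)$ constants are dropped, so the numbers are only orders of magnitude, not the paper's exact bounds.

import math

# Hypothetical problem size (not from the paper): n points in dimension d,
# approximate memorization up to error eps.
n, d, eps = 10_000, 100, 1e-3

baum_threshold = math.ceil(n / d)                   # Baum (1988): exact binary memorization
relu_exact     = 4 * math.ceil(n / d)               # ReLU, arbitrary real labels (four times as many)
ntk_approx     = (n / d) * math.log(1 / eps)        # NTK, error eps (O(.) constant omitted)
complex_recomb = (n / d) * math.log(1 / eps) / eps  # complex recombination, nearly-optimal weights

print(baum_threshold, relu_exact, round(ntk_approx), round(complex_recomb))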
Description
[2006.02855] Network size and weights size for memorization with two-layers neural networks
%0 Journal Article
%1 bubeck2020network
%A Bubeck, Sébastien
%A Eldan, Ronen
%A Lee, Yin Tat
%A Mikulincer, Dan
%D 2020
%K generalization memory readings
%T Network size and weights size for memorization with two-layers neural
networks
%U http://arxiv.org/abs/2006.02855
%X In 1988, Eric B. Baum showed that two-layers neural networks with threshold
activation function can perfectly memorize the binary labels of $n$ points in
general position in $\mathbb{R}^d$ using only $\ulcorner n/d \urcorner$
neurons. We observe that with ReLU networks, using four times as many neurons
one can fit arbitrary real labels. Moreover, for approximate memorization up to
error $\epsilon$, the neural tangent kernel can also memorize with only
$O\left(\frac{n}{d} \cdot \log(1/\epsilon) \right)$ neurons (assuming that the
data is well dispersed too). We show however that these constructions give rise
to networks where the magnitude of the neurons' weights are far from optimal.
In contrast we propose a new training procedure for ReLU networks, based on
complex (as opposed to real) recombination of the neurons, for which we show
approximate memorization with both $O\left(\frac{n}{d} \cdot
\frac{\log(1/\epsilon)}{\epsilon}\right)$ neurons, as well as nearly-optimal
size of the weights.
@article{bubeck2020network,
abstract = {In 1988, Eric B. Baum showed that two-layers neural networks with threshold
activation function can perfectly memorize the binary labels of $n$ points in
general position in $\mathbb{R}^d$ using only $\ulcorner n/d \urcorner$
neurons. We observe that with ReLU networks, using four times as many neurons
one can fit arbitrary real labels. Moreover, for approximate memorization up to
error $\epsilon$, the neural tangent kernel can also memorize with only
$O\left(\frac{n}{d} \cdot \log(1/\epsilon) \right)$ neurons (assuming that the
data is well dispersed too). We show however that these constructions give rise
to networks where the magnitude of the neurons' weights are far from optimal.
In contrast we propose a new training procedure for ReLU networks, based on
complex (as opposed to real) recombination of the neurons, for which we show
approximate memorization with both $O\left(\frac{n}{d} \cdot
\frac{\log(1/\epsilon)}{\epsilon}\right)$ neurons, as well as nearly-optimal
size of the weights.},
added-at = {2020-06-15T20:54:11.000+0200},
author = {Bubeck, Sébastien and Eldan, Ronen and Lee, Yin Tat and Mikulincer, Dan},
biburl = {https://www.bibsonomy.org/bibtex/274523bdd1d0f8779a6458d8eb040a8aa/kirk86},
description = {[2006.02855] Network size and weights size for memorization with two-layers neural networks},
interhash = {b45014e645587a0fb93879f1f096be7c},
intrahash = {74523bdd1d0f8779a6458d8eb040a8aa},
keywords = {generalization memory readings},
note = {cite arxiv:2006.02855; Comment: 27 pages},
timestamp = {2020-06-15T20:54:11.000+0200},
title = {Network size and weights size for memorization with two-layers neural
networks},
url = {http://arxiv.org/abs/2006.02855},
year = 2020
}