Interpolators---estimators that achieve zero training error---have attracted
growing attention in machine learning, mainly because state-of-the-art neural
networks appear to be models of this type. In this paper, we study minimum
$\ell_2$ norm ("ridgeless") interpolation in high-dimensional least squares
regression. We consider two different models for the feature distribution: a
linear model, where the feature vectors $x_i \in \mathbb{R}^p$ are obtained by
applying a linear transform to a vector of i.i.d. entries, $x_i = \Sigma^{1/2}
z_i$ (with $z_i \in \mathbb{R}^p$); and a nonlinear model, where the feature
vectors are obtained by passing the input through a random one-layer neural
network, $x_i = \varphi(W z_i)$ (with $z_i \in \mathbb{R}^d$, $W \in
\mathbb{R}^{p \times d}$ a matrix of i.i.d. entries, and $\varphi$ an
activation function acting componentwise on $W z_i$). We recover---in a precise
quantitative way---several phenomena that have been observed in large-scale
neural networks and kernel machines, including the "double descent" behavior of
the prediction risk, and the potential benefits of overparametrization.
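
As a concrete illustration of the setup described in the abstract, here is a minimal NumPy sketch (not code from the paper; the covariance $\Sigma$, activation $\varphi = \tanh$, dimensions, and noise level are illustrative assumptions) that builds both feature models and computes the minimum-$\ell_2$-norm ("ridgeless") interpolator via the Moore-Penrose pseudoinverse, $\hat\beta = X^+ y$:

import numpy as np

rng = np.random.default_rng(0)
n, p, d = 200, 400, 50  # samples, features, input dim (illustrative; p > n, overparametrized)

# Linear model: x_i = Sigma^{1/2} z_i, with z_i in R^p having i.i.d. N(0,1) entries.
Sigma_sqrt = np.diag(np.sqrt(np.linspace(0.5, 2.0, p)))  # Sigma^{1/2} for an assumed diagonal Sigma
Z = rng.standard_normal((n, p))
X_lin = Z @ Sigma_sqrt

# Nonlinear model: x_i = phi(W z_i), with z_i in R^d, W in R^{p x d} i.i.d., phi componentwise.
W = rng.standard_normal((p, d)) / np.sqrt(d)  # 1/sqrt(d) scaling is an assumed normalization
Z_in = rng.standard_normal((n, d))
X_rf = np.tanh(Z_in @ W.T)  # phi = tanh, chosen for illustration

# Ridgeless least squares: pinv(X) @ y is the least squares solution of minimum
# l2 norm; when p > n and X has full row rank, it interpolates the training data.
beta_star = rng.standard_normal(p) / np.sqrt(p)  # an arbitrary ground-truth signal
y = X_lin @ beta_star + 0.1 * rng.standard_normal(n)
beta_hat = np.linalg.pinv(X_lin) @ y
print("training MSE (linear features):", np.mean((X_lin @ beta_hat - y) ** 2))  # ~0: interpolation

beta_hat_rf = np.linalg.pinv(X_rf) @ y  # same ridgeless fit, random features
print("training MSE (random features):", np.mean((X_rf @ beta_hat_rf - y) ** 2))  # ~0 as well

Sweeping p from below n to above n while measuring test risk on fresh draws traces out the double-descent curve that the paper characterizes exactly in the proportional limit $p/n \to \gamma$.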
@article{hastie2019surprises,
abstract = {Interpolators---estimators that achieve zero training error---have attracted
growing attention in machine learning, mainly because state-of-the-art neural
networks appear to be models of this type. In this paper, we study minimum
$\ell_2$ norm ("ridgeless") interpolation in high-dimensional least squares
regression. We consider two different models for the feature distribution: a
linear model, where the feature vectors $x_i \in \mathbb{R}^p$ are obtained by
applying a linear transform to a vector of i.i.d. entries, $x_i = \Sigma^{1/2}
z_i$ (with $z_i \in \mathbb{R}^p$); and a nonlinear model, where the feature
vectors are obtained by passing the input through a random one-layer neural
network, $x_i = \varphi(W z_i)$ (with $z_i \in \mathbb{R}^d$, $W \in
\mathbb{R}^{p \times d}$ a matrix of i.i.d. entries, and $\varphi$ an
activation function acting componentwise on $W z_i$). We recover---in a precise
quantitative way---several phenomena that have been observed in large-scale
neural networks and kernel machines, including the "double descent" behavior of
the prediction risk, and the potential benefits of overparametrization.},
added-at = {2020-02-20T18:42:27.000+0100},
author = {Hastie, Trevor and Montanari, Andrea and Rosset, Saharon and Tibshirani, Ryan J.},
biburl = {https://www.bibsonomy.org/bibtex/21e5b05e16409fb226fbf96489a4f3eb9/kirk86},
description = {[1903.08560] Surprises in High-Dimensional Ridgeless Least Squares Interpolation},
interhash = {7176dc70c7640df9b6a35f5b0565508d},
intrahash = {1e5b05e16409fb226fbf96489a4f3eb9},
keywords = {generalization interpolation readings},
note = {arXiv:1903.08560. Comment: 53 pages; 13 figures},
timestamp = {2020-02-22T03:11:38.000+0100},
title = {Surprises in High-Dimensional Ridgeless Least Squares Interpolation},
url = {http://arxiv.org/abs/1903.08560},
year = 2019
}