This paper relates parameter distance to gradient breakdown for a broad class
of nonlinear compositional functions. The analysis leads to a new distance
function called deep relative trust and a descent lemma for neural networks.
Since the resulting learning rule seems not to require learning rate grid
search, it may unlock a simpler workflow for training deeper and more complex
neural networks. Please find the Python code used in this paper here:
https://github.com/jxbz/fromage.
Description
[2002.03432] On the distance between two neural networks and the stability of learning
%0 Journal Article
%1 bernstein2020distance
%A Bernstein, Jeremy
%A Vahdat, Arash
%A Yue, Yisong
%A Liu, Ming-Yu
%D 2020
%K generalization noise readings stable
%T On the distance between two neural networks and the stability of
learning
%U http://arxiv.org/abs/2002.03432
%X This paper relates parameter distance to gradient breakdown for a broad class
of nonlinear compositional functions. The analysis leads to a new distance
function called deep relative trust and a descent lemma for neural networks.
Since the resulting learning rule seems not to require learning rate grid
search, it may unlock a simpler workflow for training deeper and more complex
neural networks. Please find the Python code used in this paper here:
https://github.com/jxbz/fromage.
@article{bernstein2020distance,
  abstract      = {This paper relates parameter distance to gradient breakdown for a broad class
                   of nonlinear compositional functions. The analysis leads to a new distance
                   function called deep relative trust and a descent lemma for neural networks.
                   Since the resulting learning rule seems not to require learning rate grid
                   search, it may unlock a simpler workflow for training deeper and more complex
                   neural networks. Please find the Python code used in this paper here:
                   https://github.com/jxbz/fromage.},
  added-at      = {2020-06-16T15:29:23.000+0200},
  archiveprefix = {arXiv},
  author        = {Bernstein, Jeremy and Vahdat, Arash and Yue, Yisong and Liu, Ming-Yu},
  biburl        = {https://www.bibsonomy.org/bibtex/2667d4b1e96e962b42f10b23d2192d38f/kirk86},
  description   = {[2002.03432] On the distance between two neural networks and the stability of learning},
  eprint        = {2002.03432},
  interhash     = {fc4f780396e903b23a0791f2b00796dc},
  intrahash     = {667d4b1e96e962b42f10b23d2192d38f},
  keywords      = {generalization noise readings stable},
  note          = {cite arxiv:2002.03432},
  timestamp     = {2020-06-16T15:29:23.000+0200},
  title         = {On the Distance Between Two Neural Networks and the Stability of Learning},
  url           = {http://arxiv.org/abs/2002.03432},
  year          = {2020},
}