Deep neural networks have become invaluable tools for supervised machine
learning, e.g., classification of text or images. While often offering superior
results over traditional techniques and successfully expressing complicated
patterns in data, deep architectures are known to be challenging to design and
train such that they generalize well to new data. Important issues with deep
architectures are numerical instabilities in derivative-based learning
algorithms, commonly referred to as exploding or vanishing gradients. In this
paper we propose new forward propagation techniques inspired by systems of
Ordinary Differential Equations (ODEs) that overcome this challenge and lead to
well-posed learning problems for arbitrarily deep networks.
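To make the ODE analogy concrete, here is the standard residual reading of forward propagation, written in generic notation rather than quoted from the paper: a residual layer
\[
Y_{j+1} = Y_j + h\,\sigma(K_j Y_j + b_j), \qquad j = 0, \dots, N-1,
\]
can be seen as one forward Euler step of size $h$ for the continuous-time system
\[
\dot{Y}(t) = \sigma\bigl(K(t)\,Y(t) + b(t)\bigr), \qquad Y(0) = Y_0,
\]
so the depth of the network plays the role of the time horizon, and the learned weights $K(t)$ and biases $b(t)$ define the dynamics whose stability can then be analyzed.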
The backbone of our approach is our interpretation of deep learning as a
parameter estimation problem of nonlinear dynamical systems. Given this
formulation, we analyze stability and well-posedness of deep learning and use
this new understanding to develop new network architectures. We relate the
exploding and vanishing gradient phenomena to the stability of the discrete
ODE and present several strategies for stabilizing learning in very deep
networks. While our new architectures restrict the solution space, several
numerical experiments show their competitiveness with state-of-the-art
networks.
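As a rough illustration of one such stabilization strategy, the following minimal NumPy sketch (hypothetical code, not taken from the paper; the tanh activation, step size h, and damping gamma are illustrative choices) propagates features through layers whose matrices are projected onto their antisymmetric part, which keeps the eigenvalues of the underlying dynamics near the imaginary axis and the forward map well behaved:

import numpy as np

def stable_forward(Y, Ks, bs, h=0.1, gamma=0.01):
    """Forward Euler propagation with antisymmetric, mildly damped layer matrices.

    Y     : (n_features, n_examples) input features
    Ks    : list of (n_features, n_features) raw weight matrices
    bs    : list of (n_features, 1) bias vectors
    h     : step size of the discretized ODE
    gamma : small damping that shifts eigenvalues into the stable half-plane
    """
    n = Y.shape[0]
    for K, b in zip(Ks, bs):
        A = 0.5 * (K - K.T) - gamma * np.eye(n)  # antisymmetric part plus damping
        Y = Y + h * np.tanh(A @ Y + b)           # one forward Euler step
    return Y

# Toy usage with random weights and 100 layers.
rng = np.random.default_rng(0)
n, depth = 4, 100
Ks = [rng.standard_normal((n, n)) for _ in range(depth)]
bs = [rng.standard_normal((n, 1)) for _ in range(depth)]
print(stable_forward(rng.standard_normal((n, 8)), Ks, bs).shape)  # (4, 8)

Because an antisymmetric matrix has purely imaginary eigenvalues, the small shift by -gamma*I places them just inside the stable half-plane, so signals neither blow up nor die out too quickly as the depth grows.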
@misc{haber2017stable,
added-at = {2021-06-02T11:44:57.000+0200},
author = {Haber, Eldad and Ruthotto, Lars},
biburl = {https://www.bibsonomy.org/bibtex/2167b40acb9a0177ffffab8eb4e8ca26b/adulny},
description = {[1705.03341] Stable Architectures for Deep Neural Networks},
doi = {10.1088/1361-6420/aa9a90},
interhash = {3e84d6e4b3b29838939570371b729937},
intrahash = {167b40acb9a0177ffffab8eb4e8ca26b},
keywords = {architecture deep-learning ode stability},
note = {cite arxiv:1705.03341. Comment: 23 pages, 7 figures},
timestamp = {2021-06-02T11:44:57.000+0200},
title = {Stable Architectures for Deep Neural Networks},
url = {http://arxiv.org/abs/1705.03341},
year = 2017
}