Deep ensembles have been empirically shown to be a promising approach for
improving accuracy, uncertainty and out-of-distribution robustness of deep
learning models. While deep ensembles were theoretically motivated by the
bootstrap, non-bootstrap ensembles trained with just random initialization also
perform well in practice, which suggests that there could be other explanations
for why deep ensembles work well. Bayesian neural networks, which learn
distributions over the parameters of the network, are theoretically
well-motivated by Bayesian principles, but do not perform as well as deep
ensembles in practice, particularly under dataset shift. One possible
explanation for this gap between theory and practice is that popular scalable
approximate Bayesian methods tend to focus on a single mode, whereas deep
ensembles tend to explore diverse modes in function space. We investigate this
hypothesis by building on recent work on understanding the loss landscape of
neural networks and adding our own exploration to measure the similarity of
functions in the space of predictions. Our results show that random
initializations explore entirely different modes, while functions along an
optimization trajectory or sampled from the subspace thereof cluster within a
single mode predictions-wise, while often deviating significantly in the weight
space. We demonstrate that while low-loss connectors between modes exist, they
are not connected in the space of predictions. Developing the concept of the
diversity–accuracy plane, we show that the decorrelation power of random
initializations is unmatched by popular subspace sampling methods.
Description
[1912.02757] Deep Ensembles: A Loss Landscape Perspective
%0 Journal Article
%1 fort2019ensembles
%A Fort, Stanislav
%A Hu, Huiyi
%A Lakshminarayanan, Balaji
%D 2019
%K generalization optimization readings uncertainty
%T Deep Ensembles: A Loss Landscape Perspective
%U http://arxiv.org/abs/1912.02757
%X Deep ensembles have been empirically shown to be a promising approach for
improving accuracy, uncertainty and out-of-distribution robustness of deep
learning models. While deep ensembles were theoretically motivated by the
bootstrap, non-bootstrap ensembles trained with just random initialization also
perform well in practice, which suggests that there could be other explanations
for why deep ensembles work well. Bayesian neural networks, which learn
distributions over the parameters of the network, are theoretically
well-motivated by Bayesian principles, but do not perform as well as deep
ensembles in practice, particularly under dataset shift. One possible
explanation for this gap between theory and practice is that popular scalable
approximate Bayesian methods tend to focus on a single mode, whereas deep
ensembles tend to explore diverse modes in function space. We investigate this
hypothesis by building on recent work on understanding the loss landscape of
neural networks and adding our own exploration to measure the similarity of
functions in the space of predictions. Our results show that random
initializations explore entirely different modes, while functions along an
optimization trajectory or sampled from the subspace thereof cluster within a
single mode predictions-wise, while often deviating significantly in the weight
space. We demonstrate that while low-loss connectors between modes exist, they
are not connected in the space of predictions. Developing the concept of the
diversity–accuracy plane, we show that the decorrelation power of random
initializations is unmatched by popular subspace sampling methods.
@misc{fort2019ensembles,
  abstract      = {Deep ensembles have been empirically shown to be a promising approach for
                   improving accuracy, uncertainty and out-of-distribution robustness of deep
                   learning models. While deep ensembles were theoretically motivated by the
                   bootstrap, non-bootstrap ensembles trained with just random initialization also
                   perform well in practice, which suggests that there could be other explanations
                   for why deep ensembles work well. Bayesian neural networks, which learn
                   distributions over the parameters of the network, are theoretically
                   well-motivated by Bayesian principles, but do not perform as well as deep
                   ensembles in practice, particularly under dataset shift. One possible
                   explanation for this gap between theory and practice is that popular scalable
                   approximate Bayesian methods tend to focus on a single mode, whereas deep
                   ensembles tend to explore diverse modes in function space. We investigate this
                   hypothesis by building on recent work on understanding the loss landscape of
                   neural networks and adding our own exploration to measure the similarity of
                   functions in the space of predictions. Our results show that random
                   initializations explore entirely different modes, while functions along an
                   optimization trajectory or sampled from the subspace thereof cluster within a
                   single mode predictions-wise, while often deviating significantly in the weight
                   space. We demonstrate that while low-loss connectors between modes exist, they
                   are not connected in the space of predictions. Developing the concept of the
                   diversity--accuracy plane, we show that the decorrelation power of random
                   initializations is unmatched by popular subspace sampling methods.},
  added-at      = {2019-12-22T19:57:51.000+0100},
  archiveprefix = {arXiv},
  author        = {Fort, Stanislav and Hu, Huiyi and Lakshminarayanan, Balaji},
  biburl        = {https://www.bibsonomy.org/bibtex/2757fc90283907c8d515f85e54fd0c08e/kirk86},
  description   = {[1912.02757] Deep Ensembles: A Loss Landscape Perspective},
  eprint        = {1912.02757},
  interhash     = {3ed892828e122ebdc1e15ea630f9434e},
  intrahash     = {757fc90283907c8d515f85e54fd0c08e},
  keywords      = {generalization optimization readings uncertainty},
  note          = {cite arxiv:1912.02757},
  timestamp     = {2019-12-22T19:57:51.000+0100},
  title         = {{Deep Ensembles}: A Loss Landscape Perspective},
  url           = {http://arxiv.org/abs/1912.02757},
  year          = 2019
}