Several variants of the Long Short-Term Memory (LSTM) architecture for
recurrent neural networks have been proposed since its inception in 1995. In
recent years, these networks have become the state-of-the-art models for a
variety of machine learning problems. This has led to a renewed interest in
understanding the role and utility of various computational components of
typical LSTM variants. In this paper, we present the first large-scale analysis
of eight LSTM variants on three representative tasks: speech recognition,
handwriting recognition, and polyphonic music modeling. The hyperparameters of
all LSTM variants for each task were optimized separately using random search,
and their importance was assessed using the powerful fANOVA framework. In
total, we summarize the results of 5400 experimental runs ($\approx 15$ years
of CPU time), which makes our study the largest of its kind on LSTM networks.
Our results show that none of the variants can improve upon the standard LSTM
architecture significantly, and demonstrate the forget gate and the output
activation function to be its most critical components. We further observe that
the studied hyperparameters are virtually independent and derive guidelines for
their efficient adjustment.
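
As a minimal sketch of the random-search procedure described above, the Python snippet below samples hyperparameters at random, trains one model per sample, and keeps the best configuration. The hyperparameter names, ranges, and the train_and_evaluate(config) helper are illustrative assumptions, not the exact setup used in the paper.

import random

# Illustrative sketch only: draw random hyperparameter configurations, evaluate
# each with a user-supplied train_and_evaluate(config) helper (assumed, not from
# the paper), and return the best configuration found.

def sample_config():
    return {
        "hidden_size": random.choice([50, 100, 200, 400]),   # units per layer
        "learning_rate": 10 ** random.uniform(-6, -2),       # log-uniform sample
        "momentum": random.uniform(0.0, 0.99),
        "input_noise_std": random.uniform(0.0, 1.0),         # Gaussian input noise
    }

def random_search(train_and_evaluate, n_trials=200):
    best_config, best_score = None, float("inf")
    for _ in range(n_trials):
        config = sample_config()
        score = train_and_evaluate(config)  # e.g., validation error of one run
        if score < best_score:
            best_config, best_score = config, score
    return best_config, best_score

Because every trial is sampled independently, the resulting runs can be fed to a variance-decomposition analysis such as fANOVA, which estimates how much of the performance variance each hyperparameter explains.
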
@misc{greff2015search,
author = {Greff, Klaus and Srivastava, Rupesh Kumar and Koutník, Jan and Steunebrink, Bas R. and Schmidhuber, Jürgen},
doi = {10.1109/TNNLS.2016.2582924},
keywords = {2015 deep-learning lstm paper rnn},
note = {cite arxiv:1503.04069. Comment: 12 pages, 6 figures},
title = {LSTM: A Search Space Odyssey},
url = {http://arxiv.org/abs/1503.04069},
year = 2015
}