A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \log N)$, in contrast to the $O(N \log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.
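For readers who have not seen it, below is a minimal Python sketch of the planning step behind the classic Rmax idea of Brafman and Tennenholtz (optimism in the face of uncertainty), not the modified variant analyzed in the paper; the function name rmax_plan, the known-ness threshold m, and the array layout are illustrative assumptions.

import numpy as np

def rmax_plan(counts, trans, rew, m, gamma, rmax, n_iter=200):
    # Value iteration on the "optimistic" empirical model.
    #   counts[s, a]    -- visit count of each state-action pair
    #   trans[s, a, s2] -- empirical transition counts
    #   rew[s, a]       -- accumulated empirical reward
    # Pairs with counts < m are treated as unknown: they are routed to
    # an extra absorbing state that pays rmax forever, which makes the
    # greedy policy seek them out (this is what drives exploration).
    n_states, n_actions = counts.shape
    Q = np.zeros((n_states + 1, n_actions))  # last row: absorbing state
    for _ in range(n_iter):
        V = Q.max(axis=1)
        for s in range(n_states):
            for a in range(n_actions):
                if counts[s, a] >= m:  # known pair: use empirical model
                    p = trans[s, a] / counts[s, a]
                    r = rew[s, a] / counts[s, a]
                    Q[s, a] = r + gamma * p @ V[:n_states]
                else:                  # unknown pair: optimistic value
                    Q[s, a] = rmax + gamma * V[n_states]
        Q[n_states, :] = rmax + gamma * V[n_states]
    return Q[:n_states]

An agent built on this sketch acts greedily with respect to the returned Q values, updates counts, trans and rew after every step, and replans whenever some pair's count reaches m; the paper's contribution is a tighter bound on how many steps such a (modified) agent can spend acting sub-optimally.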
%0 Conference Paper
%1 szita2010
%A Szita, I.
%A Szepesvári, Cs.
%B ICML
%D 2010
%E Fürnkranz, J.
%E Joachims, T.
%I Omnipress
%K PAC-learning, algorithms, exploration vs. exploitation, reinforcement learning, sequential, theory
%P 1031--1038
%T Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds
%X A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \log N)$, in contrast to the $O(N \log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.
@inproceedings{szita2010,
abstract = {A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \log N)$, in contrast to the $O(N \log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Szita, I. and Szepesv{\'a}ri, {Cs}.},
biburl = {https://www.bibsonomy.org/bibtex/2a6b0dbe83dea9c80cf2171c6ed8829c0/csaba},
booktitle = {ICML},
crossref = {ICML2010},
date-added = {2010-08-28 17:38:14 -0600},
date-modified = {2010-11-25 00:50:11 -0700},
editor = {F{\"u}rnkranz, J. and Joachims, T.},
ee = {http://www.icml2010.org/papers/546.pdf},
interhash = {fa745b8f188b30cd83733a77b6f7cf6e},
intrahash = {a6b0dbe83dea9c80cf2171c6ed8829c0},
keywords = {PAC-learning, algorithms, exploration vs. exploitation, reinforcement learning, sequential, theory},
month = {June},
pages = {1031--1038},
pdf = {papers/ICML10_rmax_improved.pdf},
publisher = {Omnipress},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds},
year = 2010
}