We consider continuous state, continuous action batch reinforcement learning where the goal is to learn a good policy from a sufficiently rich trajectory generated by some policy. We study a variant of fitted Q-iteration, where the greedy action selection is replaced by searching for a policy in a restricted set of candidate policies by maximizing the average action values. We provide a rigorous analysis of this algorithm, proving what we believe is the first finite-time bound for value-function based algorithms for continuous state and action problems.
Note: In retrospect, it would have been better to call this algorithm an actor-critic algorithm. The algorithm we consider updates both a policy and a value function (an action-value function in this case).
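For concreteness, here is a minimal runnable sketch of the algorithm the abstract describes: fitted Q-iteration in which the usual greedy action selection is replaced by a search over a restricted set of candidate policies, scored by the average action value over the batch states. Everything concrete below is an assumption for illustration only: the toy 1-D MDP, the class of linear deterministic policies, and scikit-learn's ExtraTreesRegressor as the action-value approximator are not from the paper.

import numpy as np
from sklearn.ensemble import ExtraTreesRegressor

rng = np.random.default_rng(0)

# Toy 1-D MDP (illustrative, not from the paper): state and action in [-1, 1],
# reward penalizes the squared distance of the next state from 0.
def step(s, a):
    s_next = np.clip(s + 0.1 * a + 0.01 * rng.standard_normal(s.shape), -1.0, 1.0)
    return s_next, -s_next ** 2

# Batch of transitions generated by a random behaviour policy.
n = 2000
S = rng.uniform(-1.0, 1.0, n)
A = rng.uniform(-1.0, 1.0, n)
S2, R = step(S, A)

# Restricted policy class: deterministic linear policies pi_k(s) = clip(k * s).
gains = np.linspace(-3.0, 3.0, 25)
candidates = [lambda s, k=k: np.clip(k * s, -1.0, 1.0) for k in gains]

gamma = 0.95
q, policy = None, candidates[0]
for _ in range(20):
    # Fitted Q-iteration step: regress on targets r + gamma * Q(s', pi(s'))
    # (Q is taken to be 0 at the first iteration).
    y = R if q is None else R + gamma * q.predict(np.column_stack([S2, policy(S2)]))
    q = ExtraTreesRegressor(n_estimators=50, random_state=0).fit(np.column_stack([S, A]), y)
    # Policy update: instead of greedy action selection, pick the candidate
    # policy that maximizes the average action value over the batch states.
    scores = [q.predict(np.column_stack([S, pi(S)])).mean() for pi in candidates]
    policy = candidates[int(np.argmax(scores))]

print("selected policy gain:", gains[int(np.argmax(scores))])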
@inproceedings{antos2007,
abstract = {We consider continuous state, continuous action batch reinforcement learning where the goal is to learn a good policy from a sufficiently rich trajectory generated by some policy. We study a variant of fitted Q-iteration, where the greedy action selection is replaced by searching for a policy in a restricted set of candidate policies by maximizing the average action values. We provide a rigorous analysis of this algorithm, proving what we believe is the first finite-time bound for value-function based algorithms for continuous state and action problems.
Note: In retrospect, it would have been better to call this algorithm an actor-critic algorithm. The algorithm we consider updates both a policy and a value function (an action-value function in this case).},
author = {Antos, A. and Munos, R. and Szepesv{\'a}ri, {Cs}.},
booktitle = {NIPS},
crossref = {NIPS20},
keywords = {actor-critic, approximation, batch, bounds, function, learning, methods, nonparametrics, performance, reinforcement},
pages = {9--16},
pdf = {papers/rlca.pdf},
title = {Fitted {Q}-iteration in Continuous Action-space {MDP}s},
year = 2007
}