We establish a new connection between value and policy based reinforcement
learning (RL) based on a relationship between softmax temporal value
consistency and policy optimality under entropy regularization. Specifically,
we show that softmax consistent action values correspond to optimal entropy
regularized policy probabilities along any action sequence, regardless of
provenance. From this observation, we develop a new RL algorithm, Path
Consistency Learning (PCL), that minimizes a notion of soft consistency error
along multi-step action sequences extracted from both on- and off-policy
traces. We examine the behavior of PCL in different scenarios and show that PCL
can be interpreted as generalizing both actor-critic and Q-learning algorithms.
We subsequently deepen the relationship by showing how a single model can be
used to represent both a policy and the corresponding softmax state values,
eliminating the need for a separate critic. The experimental evaluation
demonstrates that PCL significantly outperforms strong actor-critic and
Q-learning baselines across several benchmarks.
Description
[1702.08892v3] Bridging the Gap Between Value and Policy Based Reinforcement Learning
%0 Generic
%1 nachum2017bridging
%A Nachum, Ofir
%A Norouzi, Mohammad
%A Xu, Kelvin
%A Schuurmans, Dale
%D 2017
%K 2017 arxiv reinforcement-learning
%T Bridging the Gap Between Value and Policy Based Reinforcement Learning
%U http://arxiv.org/abs/1702.08892
%X We establish a new connection between value and policy based reinforcement
learning (RL) based on a relationship between softmax temporal value
consistency and policy optimality under entropy regularization. Specifically,
we show that softmax consistent action values correspond to optimal entropy
regularized policy probabilities along any action sequence, regardless of
provenance. From this observation, we develop a new RL algorithm, Path
Consistency Learning (PCL), that minimizes a notion of soft consistency error
along multi-step action sequences extracted from both on- and off-policy
traces. We examine the behavior of PCL in different scenarios and show that PCL
can be interpreted as generalizing both actor-critic and Q-learning algorithms.
We subsequently deepen the relationship by showing how a single model can be
used to represent both a policy and the corresponding softmax state values,
eliminating the need for a separate critic. The experimental evaluation
demonstrates that PCL significantly outperforms strong actor-critic and
Q-learning baselines across several benchmarks.
Entry type upgraded from @misc: per the record's own note this paper appeared at
NIPS 2017, so @inproceedings with a booktitle is the correct classification.
The arXiv identifier is moved from the garbled note field into the standard
eprint/archiveprefix fields so styles can render it properly.
@inproceedings{nachum2017bridging,
  abstract      = {We establish a new connection between value and policy based reinforcement
learning (RL) based on a relationship between softmax temporal value
consistency and policy optimality under entropy regularization. Specifically,
we show that softmax consistent action values correspond to optimal entropy
regularized policy probabilities along any action sequence, regardless of
provenance. From this observation, we develop a new RL algorithm, Path
Consistency Learning (PCL), that minimizes a notion of soft consistency error
along multi-step action sequences extracted from both on- and off-policy
traces. We examine the behavior of PCL in different scenarios and show that PCL
can be interpreted as generalizing both actor-critic and Q-learning algorithms.
We subsequently deepen the relationship by showing how a single model can be
used to represent both a policy and the corresponding softmax state values,
eliminating the need for a separate critic. The experimental evaluation
demonstrates that PCL significantly outperforms strong actor-critic and
Q-learning baselines across several benchmarks.},
  added-at      = {2017-11-28T19:28:27.000+0100},
  author        = {Nachum, Ofir and Norouzi, Mohammad and Xu, Kelvin and Schuurmans, Dale},
  biburl        = {https://www.bibsonomy.org/bibtex/2dc4c433e421378f05f72b8014a363abc/achakraborty},
  booktitle     = {Advances in Neural Information Processing Systems},
  description   = {[1702.08892v3] Bridging the Gap Between Value and Policy Based Reinforcement Learning},
  eprint        = {1702.08892},
  archiveprefix = {arXiv},
  interhash     = {eef6ec39028ad0c79200c4f7b2e1e36c},
  intrahash     = {dc4c433e421378f05f72b8014a363abc},
  keywords      = {2017 arxiv reinforcement-learning},
  note          = {arXiv:1702.08892},
  timestamp     = {2017-11-29T17:54:04.000+0100},
  title         = {Bridging the Gap Between Value and Policy Based Reinforcement Learning},
  url           = {http://arxiv.org/abs/1702.08892},
  year          = {2017},
}