We address online combinatorial optimization when the player has a prior over
the adversary's sequence of losses. In this framework, Russo and Van Roy
proposed an information-theoretic analysis of Thompson Sampling based on the
information ratio, resulting in optimal worst-case regret bounds. In this
paper we introduce three novel ideas to this line of work. First we propose a
new quantity, the scale-sensitive information ratio, which allows us to obtain
more refined first-order regret bounds (i.e., bounds of the form $\sqrt{L^*}$
where $L^*$ is the loss of the best combinatorial action). Second we replace
the entropy over combinatorial actions by a coordinate entropy, which allows us
to obtain the first optimal worst-case bound for Thompson Sampling in the
combinatorial setting. Finally, we introduce a novel link between Bayesian
agents and frequentist confidence intervals. Combining these ideas we show that
the classical multi-armed bandit first-order regret bound $\tilde{O}(\sqrt{d
L^*})$ still holds true in the more challenging and more general semi-bandit
scenario. This latter result improves the previous state of the art bound
$\tilde{O}(\sqrt{(d+m^3)L^*})$ by Lykouris, Sridharan and Tardos.
Description
[1902.00681] First-Order Regret Analysis of Thompson Sampling
%0 Conference Paper
%1 bubeck2019firstorder
%A Bubeck, Sébastien
%A Sellke, Mark
%D 2019
%K bayesian bounds combinatorics online-learning optimization readings sampling
%T First-Order Regret Analysis of Thompson Sampling
%U http://arxiv.org/abs/1902.00681
%X We address online combinatorial optimization when the player has a prior over
the adversary's sequence of losses. In this framework, Russo and Van Roy
proposed an information-theoretic analysis of Thompson Sampling based on the
information ratio, resulting in optimal worst-case regret bounds. In this
paper we introduce three novel ideas to this line of work. First we propose a
new quantity, the scale-sensitive information ratio, which allows us to obtain
more refined first-order regret bounds (i.e., bounds of the form $\sqrt{L^*}$
where $L^*$ is the loss of the best combinatorial action). Second we replace
the entropy over combinatorial actions by a coordinate entropy, which allows us
to obtain the first optimal worst-case bound for Thompson Sampling in the
combinatorial setting. Finally, we introduce a novel link between Bayesian
agents and frequentist confidence intervals. Combining these ideas we show that
the classical multi-armed bandit first-order regret bound $\tilde{O}(\sqrt{d
L^*})$ still holds true in the more challenging and more general semi-bandit
scenario. This latter result improves the previous state of the art bound
$\tilde{O}(\sqrt{(d+m^3)L^*})$ by Lykouris, Sridharan and Tardos.
NOTE(review): was @inproceedings with no booktitle (a required field); this is an
arXiv preprint, so @misc with eprint/archiveprefix/primaryclass is the correct
type. Garbled note field ("cite arxiv:...Comment: 27 pages") repaired; "Thompson"
braced in the title so sentence-casing styles keep its capital; the accented
author name uses a brace-group special character for classic-BibTeX sorting.
BibSonomy bookkeeping fields (biburl, interhash, intrahash, added-at, timestamp)
are preserved; unknown field names are ignored by BibTeX and are harmless.
@misc{bubeck2019firstorder,
  abstract      = {We address online combinatorial optimization when the player has a prior over
the adversary's sequence of losses. In this framework, Russo and Van Roy
proposed an information-theoretic analysis of Thompson Sampling based on the
{\em information ratio}, resulting in optimal worst-case regret bounds. In this
paper we introduce three novel ideas to this line of work. First we propose a
new quantity, the scale-sensitive information ratio, which allows us to obtain
more refined first-order regret bounds (i.e., bounds of the form $\sqrt{L^*}$
where $L^*$ is the loss of the best combinatorial action). Second we replace
the entropy over combinatorial actions by a coordinate entropy, which allows us
to obtain the first optimal worst-case bound for Thompson Sampling in the
combinatorial setting. Finally, we introduce a novel link between Bayesian
agents and frequentist confidence intervals. Combining these ideas we show that
the classical multi-armed bandit first-order regret bound $\tilde{O}(\sqrt{d
L^*})$ still holds true in the more challenging and more general semi-bandit
scenario. This latter result improves the previous state of the art bound
$\tilde{O}(\sqrt{(d+m^3)L^*})$ by Lykouris, Sridharan and Tardos.},
  added-at      = {2019-11-27T15:06:11.000+0100},
  archiveprefix = {arXiv},
  author        = {Bubeck, S{\'e}bastien and Sellke, Mark},
  biburl        = {https://www.bibsonomy.org/bibtex/24634b2454b95b2991b462920dea4dfe1/kirk86},
  description   = {[1902.00681] First-Order Regret Analysis of Thompson Sampling},
  eprint        = {1902.00681},
  interhash     = {602470d8edcbeb2ab70253bb9446f6eb},
  intrahash     = {4634b2454b95b2991b462920dea4dfe1},
  keywords      = {bayesian bounds combinatorics online-learning optimization readings sampling},
  note          = {27 pages},
  primaryclass  = {cs.LG},
  timestamp     = {2019-11-27T15:07:51.000+0100},
  title         = {First-Order Regret Analysis of {Thompson} Sampling},
  url           = {http://arxiv.org/abs/1902.00681},
  year          = {2019},
}