A stochastic combinatorial semi-bandit is an online learning problem where at each step a learning agent chooses a subset of ground items subject to constraints, and then observes stochastic weights of these items and receives their sum as a payoff. In this paper, we close the problem of computationally and sample efficient learning in stochastic combinatorial semi-bandits. In particular, we analyze a UCB-like algorithm for solving the problem, which is known to be computationally efficient; and prove O(K L (1 / Delta) log n) and O((K L n log n)^{1/2}) upper bounds on its n-step regret, where L is the number of ground items, K is the maximum number of chosen items, and Delta is the gap between the expected returns of the optimal and best suboptimal solutions. The gap-dependent bound is tight up to a constant factor and the gap-free bound is tight up to a polylogarithmic factor.
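For intuition, the following is a minimal Python sketch of a UCB-like learner for this setting, in the spirit of the algorithm analyzed in the paper rather than its exact pseudocode. The names oracle and pull are hypothetical placeholders: oracle stands for the problem-specific optimization oracle that maximizes a linear objective over the feasible subsets, and pull stands for the environment returning semi-bandit feedback; the confidence radius sqrt(1.5 log t / T(e)) is one standard choice and is assumed here.

import math

def comb_ucb(oracle, pull, L, n):
    # oracle(ucb): hypothetical argmax oracle -- maps a length-L list of
    #   per-item UCBs to a feasible subset (iterable of item indices)
    #   maximizing their sum.
    # pull(S): hypothetical environment -- returns semi-bandit feedback,
    #   a dict {item: observed weight in [0, 1]} for the chosen subset S.
    counts = [0] * L    # T(e): number of times item e has been observed
    means = [0.0] * L   # empirical mean weight of item e
    for t in range(1, n + 1):
        # Unobserved items get an infinite UCB, which forces initialization.
        ucb = [means[e] + math.sqrt(1.5 * math.log(t) / counts[e])
               if counts[e] > 0 else float("inf") for e in range(L)]
        S = oracle(ucb)               # feasible subset with largest UCB sum
        for e, w in pull(S).items():  # observe each chosen item's weight
            counts[e] += 1
            means[e] += (w - means[e]) / counts[e]  # incremental mean update
    return means

Under a top-K (uniform matroid) constraint, for example, the oracle reduces to picking the K items with the largest UCBs, e.g. lambda u: sorted(range(L), key=lambda e: -u[e])[:K]; the method is computationally efficient whenever such an oracle is available.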
%0 Conference Paper
%1 KWASz15
%A Kveton, B.
%A Wen, Z.
%A Ashkan, A.
%A Szepesvári, Cs.
%B AISTATS
%D 2015
%K bandits, combinatorial, learning, linear, online, semi-bandits, stochastic, theory
%P 535--543
%T Tight Regret Bounds for Stochastic Combinatorial Semi-Bandits
%X A stochastic combinatorial semi-bandit is an online learning problem where at each step a learning agent chooses a subset of ground items subject to constraints, and then observes stochastic weights of these items and receives their sum as a payoff. In this paper, we close the problem of computationally and sample efficient learning in stochastic combinatorial semi-bandits. In particular, we analyze a UCB-like algorithm for solving the problem, which is known to be computationally efficient; and prove O(K L (1 / Delta) log n) and O((K L n log n)^{1/2}) upper bounds on its n-step regret, where L is the number of ground items, K is the maximum number of chosen items, and Delta is the gap between the expected returns of the optimal and best suboptimal solutions. The gap-dependent bound is tight up to a constant factor and the gap-free bound is tight up to a polylogarithmic factor.
@inproceedings{KWASz15,
abstract = {A stochastic combinatorial semi-bandit is an online learning problem where at each step a learning agent chooses a subset of ground items subject to constraints, and then observes stochastic weights of these items and receives their sum as a payoff. In this paper, we close the problem of computationally and sample efficient learning in stochastic combinatorial semi-bandits. In particular, we analyze a UCB-like algorithm for solving the problem, which is known to be computationally efficient; and prove O(K L (1 / Delta) log n) and O((K L n log n)^{1/2}) upper bounds on its n-step regret, where L is the number of ground items, K is the maximum number of chosen items, and Delta is the gap between the expected returns of the optimal and best suboptimal solutions. The gap-dependent bound is tight up to a constant factor and the gap-free bound is tight up to a polylogarithmic factor.},
added-at = {2020-03-17T03:03:01.000+0100},
author = {Kveton, B. and Wen, Z. and Ashkan, A. and Szepesv{\'a}ri, {Cs}.},
biburl = {https://www.bibsonomy.org/bibtex/2c33775d3e4f0bc65a864c0b8719f4346/csaba},
booktitle = {AISTATS},
date-added = {2015-01-27 06:39:07 +0000},
date-modified = {2015-08-02 01:02:44 +0000},
interhash = {a718f2b5cf61c34dce26aa3b0cebd206},
intrahash = {c33775d3e4f0bc65a864c0b8719f4346},
keywords = {bandits, combinatorial, learning, linear, online, semi-bandits, stochastic, theory},
pages = {535--543},
pdf = {papers/AISTAT15-CombBand.pdf},
timestamp = {2020-03-17T03:03:01.000+0100},
title = {Tight Regret Bounds for Stochastic Combinatorial Semi-Bandits},
year = 2015
}