We consider two algorithms for on-line prediction based on a linear model. The algorithms are the well-known gradient descent (GD) algorithm and a new algorithm, which we call EG±. They both maintain a weight vector using simple updates. For the GD algorithm, the update is based on subtracting the gradient of the squared error made on a prediction. The EG± algorithm uses the components of the gradient in the exponents of factors that are used in updating the weight vector multiplicatively. We present worst-case loss bounds for EG± and compare them to previously known bounds for the GD algorithm. The bounds suggest that the losses of the algorithms are in general incomparable, but EG± has a much smaller loss if only a few components of the input are relevant for the predictions. We have performed experiments which show that our worst-case upper bounds are quite tight already on simple artificial data.
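To make the two updates concrete, the following is a minimal sketch in Python (not the authors' code) of one GD step and one EG± step for squared loss. It assumes the common formulation of EG± with paired positive and negative weight vectors renormalized to a fixed total weight U; the function names, the learning rate eta, and the default values are illustrative assumptions, not taken from the paper.

import numpy as np

def gd_update(w, x, y, eta=0.1):
    """Gradient descent: subtract the gradient of the squared prediction error."""
    y_hat = w @ x
    return w - eta * (y_hat - y) * x     # gradient of (y_hat - y)^2, up to a constant factor

def eg_pm_update(w_pos, w_neg, x, y, eta=0.1, U=1.0):
    """EG± (sketch): scale the paired positive/negative weights by exponentials
    of the gradient components, then renormalize so the total weight stays U."""
    y_hat = (w_pos - w_neg) @ x
    grad = (y_hat - y) * x               # gradient components that appear in the exponents
    r = np.exp(-eta * grad)
    w_pos, w_neg = w_pos * r, w_neg / r  # multiplicative updates in opposite directions
    Z = (w_pos.sum() + w_neg.sum()) / U  # renormalize to total weight U
    return w_pos / Z, w_neg / Z

Intuitively, the multiplicative update shrinks the weights of irrelevant input components very quickly while the total weight stays bounded, which matches the regime in which, according to the abstract, EG± has a much smaller loss than GD.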
@article{KIVINEN19971,
abstract = {We consider two algorithms for on-line prediction based on a linear model. The algorithms are the well-known gradient descent (GD) algorithm and a new algorithm, which we call EG±. They both maintain a weight vector using simple updates. For the GD algorithm, the update is based on subtracting the gradient of the squared error made on a prediction. The EG± algorithm uses the components of the gradient in the exponents of factors that are used in updating the weight vector multiplicatively. We present worst-case loss bounds for EG± and compare them to previously known bounds for the GD algorithm. The bounds suggest that the losses of the algorithms are in general incomparable, but EG± has a much smaller loss if only a few components of the input are relevant for the predictions. We have performed experiments which show that our worst-case upper bounds are quite tight already on simple artificial data.},
author = {Kivinen, Jyrki and Warmuth, Manfred K.},
biburl = {https://www.bibsonomy.org/bibtex/2e907bc99eb1a9b2e3dc7c2ca3c076c57/kirk86},
description = {Exponentiated Gradient versus Gradient Descent for Linear Predictors - ScienceDirect},
doi = {10.1006/inco.1996.2612},
issn = {0890-5401},
journal = {Information and Computation},
keywords = {machine-learning optimization theory},
number = 1,
pages = {1--63},
title = {Exponentiated Gradient versus Gradient Descent for Linear Predictors},
url = {http://www.sciencedirect.com/science/article/pii/S0890540196926127},
volume = 132,
year = 1997
}