Most evaluations of novel algorithmic contributions assess their accuracy in predicting items that were withheld in an offline evaluation scenario. However, doubts have been raised about whether standard offline evaluation practices are appropriate for selecting the best algorithm for field deployment. The goal of this work is therefore to compare the offline and the online evaluation methodology with the same study participants, i.e., a within-users experimental design. This paper presents empirical evidence that the ranking of algorithms based on offline accuracy measurements clearly contradicts the results of the online study with the same set of users. The external validity of the most commonly applied evaluation methodology is therefore not guaranteed.
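The abstract contrasts the ranking of algorithms obtained from offline accuracy measurements with the ranking obtained from an online study on the same users. As a rough, purely illustrative sketch of what such a comparison involves (the algorithm names, metrics, and numbers below are hypothetical placeholders, not results or code from the paper), the two rankings can be computed and checked against each other:

# Illustrative only: hypothetical per-algorithm scores from the two settings.
# Offline: accuracy at predicting interactions withheld from the training data
# (e.g., precision@10). Online: mean satisfaction reported by the same users
# for recommendations they actually received.
offline_accuracy = {"MatrixFactorization": 0.47, "ItemKNN": 0.42, "MostPopular": 0.31}
online_satisfaction = {"MostPopular": 3.9, "ItemKNN": 3.4, "MatrixFactorization": 3.1}

def ranking(scores):
    """Return algorithm names ordered from best to worst score."""
    return [name for name, _ in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)]

offline_rank = ranking(offline_accuracy)
online_rank = ranking(online_satisfaction)

print("Offline ranking:", offline_rank)   # e.g. ['MatrixFactorization', 'ItemKNN', 'MostPopular']
print("Online ranking: ", online_rank)    # e.g. ['MostPopular', 'ItemKNN', 'MatrixFactorization']
print("Rankings agree:", offline_rank == online_rank)

A disagreement between the two orderings, as in this made-up example, is the kind of contradiction between offline and online results that the paper reports.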
%0 Conference Paper
%A Rossetti, Marco
%A Stella, Fabio
%A Zanker, Markus
%B Proceedings of the 10th ACM Conference on Recommender Systems
%C New York, NY, USA
%D 2016
%I ACM
%K evaluation recommender recsys2016 user-study
%P 31--34
%R 10.1145/2959100.2959176
%T Contrasting Offline and Online Results when Evaluating Recommendation Algorithms
%U http://dx.doi.org/10.1145/2959100.2959176
%X Most evaluations of novel algorithmic contributions assess their accuracy in predicting what was withheld in an offline evaluation scenario. However, several doubts have been raised that standard offline evaluation practices are not appropriate to select the best algorithm for field deployment. The goal of this work is therefore to compare the offline and the online evaluation methodology with the same study participants, i.e. a within users experimental design. This paper presents empirical evidence that the ranking of algorithms based on offline accuracy measurements clearly contradicts the results from the online study with the same set of users. Thus the external validity of the most commonly applied evaluation methodology is not guaranteed.
%@ 978-1-4503-4035-9
@inproceedings{citeulike:14139129,
abstract = {{Most evaluations of novel algorithmic contributions assess their accuracy in predicting what was withheld in an offline evaluation scenario. However, several doubts have been raised that standard offline evaluation practices are not appropriate to select the best algorithm for field deployment. The goal of this work is therefore to compare the offline and the online evaluation methodology with the same study participants, i.e. a within users experimental design. This paper presents empirical evidence that the ranking of algorithms based on offline accuracy measurements clearly contradicts the results from the online study with the same set of users. Thus the external validity of the most commonly applied evaluation methodology is not guaranteed.}},
address = {New York, NY, USA},
author = {Rossetti, Marco and Stella, Fabio and Zanker, Markus},
booktitle = {Proceedings of the 10th ACM Conference on Recommender Systems},
doi = {10.1145/2959100.2959176},
isbn = {978-1-4503-4035-9},
keywords = {evaluation recommender recsys2016 user-study},
location = {Boston, Massachusetts, USA},
pages = {31--34},
publisher = {ACM},
series = {RecSys '16},
title = {{Contrasting Offline and Online Results when Evaluating Recommendation Algorithms}},
url = {http://dx.doi.org/10.1145/2959100.2959176},
year = 2016
}