This paper points out some mistakes that can be frequently found in IR publications: MRR and ERR violate basic requirements for a metric, MAP is based on unrealistic assumptions, the numbers shown overstate the precision of the result, relative improvements of arithmetic means are inappropriate, the simple holdout method yields unreliable results, hypotheses are often formulated after the experiment, significance tests frequently ignore the multiple comparisons problem, effect sizes are ignored, reproducibility of the experiments might be nearly impossible, and sometimes authors claim proof by experimentation.
Description
Some Common Mistakes In IR Evaluation, And How They Can Be Avoided | ACM SIGIR Forum
%0 Journal Article
%1 fuhr2018common
%A Fuhr, Norbert
%D 2018
%I Association for Computing Machinery (ACM)
%J SIGIR Forum
%K evaluation information ir plk retrieval
%N 3
%P 32--41
%R 10.1145/3190580.3190586
%T Some Common Mistakes In IR Evaluation, And How They Can Be Avoided
%U https://doi.org/10.1145/3190580.3190586
%V 51
%X This paper points out some mistakes that can be frequently found in IR publications: MRR and ERR violate basic requirements for a metric, MAP is based on unrealistic assumptions, the numbers shown overstate the precision of the result, relative improvements of arithmetic means are inappropriate, the simple holdout method yields unreliable results, hypotheses are often formulated after the experiment, significance tests frequently ignore the multiple comparisons problem, effect sizes are ignored, reproducibility of the experiments might be nearly impossible, and sometimes authors claim proof by experimentation.
@comment{
  BibTeX record for the SIGIR Forum article by Norbert Fuhr
  (DOI 10.1145/3190580.3190586), as exported from BibSonomy.
  The added-at, biburl, interhash, intrahash and timestamp fields are
  BibSonomy bookkeeping data; standard styles ignore unknown fields.
  The url field holds the DOI resolver link with a literal slash, so no
  percent sign appears in the field value.
}
@article{fuhr2018common,
  author      = {Fuhr, Norbert},
  title       = {Some Common Mistakes In {IR} Evaluation, And How They Can Be Avoided},
  journal     = {{SIGIR} Forum},
  volume      = {51},
  number      = {3},
  pages       = {32--41},
  month       = feb,
  year        = {2018},
  publisher   = {Association for Computing Machinery (ACM)},
  doi         = {10.1145/3190580.3190586},
  url         = {https://doi.org/10.1145/3190580.3190586},
  abstract    = {This paper points out some mistakes that can be frequently found in IR publications: MRR and ERR violate basic requirements for a metric, MAP is based on unrealistic assumptions, the numbers shown overstate the precision of the result, relative improvements of arithmetic means are inappropriate, the simple holdout method yields unreliable results, hypotheses are often formulated after the experiment, significance tests frequently ignore the multiple comparisons problem, effect sizes are ignored, reproducibility of the experiments might be nearly impossible, and sometimes authors claim proof by experimentation.},
  keywords    = {evaluation information ir plk retrieval},
  description = {Some Common Mistakes In IR Evaluation, And How They Can Be Avoided | ACM SIGIR Forum},
  added-at    = {2022-08-24T13:13:51.000+0200},
  timestamp   = {2022-08-24T13:36:53.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/2036a1df370e3e79571f611c990b50ec2/jaeschke},
  interhash   = {9410e56b0207a7ada2565fb492c01215},
  intrahash   = {036a1df370e3e79571f611c990b50ec2},
}