The effectiveness of twenty public search engines is evaluated using TREC-inspired methods and a set of 54 queries taken from real Web search logs. The World Wide Web is taken as the test collection and a combination of crawler and text retrieval system is evaluated. The engines are compared on a range of measures derivable from binary relevance judgments of the first seven live results returned. Statistical testing reveals a significant difference between engines and high intercorrelations between measures. Surprisingly, given the dynamic nature of the Web and the time elapsed, there is also a high correlation between results of this study and a previous study by Gordon and Pathak. For nearly all engines, there is a gradual decline in precision at increasing cutoff after some initial fluctuation. Performance of the engines as a group is found to be inferior to the group of participants in the TREC-8 Large Web task, although the best engines approach the median of those systems. Shortcomings of current Web search evaluation methodology are identified and recommendations are made for future improvements. In particular, the present study and its predecessors deal with queries which are assumed to derive from a need to find a selection of documents relevant to a topic. By contrast, real Web search reflects a range of other information need types which require different judging and different measures.
%0 Journal Article
%1 Hawking-Measuring-2001
%A Hawking, David
%A Craswell, Nick
%A Bailey, Peter
%A Griffiths, Kathleen
%D 2001
%K Measuring Web_search World_Wide_Web quality retrieval search_engine wismasys0809
%T Measuring search engine quality
%U http://www.ils.unc.edu/~losee/par/paril.pdf
%X The effectiveness of twenty public search engines is evaluated using TREC-inspired methods and a set of 54 queries taken from real Web search logs. The World Wide Web is taken as the test collection and a combination of crawler and text retrieval system is evaluated. The engines are compared on a range of measures derivable from binary relevance judgments of the first seven live results returned. Statistical testing reveals a significant difference between engines and high intercorrelations between measures. Surprisingly, given the dynamic nature of the Web and the time elapsed, there is also a high correlation between results of this study and a previous study by Gordon and Pathak. For nearly all engines, there is a gradual decline in precision at increasing cutoff after some initial fluctuation. Performance of the engines as a group is found to be inferior to the group of participants in the TREC-8 Large Web task, although the best engines approach the median of those systems. Shortcomings of current Web search evaluation methodology are identified and recommendations are made for future improvements. In particular, the present study and its predecessors deal with queries which are assumed to derive from a need to find a selection of documents relevant to a topic. By contrast, real Web search reflects a range of other information need types which require different judging and different measures.
@article{Hawking-Measuring-2001,
  abstract      = {The effectiveness of twenty public search engines is evaluated using TREC-inspired methods and a set of 54 queries taken from real Web search logs. The World Wide Web is taken as the test collection and a combination of crawler and text retrieval system is evaluated. The engines are compared on a range of measures derivable from binary relevance judgments of the first seven live results returned. Statistical testing reveals a significant difference between engines and high intercorrelations between measures. Surprisingly, given the dynamic nature of the Web and the time elapsed, there is also a high correlation between results of this study and a previous study by Gordon and Pathak. For nearly all engines, there is a gradual decline in precision at increasing cutoff after some initial fluctuation. Performance of the engines as a group is found to be inferior to the group of participants in the TREC-8 Large Web task, although the best engines approach the median of those systems. Shortcomings of current Web search evaluation methodology are identified and recommendations are made for future improvements. In particular, the present study and its predecessors deal with queries which are assumed to derive from a need to find a selection of documents relevant to a topic. By contrast, real Web search reflects a range of other information need types which require different judging and different measures.},
  added-at      = {2008-11-11T14:49:02.000+0100},
  author        = {Hawking, David and Craswell, Nick and Bailey, Peter and Griffiths, Kathleen},
  biburl        = {https://www.bibsonomy.org/bibtex/29e8ae72c81b1b98122f7703085368ea0/juver},
  interhash     = {4db9afffea81ae7e478843231a1b7762},
  intrahash     = {9e8ae72c81b1b98122f7703085368ea0},
  journal       = {Information Retrieval},
  keywords      = {Measuring Web_search World_Wide_Web quality retrieval search_engine wismasys0809},
  volume        = {4},
  number        = {1},
  pages         = {33--59},
  timestamp     = {2008-11-11T14:49:02.000+0100},
  title         = {Measuring search engine quality},
  url           = {http://www.ils.unc.edu/~losee/par/paril.pdf},
  internal-note = {NOTE(review): url path (~losee/par/paril.pdf) looks like a different paper; verify link or replace with the publisher/DOI landing page},
  year          = {2001},
}