We give a comprehensive report on our experiments with retrieval from
OCR-generated text using systems based on standard models of retrieval.
More specifically, we show that average precision and recall is not
affected by OCR errors across systems for several collections. The
collections used in these experiments include both actual OCR-generated
text and standard information retrieval collections corrupted through
the simulation of OCR errors. Both the actual and simulation experiments
include full-text and abstract-length documents. We also demonstrate
that the ranking and feedback methods associated with these models
are generally not robust enough to deal with OCR errors. It is further
shown that the OCR errors and garbage strings generated from the
mistranslation of graphic objects increase the size of the index
by a wide margin. We not only point out problems that can arise from
applying OCR text within an information retrieval environment, we
also suggest solutions to overcome some of these problems.
%0 Journal Article
%1 Taghva1996
%A Taghva, Kazem
%A Borsack, Julie
%A Condit, Allen
%D 1996
%J ACM Transactions on Information Systems
%K ?, Analysis Capture, Computing Content Digitization Error Experimentation, Feedback, H.3.1 H.3.3 I.4.1 Image Indexing Indexing, Information Methodologies, Optical Performance, Ranking Retrieval Retrieval, Scanning, Search Systems, algorithms, and character computer correction, methods, models, ocr process, processing recognition, retrieval, storage vision,
%N 1
%P 64--93
%T Evaluation of Model-Based Retrieval Effectiveness with OCR Text
%U http://www.acm.org/pubs/articles/journals/tois/1996-14-1/p64-taghva/p64-taghva.pdf
%V 14
%X We give a comprehensive report on our experiments with retrieval from
OCR-generated text using systems based on standard models of retrieval.
More specifically, we show that average precision and recall is not
affected by OCR errors across systems for several collections. The
collections used in these experiments include both actual OCR-generated
text and standard information retrieval collections corrupted through
the simulation of OCR errors. Both the actual and simulation experiments
include full-text and abstract-length documents. We also demonstrate
that the ranking and feedback methods associated with these models
are generally not robust enough to deal with OCR errors. It is further
shown that the OCR errors and garbage strings generated from the
mistranslation of graphic objects increase the size of the index
by a wide margin. We not only point out problems that can arise from
applying OCR text within an information retrieval environment, we
also suggest solutions to overcome some of these problems.
@comment{Taghva1996: journaltitle is spelled out in full; the exported value
was an unresolved string-macro reference (acmti) that printed literally.}
@article{Taghva1996,
  abstract     = {We give a comprehensive report on our experiments with retrieval from
OCR-generated text using systems based on standard models of retrieval.
More specifically, we show that average precision and recall is not
affected by OCR errors across systems for several collections. The
collections used in these experiments include both actual OCR-generated
text and standard information retrieval collections corrupted through
the simulation of OCR errors. Both the actual and simulation experiments
include full-text and abstract-length documents. We also demonstrate
that the ranking and feedback methods associated with these models
are generally not robust enough to deal with OCR errors. It is further
shown that the OCR errors and garbage strings generated from the
mistranslation of graphic objects increase the size of the index
by a wide margin. We not only point out problems that can arise from
applying OCR text within an information retrieval environment, we
also suggest solutions to overcome some of these problems.},
  added-at     = {2011-03-27T19:35:34.000+0200},
  author       = {Taghva, Kazem and Borsack, Julie and Condit, Allen},
  biburl       = {https://www.bibsonomy.org/bibtex/2acdf8783969135a970a3730c13484d92/cocus},
  copyright    = {(c) Copyright 1996 Association for Computing Machinery},
  file         = {:./p64-taghva.pdf:PDF},
  interhash    = {f549ceefd90b187acdabad80be77842f},
  intrahash    = {acdf8783969135a970a3730c13484d92},
  journaltitle = {{ACM} Transactions on Information Systems},
  keywords     = {?, Analysis Capture, Computing Content Digitization Error Experimentation, Feedback, H.3.1 H.3.3 I.4.1 Image Indexing Indexing, Information Methodologies, Optical Performance, Ranking Retrieval Retrieval, Scanning, Search Systems, algorithms, and character computer correction, methods, models, ocr process, processing recognition, retrieval, storage vision,},
  mrnumber     = {J.TOIS.14.1.64},
  number       = {1},
  owner        = {CK},
  pages        = {64--93},
  timestamp    = {2011-03-27T19:35:43.000+0200},
  title        = {Evaluation of Model-Based Retrieval Effectiveness with {OCR} Text},
  url          = {http://www.acm.org/pubs/articles/journals/tois/1996-14-1/p64-taghva/p64-taghva.pdf},
  volume       = {14},
  year         = {1996},
}