With the explosive growth of the World Wide Web, millions of documents are published and accessed on-line. Statistics show that a significant part of Web text information is encoded in Web images. Since Web images have special characteristics that sometimes distinguish them from other types of images, commercial OCR products often fail to recognize Web images due to their special characteristics. This paper proposes a novel Web image processing algorithm that aims to locate text areas and prepare them for OCR procedure with better results. Our methodology for text area identification has been fully integrated with an OCR engine and with an Information Extraction system. We present quantitative results for the performance of the OCR engine as well as qualitative results concerning its effects to the Information Extraction system. Experimental results obtained from a large corpus of Web images, demonstrate the efficiency of our methodology.
%0 Conference Paper
%1 DBLP:conf/setn/PerantonisGMKP04
%A Perantonis, Stavros J.
%A Gatos, Basilios
%A Maragos, Vassilios
%A Karkaletsis, Vangelis
%A Petasis, Georgios
%B Methods and Applications of Artificial Intelligence, Proceedings of the 3rd Hellenic Conference on Artificial Intelligence (SETN 2004)
%C Samos, Greece
%D 2004
%E Vouros, George A.
%E Panayiotopoulos, Themis
%I Springer Berlin / Heidelberg
%K imported
%P 82--92
%T Text Area Identification in Web Images
%U http://www.ellogon.org/petasis/bibliography/SETN2004/SETN2004.pdf
%V 3025
%X With the explosive growth of the World Wide Web, millions of documents are published and accessed on-line. Statistics show that a significant part of Web text information is encoded in Web images. Since Web images have special characteristics that sometimes distinguish them from other types of images, commercial OCR products often fail to recognize Web images due to their special characteristics. This paper proposes a novel Web image processing algorithm that aims to locate text areas and prepare them for OCR procedure with better results. Our methodology for text area identification has been fully integrated with an OCR engine and with an Information Extraction system. We present quantitative results for the performance of the OCR engine as well as qualitative results concerning its effects to the Information Extraction system. Experimental results obtained from a large corpus of Web images, demonstrate the efficiency of our methodology.
%@ 3-540-21937-4
@comment{Paper in the SETN 2004 proceedings (Lecture Notes in Computer Science, vol. 3025); record imported from BibSonomy.}
@inproceedings{DBLP:conf/setn/PerantonisGMKP04,
  author    = {Perantonis, Stavros J. and Gatos, Basilios and Maragos, Vassilios and Karkaletsis, Vangelis and Petasis, Georgios},
  title     = {Text Area Identification in {Web} Images},
  booktitle = {Methods and Applications of Artificial Intelligence, Proceedings of the 3rd {Hellenic} Conference on Artificial Intelligence ({SETN} 2004)},
  editor    = {Vouros, George A. and Panayiotopoulos, Themis},
  series    = {Lecture Notes in Computer Science},
  volume    = {3025},
  pages     = {82--92},
  publisher = {Springer Berlin / Heidelberg},
  address   = {Samos, Greece},
  month     = may,
  year      = {2004},
  isbn      = {3-540-21937-4},
  url       = {http://www.ellogon.org/petasis/bibliography/SETN2004/SETN2004.pdf},
  abstract  = {With the explosive growth of the World Wide Web, millions of documents are published and accessed on-line. Statistics show that a significant part of Web text information is encoded in Web images. Since Web images have special characteristics that sometimes distinguish them from other types of images, commercial OCR products often fail to recognize Web images due to their special characteristics. This paper proposes a novel Web image processing algorithm that aims to locate text areas and prepare them for OCR procedure with better results. Our methodology for text area identification has been fully integrated with an OCR engine and with an Information Extraction system. We present quantitative results for the performance of the OCR engine as well as qualitative results concerning its effects to the Information Extraction system. Experimental results obtained from a large corpus of Web images, demonstrate the efficiency of our methodology.},
  keywords  = {imported},
  added-at  = {2011-08-10T12:37:26.000+0200},
  timestamp = {2011-08-10T12:37:27.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/2bed1a3e47e65afaef870cb508ea681c7/petasis},
  interhash = {0b8169429bfbc5126fdf91fb8afc323c},
  intrahash = {bed1a3e47e65afaef870cb508ea681c7}
}