The information explosion of the Web aggravates the problem of effective information retrieval. Even though linguistic approaches found in the literature perform linguistic annotation by creating metadata in the form of tokens, lemmas or part-of-speech tags, this process is insufficient. This is due to the fact that these linguistic metadata do not exploit the actual content of the page, leading to the need to perform semantic annotation based on a predefined semantic model. This paper proposes a new learning approach for performing automatic semantic annotation. This is the result of a two-step procedure: the first step partitions a web page into blocks based on its visual layout, while the second performs subsequent partitioning based on the examination of the appearance of specific types of entities denoting the semantic category, as well as the application of a number of simple heuristics. Preliminary experiments performed on a manually annotated corpus regarding athletics proved to be very promising.
%0 Generic
%1 citeulike:5663452
%A Petasis, Georgios
%A Fragkou, Pavlina
%A Theodorakos, Aris
%A Karkaletsis, Vangelis
%A Spyropoulos, Constantine D.
%B Proceedings of the 4th Web as Corpus Workshop (WAC-4), 6th Language Resources and Evaluation Conference (LREC 2008)
%C Marrakech, Morocco
%D 2008
%J 4th Web as Corpus Workshop (WAC-4)
%K imported
%P 18--24
%R 10.1109/SPCA.2006.297506
%T Segmenting HTML pages using visual and semantic information
%U http://www.ellogon.org/petasis/bibliography/LREC2008/LREC-2008-SemanticSegmentation-Submitted.pdf
%X The information explosion of the Web aggravates the problem of effective information retrieval. Even though linguistic approaches found in the literature perform linguistic annotation by creating metadata in the form of tokens, lemmas or part-of-speech tags, this process is insufficient. This is due to the fact that these linguistic metadata do not exploit the actual content of the page, leading to the need to perform semantic annotation based on a predefined semantic model. This paper proposes a new learning approach for performing automatic semantic annotation. This is the result of a two-step procedure: the first step partitions a web page into blocks based on its visual layout, while the second performs subsequent partitioning based on the examination of the appearance of specific types of entities denoting the semantic category, as well as the application of a number of simple heuristics. Preliminary experiments performed on a manually annotated corpus regarding athletics proved to be very promising.
% Workshop paper from WAC-4 (held with LREC 2008); citation key kept as exported.
% Entry type is inproceedings (the "conference" type is only an alias for it).
% The workshop name previously stored in `journal` is already part of `booktitle`.
% NOTE(review): the exported DOI 10.1109/SPCA.2006.297506 has an IEEE prefix and a
% 2006 identifier, which does not match a 2008 LREC workshop paper. It is parked
% in the ignored field `x-unverified-doi` so styles do not print a wrong link;
% restore it to `doi` only after confirming that it resolves to this paper.
% The underscore in the `note` URL is escaped so the field typesets as plain text.
@inproceedings{citeulike:5663452,
  author           = {Petasis, Georgios and Fragkou, Pavlina and Theodorakos, Aris and Karkaletsis, Vangelis and Spyropoulos, Constantine D.},
  title            = {Segmenting {HTML} Pages Using Visual and Semantic Information},
  booktitle        = {Proceedings of the 4th Web as Corpus Workshop ({WAC-4}), 6th Language Resources and Evaluation Conference ({LREC} 2008)},
  address          = {Marrakech, Morocco},
  month            = jun,
  day              = {1},
  year             = {2008},
  pages            = {18--24},
  url              = {http://www.ellogon.org/petasis/bibliography/LREC2008/LREC-2008-SemanticSegmentation-Submitted.pdf},
  note             = {Proceedings: The 4th Web as Corpus: Can we do better than Google? http://www.lrec-conf.org/proceedings/lrec2008/workshops/W19\_Proceedings.pdf},
  abstract         = {The information explosion of the Web aggravates the problem of effective information retrieval. Even though linguistic approaches found in the literature perform linguistic annotation by creating metadata in the form of tokens, lemmas or part-of-speech tags, this process is insufficient. This is due to the fact that these linguistic metadata do not exploit the actual content of the page, leading to the need to perform semantic annotation based on a predefined semantic model. This paper proposes a new learning approach for performing automatic semantic annotation. This is the result of a two-step procedure: the first step partitions a web page into blocks based on its visual layout, while the second performs subsequent partitioning based on the examination of the appearance of specific types of entities denoting the semantic category, as well as the application of a number of simple heuristics. Preliminary experiments performed on a manually annotated corpus regarding athletics proved to be very promising.},
  keywords         = {imported},
  x-unverified-doi = {10.1109/SPCA.2006.297506},
  added-at         = {2011-08-10T12:37:26.000+0200},
  timestamp        = {2011-08-10T12:37:27.000+0200},
  biburl           = {https://www.bibsonomy.org/bibtex/2b4866e051a1fe7110c87a0388fe701f1/petasis},
  interhash        = {7be13470b979bb4097143e8b2b2c7aac},
  intrahash        = {b4866e051a1fe7110c87a0388fe701f1},
}