Most text analysis is designed to deal with the concept of a "document", namely a cohesive presentation of thought on a unifying subject. By contrast, individual nodes on the World Wide Web tend to have a much smaller granularity than text documents. We claim that the notions of "document" and "web node" are not synonymous, and that authors often tend to deploy documents as collections of URLs, which we call "compound documents". In this paper we present new techniques for identifying and working with such compound documents, and the results of some large-scale studies on such web documents. The primary motivation for this work stems from the fact that information retrieval techniques are better suited to working on documents than individual hypertext nodes.
%0 Conference Paper
%1 900070
%A Eiron, Nadav
%A McCurley, Kevin S.
%B HYPERTEXT '03: Proceedings of the fourteenth ACM conference on Hypertext and hypermedia
%C New York, NY, USA
%D 2003
%I ACM
%K diplomarbeit documents
%P 85--94
%R 10.1145/900051.900070
%T Untangling compound documents on the web
%U http://portal.acm.org/citation.cfm?id=900051.900070
%X Most text analysis is designed to deal with the concept of a "document", namely a cohesive presentation of thought on a unifying subject. By contrast, individual nodes on the World Wide Web tend to have a much smaller granularity than text documents. We claim that the notions of "document" and "web node" are not synonymous, and that authors often tend to deploy documents as collections of URLs, which we call "compound documents". In this paper we present new techniques for identifying and working with such compound documents, and the results of some large-scale studies on such web documents. The primary motivation for this work stems from the fact that information retrieval techniques are better suited to working on documents than individual hypertext nodes.
%@ 1-58113-704-4
@comment{
  Conference paper by Eiron and McCurley in the HYPERTEXT '03 proceedings (ACM).
  - doi holds the bare DOI with no resolver prefix; styles add the resolver.
  - address is the publisher's city. The second place name from the export
    (Nottingham, UK) is kept in venue because biblatex treats address as an
    alias of location; presumably it is the conference site -- TODO confirm.
  - added-at, biburl, interhash, intrahash and timestamp are export metadata
    (biburl points at bibsonomy.org); bibliography styles are expected to
    ignore them.
}
@inproceedings{900070,
  author      = {Eiron, Nadav and McCurley, Kevin S.},
  title       = {Untangling compound documents on the web},
  booktitle   = {{HYPERTEXT} '03: Proceedings of the fourteenth {ACM} conference on Hypertext and hypermedia},
  pages       = {85--94},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  venue       = {Nottingham, UK},
  year        = {2003},
  isbn        = {1-58113-704-4},
  doi         = {10.1145/900051.900070},
  url         = {http://portal.acm.org/citation.cfm?id=900051.900070},
  keywords    = {diplomarbeit documents},
  abstract    = {Most text analysis is designed to deal with the concept of a ``document'', namely a cohesive presentation of thought on a unifying subject. By contrast, individual nodes on the World Wide Web tend to have a much smaller granularity than text documents. We claim that the notions of ``document'' and ``web node'' are not synonymous, and that authors often tend to deploy documents as collections of URLs, which we call ``compound documents''. In this paper we present new techniques for identifying and working with such compound documents, and the results of some large-scale studies on such web documents. The primary motivation for this work stems from the fact that information retrieval techniques are better suited to working on documents than individual hypertext nodes.},
  description = {Untangling compound documents on the web},
  added-at    = {2009-03-09T21:35:00.000+0100},
  timestamp   = {2010-12-09T12:52:02.000+0100},
  biburl      = {https://www.bibsonomy.org/bibtex/28c9079dffbce6b91e417db8740a193c1/dominikb1888},
  interhash   = {d53d87b3f28ce27a0d41465134dc8a3b},
  intrahash   = {8c9079dffbce6b91e417db8740a193c1},
}