The large size and the dynamic nature of the Web make it necessary to continually maintain Web based information retrieval systems. Crawlers facilitate this process by following hyperlinks in Web pages to automatically download new and updated Web pages. While some systems rely on crawlers that exhaustively crawl the Web, others incorporate ``focus'' within their crawlers to harvest application- or topic-specific collections. In this chapter we discuss the basic issues related to developing an infrastructure for crawlers. This is followed by a review of several topical crawling algorithms, and evaluation metrics that may be used to judge their performance. Given that many innovative applications of Web crawling are still being invented, we briefly discuss some that have already been developed.
%0 Book Section
%1 Pant2004
%A Pant, Gautam
%A Srinivasan, Padmini
%A Menczer, Filippo
%B Web Dynamics: Adapting to Change in Content, Size, Topology and Use
%C Berlin, Heidelberg
%D 2004
%I Springer Berlin Heidelberg
%K book crawl scale web
%P 153--177
%R 10.1007/978-3-662-10874-1_7
%T Crawling the Web
%U https://doi.org/10.1007/978-3-662-10874-1_7
%X The large size and the dynamic nature of the Web make it necessary to continually maintain Web based information retrieval systems. Crawlers facilitate this process by following hyperlinks in Web pages to automatically download new and updated Web pages. While some systems rely on crawlers that exhaustively crawl the Web, others incorporate ``focus'' within their crawlers to harvest application- or topic-specific collections. In this chapter we discuss the basic issues related to developing an infrastructure for crawlers. This is followed by a review of several topical crawling algorithms, and evaluation metrics that may be used to judge their performance. Given that many innovative applications of Web crawling are still being invented, we briefly discuss some that have already been developed.
%@ 978-3-662-10874-1
% Book chapter with its own title inside a separately titled volume, hence incollection (classic inbook ignores booktitle).
@incollection{Pant2004,
  author    = {Pant, Gautam and Srinivasan, Padmini and Menczer, Filippo},
  title     = {Crawling the {Web}},
  booktitle = {{Web} Dynamics: Adapting to Change in Content, Size, Topology and Use},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2004},
  pages     = {153--177},
  isbn      = {978-3-662-10874-1},
  doi       = {10.1007/978-3-662-10874-1_7},
  url       = {https://doi.org/10.1007/978-3-662-10874-1_7},
  abstract  = {The large size and the dynamic nature of the Web make it necessary to continually maintain Web based information retrieval systems. Crawlers facilitate this process by following hyperlinks in Web pages to automatically download new and updated Web pages. While some systems rely on crawlers that exhaustively crawl the Web, others incorporate ``focus'' within their crawlers to harvest application- or topic-specific collections. In this chapter we discuss the basic issues related to developing an infrastructure for crawlers. This is followed by a review of several topical crawling algorithms, and evaluation metrics that may be used to judge their performance. Given that many innovative applications of Web crawling are still being invented, we briefly discuss some that have already been developed.},
  keywords  = {book crawl scale web},
  added-at  = {2020-03-10T19:41:14.000+0100},
  timestamp = {2020-03-10T19:41:14.000+0100},
  biburl    = {https://www.bibsonomy.org/bibtex/2d5fed07046429bdc1a8099fc9998dd7e/parismic},
  interhash = {a82b0e8ea9989b1c549f6b77be10b7ef},
  intrahash = {d5fed07046429bdc1a8099fc9998dd7e},
}