In this paper, we continue our investigations of "web spam": the injection of artificially-created pages into the web in order to influence the results from search engines, to drive traffic to certain pages for fun or profit. This paper considers some previously-undescribed techniques for automatically detecting spam pages, examines the effectiveness of these techniques in isolation and when aggregated using classification algorithms. When combined, our heuristics correctly identify 2,037 (86.2%) of the 2,364 spam pages (13.8%) in our judged collection of 17,168 pages, while misidentifying 526 spam and non-spam pages (3.1%).
%0 Conference Paper
%1 ntoulas2006spam
%A Ntoulas, Alexandros
%A Najork, Marc
%A Manasse, Mark
%A Fetterly, Dennis
%B WWW '06: Proceedings of the 15th international conference on World Wide Web
%C New York, NY, USA
%D 2006
%I ACM
%K features spam web
%P 83--92
%R 10.1145/1135777.1135794
%T Detecting spam web pages through content analysis
%U http://portal.acm.org/citation.cfm?id=1135794
%X In this paper, we continue our investigations of "web spam": the injection of artificially-created pages into the web in order to influence the results from search engines, to drive traffic to certain pages for fun or profit. This paper considers some previously-undescribed techniques for automatically detecting spam pages, examines the effectiveness of these techniques in isolation and when aggregated using classification algorithms. When combined, our heuristics correctly identify 2,037 (86.2%) of the 2,364 spam pages (13.8%) in our judged collection of 17,168 pages, while misidentifying 526 spam and non-spam pages (3.1%).
%@ 1-59593-323-9
@comment{ntoulas2006spam: WWW 2006 conference paper. The doi field holds the bare identifier (no resolver prefix), percent signs in the abstract are escaped for LaTeX, address is the publisher's city, and venue is the conference city.}
@inproceedings{ntoulas2006spam,
  abstract = {In this paper, we continue our investigations of "web spam": the injection of artificially-created pages into the web in order to influence the results from search engines, to drive traffic to certain pages for fun or profit. This paper considers some previously-undescribed techniques for automatically detecting spam pages, examines the effectiveness of these techniques in isolation and when aggregated using classification algorithms. When combined, our heuristics correctly identify 2,037 (86.2\%) of the 2,364 spam pages (13.8\%) in our judged collection of 17,168 pages, while misidentifying 526 spam and non-spam pages (3.1\%).},
  added-at = {2008-04-07T10:41:46.000+0200},
  address = {New York, NY, USA},
  author = {Ntoulas, Alexandros and Najork, Marc and Manasse, Mark and Fetterly, Dennis},
  biburl = {https://www.bibsonomy.org/bibtex/2c93f4228fd8552bede071569cdaa1ad9/beate},
  booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  description = {Detecting spam web pages through content analysis},
  doi = {10.1145/1135777.1135794},
  interhash = {9f759273a5a5188d1053aa6f8fa7ff07},
  intrahash = {c93f4228fd8552bede071569cdaa1ad9},
  isbn = {1-59593-323-9},
  keywords = {features spam web},
  pages = {83--92},
  publisher = {ACM},
  timestamp = {2008-12-09T16:32:51.000+0100},
  title = {Detecting spam web pages through content analysis},
  url = {http://portal.acm.org/citation.cfm?id=1135794},
  venue = {Edinburgh, Scotland},
  year = 2006
}