Web spam can significantly deteriorate the quality of search
engine results. Thus there is a large incentive for commercial
search engines to detect spam pages efficiently and accurately.
In this paper we present a spam detection system
that uses the topology of the Web graph by exploiting the
link dependencies among the Web pages, and the content
of the pages themselves. We find that linked hosts tend to
belong to the same class: either both are spam or both are
non-spam. We demonstrate three methods of incorporating
the Web graph topology into the predictions obtained by
our base classifier: (i) clustering the host graph, and assigning
the label of all hosts in the cluster by majority vote, (ii)
propagating the predicted labels to neighboring hosts, and
(iii) using the predicted labels of neighboring hosts as new
features and retraining the classifier. The result is an accurate
system for detecting Web spam that can be applied in
practice to large-scale Web data.
Categories and Subject Descriptors: H.4.m [Information
Systems Applications]: Miscellaneous
General Terms: Algorithms, Measurement.
Keywords: Link spam, Content spam, Web spam
%0 Conference Paper
%1 CastilloDGMS07
%A Castillo, Carlos
%A Donato, Debora
%A Gionis, Aristides
%A Murdock, Vanessa
%A Silvestri, Fabrizio
%B SIGIR
%D 2007
%E Kraaij, Wessel
%E de Vries, Arjen P.
%E Clarke, Charles L. A.
%E Fuhr, Norbert
%E Kando, Noriko
%I ACM
%K detection neighbors spam spamdetection web
%P 423-430
%T Know your neighbors: web spam detection using the web topology
%U http://www.dcc.uchile.cl/~ccastill/papers/cdgms_2006_know_your_neighbors.pdf
%X Web spam can significantly deteriorate the quality of search
engine results. Thus there is a large incentive for commercial
search engines to detect spam pages efficiently and accurately.
In this paper we present a spam detection system
that uses the topology of the Web graph by exploiting the
link dependencies among the Web pages, and the content
of the pages themselves. We find that linked hosts tend to
belong to the same class: either both are spam or both are
non-spam. We demonstrate three methods of incorporating
the Web graph topology into the predictions obtained by
our base classifier: (i) clustering the host graph, and assigning
the label of all hosts in the cluster by majority vote, (ii)
propagating the predicted labels to neighboring hosts, and
(iii) using the predicted labels of neighboring hosts as new
features and retraining the classifier. The result is an accurate
system for detecting Web spam that can be applied in
practice to large-scale Web data.
Categories and Subject Descriptors: H.4.m [Information
Systems Applications]: Miscellaneous
General Terms: Algorithms, Measurement.
Keywords: Link spam, Content spam, Web spam
%@ 978-1-59593-597-7
% Conference paper on Web spam detection (SIGIR 2007), BibSonomy export.
% Non-standard fields (added-at, biburl, ee, interhash, intrahash, timestamp)
% are ignored by standard styles and are kept only as export metadata.
@inproceedings{CastilloDGMS07,
  author     = {Castillo, Carlos and Donato, Debora and Gionis, Aristides and Murdock, Vanessa and Silvestri, Fabrizio},
  title      = {Know Your Neighbors: {Web} Spam Detection Using the {Web} Topology},
  booktitle  = {SIGIR},
  editor     = {Kraaij, Wessel and de Vries, Arjen P. and Clarke, Charles L. A. and Fuhr, Norbert and Kando, Noriko},
  publisher  = {ACM},
  pages      = {423--430},
  year       = {2007},
  date       = {2007-08-24},
  isbn       = {978-1-59593-597-7},
  doi        = {10.1145/1277741.1277814},
  url        = {http://www.dcc.uchile.cl/~ccastill/papers/cdgms_2006_know_your_neighbors.pdf},
  ee         = {http://doi.acm.org/10.1145/1277741.1277814},
  crossref   = {conf/sigir/2007},
  keywords   = {detection neighbors spam spamdetection web},
  abstract   = {Web spam can significantly deteriorate the quality of search
engine results. Thus there is a large incentive for commercial
search engines to detect spam pages efficiently and accurately.
In this paper we present a spam detection system
that uses the topology of the Web graph by exploiting the
link dependencies among the Web pages, and the content
of the pages themselves. We find that linked hosts tend to
belong to the same class: either both are spam or both are
non-spam. We demonstrate three methods of incorporating
the Web graph topology into the predictions obtained by
our base classifier: (i) clustering the host graph, and assigning
the label of all hosts in the cluster by majority vote, (ii)
propagating the predicted labels to neighboring hosts, and
(iii) using the predicted labels of neighboring hosts as new
features and retraining the classifier. The result is an accurate
system for detecting Web spam that can be applied in
practice to large-scale Web data.
Categories and Subject Descriptors: H.4.m [Information
Systems Applications]: Miscellaneous
General Terms: Algorithms, Measurement.
Keywords: Link spam, Content spam, Web spam},
  added-at   = {2007-10-19T20:52:17.000+0200},
  timestamp  = {2007-10-19T20:52:17.000+0200},
  biburl     = {https://www.bibsonomy.org/bibtex/2843605d513b807944ef11c076c2d0e95/steff83},
  interhash  = {c05211f197c007f2fc2d72cf4db2f5e3},
  intrahash  = {843605d513b807944ef11c076c2d0e95}
}