In this paper we investigate the nature and structure of the relation between imposed classifications and real clustering in a particular case of a scale-free network given by the on-line encyclopedia Wikipedia. We find a statistical similarity in the distributions of community sizes both by using the top-down approach of the categories division present in the archive and in the bottom-up procedure of community detection given by an algorithm based on the spectral properties of the graph. Regardless the statistically similar behaviour the two methods provide a rather different division of the articles, thereby signaling that the nature and presence of power laws is a general feature for these systems and cannot be used as a benchmark to evaluate the suitability of a clustering method.
Description
Taxonomy and clustering in collaborative systems: the case of the on-line encyclopedia Wikipedia
%0 Generic
%1 capocci2007taxonomy
%A Capocci, A.
%A Rao, F.
%A Caldarelli, G.
%D 2007
%K wikipedia taxonomy toread comparison clustering
%T Taxonomy and clustering in collaborative systems: the case of the on-line encyclopedia Wikipedia
%U http://www.citebase.org/abstract?id=oai:arXiv.org:0710.3058
%X In this paper we investigate the nature and structure of the relation between imposed classifications and real clustering in a particular case of a scale-free network given by the on-line encyclopedia Wikipedia. We find a statistical similarity in the distributions of community sizes both by using the top-down approach of the categories division present in the archive and in the bottom-up procedure of community detection given by an algorithm based on the spectral properties of the graph. Regardless the statistically similar behaviour the two methods provide a rather different division of the articles, thereby signaling that the nature and presence of power laws is a general feature for these systems and cannot be used as a benchmark to evaluate the suitability of a clustering method.
@comment{
  capocci2007taxonomy: arXiv preprint record exported from BibSonomy.
  The eprint identifier 0710.3058 is copied from the query string of the
  url field. The fields added-at, biburl, interhash, intrahash and
  timestamp are export bookkeeping; they are not standard bibliography
  fields, and unknown fields are ignored by bibliography styles.
  Author given names are stored as initials because the export provides
  nothing more.
}
@misc{capocci2007taxonomy,
  author        = {Capocci, A. and Rao, F. and Caldarelli, G.},
  title         = {Taxonomy and clustering in collaborative systems: the case of the on-line encyclopedia {Wikipedia}},
  year          = {2007},
  eprint        = {0710.3058},
  archiveprefix = {arXiv},
  url           = {http://www.citebase.org/abstract?id=oai:arXiv.org:0710.3058},
  abstract      = { In this paper we investigate the nature and structure of the relation between imposed classifications and real clustering in a particular case of a scale-free network given by the on-line encyclopedia Wikipedia. We find a statistical similarity in the distributions of community sizes both by using the top-down approach of the categories division present in the archive and in the bottom-up procedure of community detection given by an algorithm based on the spectral properties of the graph. Regardless the statistically similar behaviour the two methods provide a rather different division of the articles, thereby signaling that the nature and presence of power laws is a general feature for these systems and cannot be used as a benchmark to evaluate the suitability of a clustering method.},
  keywords      = {wikipedia taxonomy toread comparison clustering},
  description   = {Taxonomy and clustering in collaborative systems: the case of the on-line encyclopedia Wikipedia},
  added-at      = {2011-01-28T11:34:01.000+0100},
  timestamp     = {2013-07-31T15:39:42.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/29c69bc97d22b7e5c2d90d8765b491a16/dbenz},
  interhash     = {df8a20aa40cce46aa0adf4f6360664dc},
  intrahash     = {9c69bc97d22b7e5c2d90d8765b491a16},
}