Identifying Document Topics Using the Wikipedia Category Network
P. Schönhofen. Web Intelligence, 2006. WI 2006. IEEE/WIC/ACM International Conference on, pages 456--462. Washington, DC, USA, IEEE, (December 2006)
DOI: 10.1109/WI.2006.92
Abstract
In the last few years the size and coverage of Wikipedia, a freely available online encyclopedia, has reached the point where it can be utilized similarly to an ontology or taxonomy to identify the topics discussed in a document. In this paper we show that even a simple algorithm that exploits only the titles and categories of Wikipedia articles can characterize documents by Wikipedia categories surprisingly well. We test the reliability of our method by predicting categories of Wikipedia articles themselves based on their bodies, and by performing classification and clustering on 20 Newsgroups and RCV1, representing documents by their Wikipedia categories instead of their texts.
Description
CiteULike: Identifying Document Topics Using the Wikipedia Category Network
%0 Conference Paper
%1 citeulike:1839949
%A Schönhofen, Péter
%B Web Intelligence, 2006. WI 2006. IEEE/WIC/ACM International Conference on
%C Washington, DC, USA
%D 2006
%I IEEE
%K topics wikipedia
%P 456--462
%R 10.1109/WI.2006.92
%T Identifying Document Topics Using the Wikipedia Category Network
%U http://dx.doi.org/10.1109/WI.2006.92
%X In the last few years the size and coverage of Wikipedia, a freely available online encyclopedia, has reached the point where it can be utilized similarly to an ontology or taxonomy to identify the topics discussed in a document. In this paper we show that even a simple algorithm that exploits only the titles and categories of Wikipedia articles can characterize documents by Wikipedia categories surprisingly well. We test the reliability of our method by predicting categories of Wikipedia articles themselves based on their bodies, and by performing classification and clustering on 20 Newsgroups and RCV1, representing documents by their Wikipedia categories instead of their texts.
%@ 0-7695-2747-7
% Conference paper record imported from CiteULike (article id 1839949).
% The citation key is kept unchanged so existing \cite commands still resolve.
% Non-standard fields (added-at, biburl, citeulike-*, interhash, intrahash,
% posted-at, priority, timestamp, description) are import metadata; they are
% retained verbatim and are ignored by standard bibliography styles.
@inproceedings{citeulike:1839949,
  author               = {Sch{\"o}nhofen, P{\'e}ter},
  title                = {Identifying Document Topics Using the {Wikipedia} Category Network},
  booktitle            = {Proceedings of the 2006 {IEEE/WIC/ACM} International Conference on Web Intelligence ({WI} 2006)},
  series               = {WI '06},
  pages                = {456--462},
  publisher            = {IEEE},
  address              = {Washington, DC, USA},
  month                = dec,
  year                 = {2006},
  isbn                 = {0-7695-2747-7},
  doi                  = {10.1109/WI.2006.92},
  url                  = {https://doi.org/10.1109/WI.2006.92},
  institution          = {Comput. \& Autom. Res. Inst., Hungarian Acad. of Sci., Budapest},
  keywords             = {topics wikipedia},
  abstract             = {In the last few years the size and coverage of Wikipedia, a freely available online encyclopedia, has reached the point where it can be utilized similarly to an ontology or taxonomy to identify the topics discussed in a document. In this paper we show that even a simple algorithm that exploits only the titles and categories of Wikipedia articles can characterize documents by Wikipedia categories surprisingly well. We test the reliability of our method by predicting categories of Wikipedia articles themselves based on their bodies, and by performing classification and clustering on 20 Newsgroups and {RCV1}, representing documents by their Wikipedia categories instead of their texts.},
  description          = {CiteULike: Identifying Document Topics Using the Wikipedia Category Network},
  added-at             = {2012-06-15T15:49:27.000+0200},
  timestamp            = {2012-06-15T15:49:27.000+0200},
  posted-at            = {2012-04-26 06:40:15},
  priority             = {4},
  biburl               = {https://www.bibsonomy.org/bibtex/2d36930ac28d845fb36943d56aac7dee2/peterr},
  interhash            = {18c7887ecba23304b91f9c789e746c7b},
  intrahash            = {d36930ac28d845fb36943d56aac7dee2},
  citeulike-article-id = {1839949},
  citeulike-linkout-0  = {http://portal.acm.org/citation.cfm?id=1249180},
  citeulike-linkout-1  = {http://dx.doi.org/10.1109/WI.2006.92},
  citeulike-linkout-2  = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4061411},
}