Automatic key phrase extraction is a useful tool in many text related applications such as clustering and summarization. State-of-the-art methods are aimed towards extracting key phrases from traditional text such as technical papers. Application of these methods on Web documents, which often contain diverse and heterogeneous contents, is of particular interest and challenge in the information age. In this work, we investigate the significance of narrative text classification in the task of automatic key phrase extraction in Web document corpora. We benchmark three methods, TFIDF, KEA, and Keyterm, used to extract key phrases from all the plain text and from only the narrative text of Web pages. ANOVA tests are used to analyze the ranking data collected in a user study using quantitative measures of acceptable percentage and quality value. The evaluation shows that key phrases extracted from the narrative text only are significantly better than those obtained from all plain text of Web pages. This demonstrates that narrative text classification is indispensable for effective key phrase extraction in Web document corpora.
Beschreibung
Narrative text classification for automatic key phrase extraction in web document corpora
%0 Conference Paper
%1 Zhang05narrativeKeyphrase
%A Zhang, Yongzheng
%A Zincir-Heywood, Nur
%A Milios, Evangelos
%B WIDM '05: Proceedings of the 7th annual ACM international workshop on Web information and data management
%C New York, NY, USA
%D 2005
%I ACM
%K 05 Zhang extraction keyphrase narrative text
%P 51--58
%R 10.1145/1097047.1097059
%T Narrative text classification for automatic key phrase extraction in web document corpora
%U http://portal.acm.org/citation.cfm?id=1097059
%X Automatic key phrase extraction is a useful tool in many text related applications such as clustering and summarization. State-of-the-art methods are aimed towards extracting key phrases from traditional text such as technical papers. Application of these methods on Web documents, which often contain diverse and heterogeneous contents, is of particular interest and challenge in the information age. In this work, we investigate the significance of narrative text classification in the task of automatic key phrase extraction in Web document corpora. We benchmark three methods, TFIDF, KEA, and Keyterm, used to extract key phrases from all the plain text and from only the narrative text of Web pages. ANOVA tests are used to analyze the ranking data collected in a user study using quantitative measures of acceptable percentage and quality value. The evaluation shows that key phrases extracted from the narrative text only are significantly better than those obtained from all plain text of Web pages. This demonstrates that narrative text classification is indispensable for effective key phrase extraction in Web document corpora.
%@ 1-59593-194-5
@comment{
  Conference paper from the WIDM '05 proceedings, published by ACM.
  The doi field holds the bare DOI with no resolver prefix; styles add the
  resolver themselves.
  NOTE(review): address looks like the publisher's city and location looks
  like the workshop venue. biblatex treats location as the preferred name
  for address, so confirm which one the target style should print.
  added-at, biburl, interhash, intrahash and timestamp are BibSonomy
  bookkeeping fields that standard styles ignore.
}
@inproceedings{Zhang05narrativeKeyphrase,
  abstract    = {Automatic key phrase extraction is a useful tool in many text related applications such as clustering and summarization. State-of-the-art methods are aimed towards extracting key phrases from traditional text such as technical papers. Application of these methods on Web documents, which often contain diverse and heterogeneous contents, is of particular interest and challenge in the information age. In this work, we investigate the significance of narrative text classification in the task of automatic key phrase extraction in Web document corpora. We benchmark three methods, TFIDF, KEA, and Keyterm, used to extract key phrases from all the plain text and from only the narrative text of Web pages. ANOVA tests are used to analyze the ranking data collected in a user study using quantitative measures of acceptable percentage and quality value. The evaluation shows that key phrases extracted from the narrative text only are significantly better than those obtained from all plain text of Web pages. This demonstrates that narrative text classification is indispensable for effective key phrase extraction in Web document corpora.},
  added-at    = {2010-03-11T16:49:52.000+0100},
  address     = {New York, NY, USA},
  author      = {Zhang, Yongzheng and Zincir-Heywood, Nur and Milios, Evangelos},
  biburl      = {https://www.bibsonomy.org/bibtex/22287f5b8f602381955e73825cd784275/lee_peck},
  booktitle   = {{WIDM} '05: Proceedings of the 7th annual {ACM} international workshop on {Web} information and data management},
  description = {Narrative text classification for automatic key phrase extraction in web document corpora},
  doi         = {10.1145/1097047.1097059},
  interhash   = {22c8195ff4b07d29d01090b1f632455d},
  intrahash   = {2287f5b8f602381955e73825cd784275},
  isbn        = {1-59593-194-5},
  keywords    = {05 Zhang extraction keyphrase narrative text},
  location    = {Bremen, Germany},
  pages       = {51--58},
  publisher   = {ACM},
  timestamp   = {2010-03-11T16:49:52.000+0100},
  title       = {Narrative text classification for automatic key phrase extraction in web document corpora},
  url         = {http://portal.acm.org/citation.cfm?id=1097059},
  year        = {2005},
}