In this paper, we study the effect of using n-grams (sequences of words of length n) for
text categorization. We use an efficient algorithm for generating such n-gram features in two
benchmark domains, the 20 newsgroups data set and 21,578 REUTERS newswire articles.
Our results with the rule learning algorithm RIPPER indicate that, after the removal of stop
words, word sequences of length 2 or 3 are most useful. Using longer sequences reduces
classification performance.
1 Introduction
After...
%0 Generic
%1 citeulike:1952805
%A Fürnkranz, Johannes
%D 1998
%K categorization, learning, machine, n-gram, text
%T A study using n-gram features for text categorization
%U http://citeseer.ist.psu.edu/176994.html
%X In this paper, we study the effect of using n-grams (sequences of words of length n) for
text categorization. We use an efficient algorithm for generating such n-gram features in two
benchmark domains, the 20 newsgroups data set and 21,578 REUTERS newswire articles.
Our results with the rule learning algorithm RIPPER indicate that, after the removal of stop
words, word sequences of length 2 or 3 are most useful. Using longer sequences reduces
classification performance.
@comment{
  Technical report by Johannes Fuernkranz on n-gram features for text
  categorization. The exported record listed the author as "Johannes, F."
  (given name in the surname slot, surname cut to an initial); the author
  field below stores the name in "Last, First" form. The abstract no longer
  carries the scraped start of the paper body ("1 Introduction / After...").
  The fields from citeulike-article-id downwards are BibSonomy/CiteULike
  bookkeeping; bibliography styles ignore them.
}
@techreport{citeulike:1952805,
  author               = {F{\"u}rnkranz, Johannes},
  title                = {A Study Using n-gram Features for Text Categorization},
  institution          = {Austrian Research Institute for Artificial Intelligence},
  number               = {OEFAI-TR-98-30},
  address              = {Vienna, Austria},
  year                 = {1998},
  url                  = {http://citeseer.ist.psu.edu/176994.html},
  abstract             = {In this paper, we study the effect of using n-grams (sequences of words of length n) for
text categorization. We use an efficient algorithm for generating such n-gram features in two
benchmark domains, the 20 newsgroups data set and 21,578 REUTERS newswire articles.
Our results with the rule learning algorithm RIPPER indicate that, after the removal of stop
words, word sequences of length 2 or 3 are most useful. Using longer sequences reduces
classification performance.},
  keywords             = {categorization, learning, machine, n-gram, text},
  citeulike-article-id = {1952805},
  priority             = {0},
  posted-at            = {2007-11-21 16:18:49},
  added-at             = {2008-06-17T16:01:02.000+0200},
  timestamp            = {2008-06-17T16:01:43.000+0200},
  biburl               = {https://www.bibsonomy.org/bibtex/25ba0a2a2c2343196a59fe853a7b2675c/pprett},
  interhash            = {d5cc63317b73ae062a31033fc01f1a8a},
  intrahash            = {5ba0a2a2c2343196a59fe853a7b2675c},
}