Machine learning approaches to multi-label document classification have (to
date) largely relied on discriminative modeling techniques such as support
vector machines. A drawback of these approaches is that performance rapidly
drops off as the total number of labels and the number of labels per document
increase. This problem is amplified when the label frequencies exhibit the type
of highly skewed distributions that are often observed in real-world datasets.
In this paper we investigate a class of generative statistical topic models for
multi-label documents that associate individual word tokens with different
labels. We investigate the advantages of this approach relative to
discriminative models, particularly with respect to classification problems
involving large numbers of relatively rare labels. We compare the performance
of generative and discriminative approaches on document labeling tasks ranging
from datasets with several thousand labels to datasets with tens of labels. The
experimental results indicate that generative models can achieve competitive
multi-label classification performance compared to discriminative methods, and
have advantages for datasets with many labels and skewed label frequencies.
Description
Statistical Topic Models for Multi-Label Document Classification
%0 Generic
%1 Rubin2011
%A Rubin, Timothy N.
%A Chambers, America
%A Smyth, Padhraic
%A Steyvers, Mark
%D 2011
%K mining model text tm topic toread
%T Statistical Topic Models for Multi-Label Document Classification
%U http://arxiv.org/abs/1107.2462
%X Machine learning approaches to multi-label document classification have (to
date) largely relied on discriminative modeling techniques such as support
vector machines. A drawback of these approaches is that performance rapidly
drops off as the total number of labels and the number of labels per document
increase. This problem is amplified when the label frequencies exhibit the type
of highly skewed distributions that are often observed in real-world datasets.
In this paper we investigate a class of generative statistical topic models for
multi-label documents that associate individual word tokens with different
labels. We investigate the advantages of this approach relative to
discriminative models, particularly with respect to classification problems
involving large numbers of relatively rare labels. We compare the performance
of generative and discriminative approaches on document labeling tasks ranging
from datasets with several thousand labels to datasets with tens of labels. The
experimental results indicate that generative models can achieve competitive
multi-label classification performance compared to discriminative methods, and
have advantages for datasets with many labels and skewed label frequencies.
% Rubin et al. (2011), arXiv preprint.
% The arXiv identifier is stored in the dedicated eprint/archiveprefix fields
% (not in a free-text note) so eprint-aware styles can format and link it.
% added-at, biburl, interhash, intrahash and timestamp are BibSonomy export
% metadata; standard styles ignore unknown fields.
@misc{Rubin2011,
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  year          = {2011},
  eprint        = {1107.2462},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1107.2462},
  abstract      = {Machine learning approaches to multi-label document classification have (to
    date) largely relied on discriminative modeling techniques such as support
    vector machines. A drawback of these approaches is that performance rapidly
    drops off as the total number of labels and the number of labels per document
    increase. This problem is amplified when the label frequencies exhibit the type
    of highly skewed distributions that are often observed in real-world datasets.
    In this paper we investigate a class of generative statistical topic models for
    multi-label documents that associate individual word tokens with different
    labels. We investigate the advantages of this approach relative to
    discriminative models, particularly with respect to classification problems
    involving large numbers of relatively rare labels. We compare the performance
    of generative and discriminative approaches on document labeling tasks ranging
    from datasets with several thousand labels to datasets with tens of labels. The
    experimental results indicate that generative models can achieve competitive
    multi-label classification performance compared to discriminative methods, and
    have advantages for datasets with many labels and skewed label frequencies.},
  description   = {Statistical Topic Models for Multi-Label Document Classification},
  keywords      = {mining model text tm topic toread},
  added-at      = {2011-09-14T08:20:38.000+0200},
  timestamp     = {2011-09-14T08:20:38.000+0200},
  biburl        = {https://www.bibsonomy.org/bibtex/2f8a5a3958ae264d19c7f5415eb7f0bce/hotho},
  interhash     = {e09d5d8587756d460a5d834025e75aac},
  intrahash     = {f8a5a3958ae264d19c7f5415eb7f0bce},
}