Social media outlets such as Twitter have become an important forum for peer interaction. Thus the ability to classify latent user attributes, including gender, age, regional origin, and political orientation solely from Twitter user language or similar highly informal content has important applications in advertising, personalization, and recommendation. This paper includes a novel investigation of stacked-SVM-based classification algorithms over a rich set of original features, applied to classifying these four user attributes. It also includes extensive analysis of features and approaches that are effective and not effective in classifying user attributes in Twitter-style informal written genres as distinct from the other primarily spoken genres previously studied in the user-property classification literature. Our models, singly and in ensemble, significantly outperform baseline models in all cases. A detailed analysis of model components and features provides an often entertaining insight into distinctive language-usage variation across gender, age, regional origin and political orientation in modern informal communication.
Description
CiteSeerX — Classifying latent user attributes in Twitter
%0 Conference Paper
%1 Rao10classifyinglatent
%A Rao, Delip
%A Yarowsky, David
%A Shreevats, Abhishek
%A Gupta, Manaswi
%B Proc. of SMUC
%D 2010
%K classification machineLearning scienceontwitter twitter twitterScholar
%T Classifying latent user attributes in Twitter
%U http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.208.5011
%X Social media outlets such as Twitter have become an important forum for peer interaction. Thus the ability to classify latent user attributes, including gender, age, regional origin, and political orientation solely from Twitter user language or similar highly informal content has important applications in advertising, personalization, and recommendation. This paper includes a novel investigation of stacked-SVM-based classification algorithms over a rich set of original features, applied to classifying these four user attributes. It also includes extensive analysis of features and approaches that are effective and not effective in classifying user attributes in Twitter-style informal written genres as distinct from the other primarily spoken genres previously studied in the user-property classification literature. Our models, singly and in ensemble, significantly outperform baseline models in all cases. A detailed analysis of model components and features provides an often entertaining insight into distinctive language-usage variation across gender, age, regional origin and political orientation in modern informal communication.
@comment{Conference paper record carrying BibSonomy bookkeeping fields (biburl, interhash, intrahash, added-at, timestamp). The booktitle omits a leading In because bibliography styles insert that word themselves for inproceedings entries. Twitter and SMUC are brace-protected so sentence-casing styles keep their capitals.}
@inproceedings{Rao10classifyinglatent,
  author      = {Rao, Delip and Yarowsky, David and Shreevats, Abhishek and Gupta, Manaswi},
  title       = {Classifying latent user attributes in {Twitter}},
  booktitle   = {Proc. of {SMUC}},
  year        = {2010},
  url         = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.208.5011},
  abstract    = {Social media outlets such as Twitter have become an important forum for peer interaction. Thus the ability to classify latent user attributes, including gender, age, regional origin, and political orientation solely from Twitter user language or similar highly informal content has important applications in advertising, personalization, and recommendation. This paper includes a novel investigation of stacked-SVM-based classification algorithms over a rich set of original features, applied to classifying these four user attributes. It also includes extensive analysis of features and approaches that are effective and not effective in classifying user attributes in Twitter-style informal written genres as distinct from the other primarily spoken genres previously studied in the user-property classification literature. Our models, singly and in ensemble, significantly outperform baseline models in all cases. A detailed analysis of model components and features provides an often entertaining insight into distinctive language-usage variation across gender, age, regional origin and political orientation in modern informal communication.},
  keywords    = {classification machineLearning scienceontwitter twitter twitterScholar},
  description = {CiteSeerX --- Classifying latent user attributes in Twitter},
  biburl      = {https://www.bibsonomy.org/bibtex/27f8739f57eac514804224cc0b507831b/asmelash},
  interhash   = {16f8b7a2859cc53b47128332a6814818},
  intrahash   = {7f8739f57eac514804224cc0b507831b},
  added-at    = {2013-05-31T12:46:01.000+0200},
  timestamp   = {2013-07-30T18:42:08.000+0200},
}