B. Hutchinson. Proceedings of the Fourth International Conference on Language Resources and Evaluation (LREC 2004), pages 407--410. (2004)
Abstract
This paper proposes a methodology for obtaining sentences containing discourse markers from the World Wide Web. The proposed methodology is particularly suitable for collecting large numbers of discourse marker tokens. It relies on the automatic identification of discourse markers, and we show that this can be done with an accuracy within 9% of that of human performance. We also show that the distribution of discourse markers on the web correlates highly with those in a conventional balanced corpus.
%0 Conference Paper
%1 citeulike:160787
%A Hutchinson, Ben
%B Proceedings of the Fourth International Conference on Language Resources and Evaluation (LREC 2004)
%D 2004
%K discourse-marker, natural-language
%P 407--410
%T Mining the web for discourse markers
%X This paper proposes a methodology for obtaining sentences containing discourse markers from the World Wide Web. The proposed methodology is particularly suitable for collecting large numbers of discourse marker tokens. It relies on the automatic identification of discourse markers, and we show that this can be done with an accuracy within 9% of that of human performance. We also show that the distribution of discourse markers on the web correlates highly with those in a conventional balanced corpus.
@comment{Conference paper imported from CiteULike via BibSonomy. The title is
  stored in Title Case without whole-title braces so that bibliography styles
  can apply their own casing; only the acronym in booktitle is brace-protected.
  The fields after "abstract" are BibSonomy/CiteULike bookkeeping that standard
  styles ignore.}
@inproceedings{citeulike:160787,
  author               = {Hutchinson, Ben},
  title                = {Mining the Web for Discourse Markers},
  booktitle            = {Proceedings of the Fourth International Conference on Language Resources and Evaluation ({LREC} 2004)},
  pages                = {407--410},
  year                 = {2004},
  keywords             = {discourse-marker, natural-language},
  abstract             = {This paper proposes a methodology for obtaining sentences containing discourse markers from the World Wide Web. The proposed methodology is particularly suitable for collecting large numbers of discourse marker tokens. It relies on the automatic identification of discourse markers, and we show that this can be done with an accuracy within 9\% of that of human performance. We also show that the distribution of discourse markers on the web correlates highly with those in a conventional balanced corpus.},
  added-at             = {2010-12-17T18:47:41.000+0100},
  biburl               = {https://www.bibsonomy.org/bibtex/2f17bbd4264b99df5711735afe8171db1/mortimer_m8},
  citeulike-article-id = {160787},
  interhash            = {0b155fbb79cd6a05974ee8aa26778877},
  intrahash            = {f17bbd4264b99df5711735afe8171db1},
  posted-at            = {2005-04-14 13:24:05},
  priority             = {3},
  timestamp            = {2010-12-20T11:11:25.000+0100},
}