We propose a new approach for thematic text clustering. The text clusters are used to generate domain specific language models in order to address the problem of language model adaptation. The method relies on a new discriminative n-gram based term selection process (n>1), which reduces the influence of the corpus inhomogeneity, and outputs only semantically focused n-grams as being the most representative key terms in the corpus. These key terms are then used to automatically cluster the whole document collection and generate LM out of these text clusters. Different key term selection methods are evaluated using perplexity as a measure. Automatically computed clusters are compared with manually assigned labelling according to genre information. The results of these experimental studies are presented and discussed. Compared to the manual clustering, a significant performance improvement between 21.87 % and 53.12 % is observed depending on the chosen key term selection method.
%0 Conference Paper
%1 valsan03clustering
%A Valsan, Z.
%A Emele, M.
%B Automatic Speech Recognition and Understanding, 2003. ASRU '03. 2003 IEEE Workshop on
%D 2003
%K state.printed research.clustering state.toRead research.nlp research.kr.domain
%P 513-518
%R 10.1109/ASRU.2003.1318493
%T Thematic text clustering for domain specific language model adaptation
%U http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1318493
%X We propose a new approach for thematic text clustering. The text clusters are used to generate domain specific language models in order to address the problem of language model adaptation. The method relies on a new discriminative n-gram based term selection process (n>1), which reduces the influence of the corpus inhomogeneity, and outputs only semantically focused n-grams as being the most representative key terms in the corpus. These key terms are then used to automatically cluster the whole document collection and generate LM out of these text clusters. Different key term selection methods are evaluated using perplexity as a measure. Automatically computed clusters are compared with manually assigned labelling according to genre information. The results of these experimental studies are presented and discussed. Compared to the manual clustering, a significant performance improvement between 21.87 % and 53.12 % is observed depending on the chosen key term selection method.
%@ 0-7803-7980-2
@inproceedings{valsan03clustering,
  author    = {Valsan, Z. and Emele, M.},
  title     = {Thematic text clustering for domain specific language model adaptation},
  booktitle = {2003 {IEEE} Workshop on Automatic Speech Recognition and Understanding ({ASRU} '03)},
  pages     = {513--518},
  year      = {2003},
  isbn      = {0-7803-7980-2},
  doi       = {10.1109/ASRU.2003.1318493},
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=1318493},
  abstract  = {We propose a new approach for thematic text clustering. The text clusters are used to generate domain specific language models in order to address the problem of language model adaptation. The method relies on a new discriminative n-gram based term selection process (n>1), which reduces the influence of the corpus inhomogeneity, and outputs only semantically focused n-grams as being the most representative key terms in the corpus. These key terms are then used to automatically cluster the whole document collection and generate LM out of these text clusters. Different key term selection methods are evaluated using perplexity as a measure. Automatically computed clusters are compared with manually assigned labelling according to genre information. The results of these experimental studies are presented and discussed. Compared to the manual clustering, a significant performance improvement between 21.87~\% and 53.12~\% is observed depending on the chosen key term selection method.},
  keywords  = {state.printed research.clustering state.toRead research.nlp research.kr.domain},
  file      = {valsan03clustering.pdf:papers\\valsan03clustering.pdf:PDF},
  added-at  = {2009-06-25T16:49:08.000+0200},
  timestamp = {2009-06-25T16:49:08.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/25db7751ea4549230d191e6c34edcc79b/msn},
  interhash = {9510c6a2458f27127e258a7ae4f4d2ba},
  intrahash = {5db7751ea4549230d191e6c34edcc79b},
}