As language evolves over time, documents stored in long-term archives become inaccessible to users. Automatically detecting and handling language evolution will become a necessity to meet users’ information needs. In this paper, we investigate the performance of modern tools and algorithms applied on modern English to find word senses that will later serve as a basis for finding evolution. We apply the curvature clustering algorithm on all nouns and noun phrases extracted from The Times Archive (1785–1985). We use natural language processors for part-of-speech tagging and lemmatization and report on the performance of these processors over the entire period. We evaluate our clusters using WordNet to verify whether they correspond to valid word senses. Because The Times Archive contains OCR errors, we investigate the effects of such errors on word sense discrimination results. Finally, we present a novel approach to correct OCR errors present in the archive and show that the coverage of the curvature clustering algorithm improves. We increase the number of clusters by 24%. To verify our results, we use the New York Times corpus (1987–2007), a recent collection that is considered error free, as a ground truth for our experiments. We find that after correcting OCR errors in The Times Archive, the performance of word sense discrimination applied on The Times Archive is comparable to the ground truth.
%0 Journal Article
%1 tahmasebi2013applicability
%A Tahmasebi, Nina
%A Niklas, Kai
%A Zenz, Gideon
%A Risse, Thomas
%D 2013
%I Springer-Verlag
%J International Journal on Digital Libraries
%K 2013 arcomem language myown wordsense_discrimination
%P 1-19
%R 10.1007/s00799-013-0105-8
%T On the applicability of word sense discrimination on 201 years of modern English
%X As language evolves over time, documents stored in long-term archives become inaccessible to users. Automatically detecting and handling language evolution will become a necessity to meet users’ information needs. In this paper, we investigate the performance of modern tools and algorithms applied on modern English to find word senses that will later serve as a basis for finding evolution. We apply the curvature clustering algorithm on all nouns and noun phrases extracted from The Times Archive (1785–1985). We use natural language processors for part-of-speech tagging and lemmatization and report on the performance of these processors over the entire period. We evaluate our clusters using WordNet to verify whether they correspond to valid word senses. Because The Times Archive contains OCR errors, we investigate the effects of such errors on word sense discrimination results. Finally, we present a novel approach to correct OCR errors present in the archive and show that the coverage of the curvature clustering algorithm improves. We increase the number of clusters by 24%. To verify our results, we use the New York Times corpus (1987–2007), a recent collection that is considered error free, as a ground truth for our experiments. We find that after correcting OCR errors in The Times Archive, the performance of word sense discrimination applied on The Times Archive is comparable to the ground truth.
% Journal article entry. Fields such as added-at, biburl, interhash, intrahash
% and timestamp are not used by standard styles; they look like export
% bookkeeping from the site named in biburl (presumably safe to keep as-is).
@article{tahmasebi2013applicability,
  abstract  = {As language evolves over time, documents stored in long-term archives become inaccessible to users. Automatically detecting and handling language evolution will become a necessity to meet users' information needs. In this paper, we investigate the performance of modern tools and algorithms applied on modern English to find word senses that will later serve as a basis for finding evolution. We apply the curvature clustering algorithm on all nouns and noun phrases extracted from The Times Archive (1785--1985). We use natural language processors for part-of-speech tagging and lemmatization and report on the performance of these processors over the entire period. We evaluate our clusters using WordNet to verify whether they correspond to valid word senses. Because The Times Archive contains OCR errors, we investigate the effects of such errors on word sense discrimination results. Finally, we present a novel approach to correct OCR errors present in the archive and show that the coverage of the curvature clustering algorithm improves. We increase the number of clusters by 24\%. To verify our results, we use the New York Times corpus (1987--2007), a recent collection that is considered error free, as a ground truth for our experiments. We find that after correcting OCR errors in The Times Archive, the performance of word sense discrimination applied on The Times Archive is comparable to the ground truth.},
  added-at  = {2013-03-20T14:45:43.000+0100},
  author    = {Tahmasebi, Nina and Niklas, Kai and Zenz, Gideon and Risse, Thomas},
  biburl    = {https://www.bibsonomy.org/bibtex/2f0236c94447d59197db567d28f627754/trisse69},
  doi       = {10.1007/s00799-013-0105-8},
  interhash = {b231a2213b971dbaf693f4d833e351c6},
  intrahash = {f0236c94447d59197db567d28f627754},
  issn      = {1432-5012},
  journal   = {International Journal on Digital Libraries},
  keywords  = {2013 arcomem language myown wordsense_discrimination},
  language  = {English},
  pages     = {1--19},
  publisher = {Springer-Verlag},
  timestamp = {2014-03-10T15:31:00.000+0100},
  title     = {On the Applicability of Word Sense Discrimination on 201 Years of Modern {English}},
  year      = {2013},
}