With the increasing use of research paper search engines, such as CiteSeer, for both literature search and hiring decisions, the accuracy of such systems is of paramount importance. This paper employs Conditional Random Fields (CRFs) for the task of extracting various common fields from the headers and citation of research papers. The basic theory of CRFs is becoming well-understood, but best-practices for applying them to real-world data requires additional exploration. This paper makes an empirical exploration of several factors, including variations on Gaussian, exponential and hyperbolic-priors for improved regularization, and several classes of features and Markov order. On a standard benchmark data set, we achieve new state-of-the-art performance, reducing error in average F1 by 36%, and word error rate by 78% in comparison with the previous best SVM results. Accuracy compares even more favorably against HMMs.
%0 Conference Paper
%1 peng2004accurate
%A Peng, Fuchun
%A McCallum, Andrew
%B Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics: HLT-NAACL 2004
%D 2004
%I Association for Computational Linguistics
%K crf extraction information paper research thema thema:crf
%P 329--336
%T Accurate Information Extraction from Research Papers using Conditional Random Fields
%X With the increasing use of research paper search engines, such as CiteSeer, for both literature search and hiring decisions, the accuracy of such systems is of paramount importance. This paper employs Conditional Random Fields (CRFs) for the task of extracting various common fields from the headers and citation of research papers. The basic theory of CRFs is becoming well-understood, but best-practices for applying them to real-world data requires additional exploration. This paper makes an empirical exploration of several factors, including variations on Gaussian, exponential and hyperbolic-priors for improved regularization, and several classes of features and Markov order. On a standard benchmark data set, we achieve new state-of-the-art performance, reducing error in average F1 by 36%, and word error rate by 78% in comparison with the previous best SVM results. Accuracy compares even more favorably against HMMs.
@comment{peng2004accurate: conference paper from HLT-NAACL 2004, typed as inproceedings with a booktitle. The exported article type carried no journal, and its volume field only repeated the year, so that field is dropped. Percent signs in the abstract are backslash-escaped so the field can be typeset by LaTeX.}
@inproceedings{peng2004accurate,
  author    = {Peng, Fuchun and McCallum, Andrew},
  title     = {Accurate Information Extraction from Research Papers using Conditional Random Fields},
  booktitle = {Proceedings of the Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics: {HLT-NAACL} 2004},
  publisher = {Association for Computational Linguistics},
  pages     = {329--336},
  year      = {2004},
  abstract  = {With the increasing use of research paper search engines, such as CiteSeer, for both literature search and hiring decisions, the accuracy of such systems is of paramount importance. This paper employs Conditional Random Fields (CRFs) for the task of extracting various common fields from the headers and citation of research papers. The basic theory of CRFs is becoming well-understood, but best-practices for applying them to real-world data requires additional exploration. This paper makes an empirical exploration of several factors, including variations on Gaussian, exponential and hyperbolic-priors for improved regularization, and several classes of features and Markov order. On a standard benchmark data set, we achieve new state-of-the-art performance, reducing error in average F1 by 36\%, and word error rate by 78\% in comparison with the previous best SVM results. Accuracy compares even more favorably against HMMs.},
  keywords  = {crf extraction information paper research thema thema:crf},
  added-at  = {2016-10-05T18:51:45.000+0200},
  timestamp = {2016-10-05T22:37:57.000+0200},
  biburl    = {https://www.bibsonomy.org/bibtex/202e982ebb16b9c2c5bd224b665689215/nosebrain},
  interhash = {8f9ef6b359fef3bd08bfed653fe1bb55},
  intrahash = {02e982ebb16b9c2c5bd224b665689215},
}