A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words that allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model simultaneously learns (1) a distributed representation for each word and (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before is assigned high probability if it is made of words that are similar (in the sense of having a nearby representation) to the words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models and that it allows the model to take advantage of longer contexts.
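The architecture summarized in the abstract combines a shared word-embedding table with a neural network that maps the embeddings of the previous n-1 words to a probability distribution over the next word. The snippet below is a minimal NumPy sketch of that idea rather than the paper's exact implementation: the vocabulary size, embedding dimension, context length, hidden width, and the function name next_word_distribution are all illustrative, and the optional direct connections from the embeddings to the output layer described in the paper are omitted.

import numpy as np

# Minimal sketch of the model described above: a shared embedding matrix C
# ("distributed representation for each word") feeding a tanh hidden layer
# and a softmax over the vocabulary ("probability function for word
# sequences"). Sizes are illustrative, not the paper's settings.
rng = np.random.default_rng(0)

V = 1000   # vocabulary size (illustrative)
m = 30     # embedding (word feature) dimension
n = 4      # n-gram order: predict the n-th word from the previous n-1
h = 50     # hidden layer width

C = rng.normal(scale=0.01, size=(V, m))            # word embeddings
H = rng.normal(scale=0.01, size=(h, (n - 1) * m))  # context -> hidden
d = np.zeros(h)                                    # hidden bias
U = rng.normal(scale=0.01, size=(V, h))            # hidden -> output
b = np.zeros(V)                                    # output bias

def next_word_distribution(context_ids):
    """P(w_t | previous n-1 words), given n-1 word indices."""
    x = C[context_ids].reshape(-1)       # concatenate the context embeddings
    a = np.tanh(d + H @ x)               # hidden layer
    logits = b + U @ a                   # one score per vocabulary word
    e = np.exp(logits - logits.max())    # numerically stable softmax
    return e / e.sum()

p = next_word_distribution([12, 7, 345])  # three context word indices
print(p.shape, round(p.sum(), 6))         # (1000,) 1.0

Training such a model would maximize the log-likelihood of observed n-grams by gradient ascent over C, H, d, U, and b jointly, which is how the word representations and the probability function are learned simultaneously.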
@article{Bengio:2003:NPL:944919.944966,
abstract = {A goal of statistical language modeling is to learn the joint probability function of sequences of words in a language. This is intrinsically difficult because of the curse of dimensionality: a word sequence on which the model will be tested is likely to be different from all the word sequences seen during training. Traditional but very successful approaches based on n-grams obtain generalization by concatenating very short overlapping sequences seen in the training set. We propose to fight the curse of dimensionality by learning a distributed representation for words which allows each training sentence to inform the model about an exponential number of semantically neighboring sentences. The model learns simultaneously (1) a distributed representation for each word along with (2) the probability function for word sequences, expressed in terms of these representations. Generalization is obtained because a sequence of words that has never been seen before gets high probability if it is made of words that are similar (in the sense of having a nearby representation) to words forming an already seen sentence. Training such large models (with millions of parameters) within a reasonable time is itself a significant challenge. We report on experiments using neural networks for the probability function, showing on two text corpora that the proposed approach significantly improves on state-of-the-art n-gram models, and that the proposed approach allows to take advantage of longer contexts.},
acmid = {944966},
author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Janvin, Christian},
biburl = {https://www.bibsonomy.org/bibtex/25bc1d3d1be6247dd6365014919a711ef/albinzehe},
description = {A neural probabilistic language model},
issn = {1532-4435},
issue_date = {3/1/2003},
journal = {J. Mach. Learn. Res.},
keywords = {ma-zehe thema:word_embeddings wordembeddings},
month = mar,
numpages = {19},
pages = {1137--1155},
publisher = {JMLR.org},
title = {A Neural Probabilistic Language Model},
url = {http://dl.acm.org/citation.cfm?id=944919.944966},
volume = 3,
year = 2003
}