Word embedding is a key component in many downstream applications in
natural language processing. Existing approaches often assume the existence of
a large collection of text for learning effective word embeddings. However, such
a corpus may not be available for some low-resource languages. In this paper,
we study how to effectively learn a word embedding model on a corpus with only
a few million tokens. In such a situation, the co-occurrence matrix is sparse,
as the co-occurrences of many word pairs are unobserved. In contrast to
existing approaches, which often sample only a few unobserved word pairs as
negative samples, we argue that the zero entries in the co-occurrence matrix
also provide valuable information. We then design a Positive-Unlabeled Learning
(PU-Learning) approach to factorize the co-occurrence matrix and validate the
proposed approach on four different languages.
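For intuition, the sketch below illustrates the kind of weighted factorization the abstract alludes to: zero entries of the co-occurrence matrix stay in the objective with a small uniform weight rho, rather than being discarded or subsampled as negatives. This is an illustrative toy, not the authors' implementation; the paper's actual PU formulation and optimization procedure differ, this sketch uses plain full-gradient descent, and all names and hyperparameters here (pu_factorize, rho, reg, lr) are assumptions.

import numpy as np

def pu_factorize(C, dim=32, rho=0.0625, reg=0.1, lr=0.02, iters=500, seed=0):
    """Factorize a word-context co-occurrence matrix C ~= W @ H.T with a
    weighted squared loss. Observed (positive) entries get weight 1;
    unobserved (zero) entries are treated as unlabeled and get a small
    uniform weight rho, so zeros still contribute a weak negative signal
    instead of being ignored."""
    rng = np.random.default_rng(seed)
    n, m = C.shape
    W = rng.normal(scale=0.1, size=(n, dim))   # word vectors
    H = rng.normal(scale=0.1, size=(m, dim))   # context vectors
    weight = np.where(C > 0, 1.0, rho)         # PU weighting of entries
    for _ in range(iters):
        R = weight * (W @ H.T - C)             # weighted residual
        W_new = W - lr * (R @ H + reg * W)     # gradient step on word vectors
        H_new = H - lr * (R.T @ W + reg * H)   # gradient step on context vectors
        W, H = W_new, H_new
    return W, H

if __name__ == "__main__":
    # Toy counts; in practice C would hold PPMI-style association scores.
    C = np.array([[2.0, 0.0, 1.0, 0.0],
                  [0.0, 3.0, 0.0, 0.0],
                  [1.0, 0.0, 0.0, 2.0]])
    W, H = pu_factorize(C, dim=2)
    print(np.round(W @ H.T, 2))

Raising rho toward 1 treats every zero as a confident negative, while shrinking it toward 0 recovers the behavior of ignoring unobserved pairs, so rho controls how much trust is placed in the unlabeled zeros.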
Description
[1805.03366] Learning Word Embeddings for Low-resource Languages by PU Learning
%0 Generic
%1 jiang2018learningword
%A Jiang, Chao
%A Yu, Hsiang-Fu
%A Hsieh, Cho-Jui
%A Chang, Kai-Wei
%D 2018
%K naacl2018 session6 wordembeddings
%T Learning Word Embeddings for Low-resource Languages by PU Learning
%U http://arxiv.org/abs/1805.03366
%X Word embedding is a key component in many downstream applications in
natural language processing. Existing approaches often assume the existence of
a large collection of text for learning effective word embeddings. However, such
a corpus may not be available for some low-resource languages. In this paper,
we study how to effectively learn a word embedding model on a corpus with only
a few million tokens. In such a situation, the co-occurrence matrix is sparse,
as the co-occurrences of many word pairs are unobserved. In contrast to
existing approaches, which often sample only a few unobserved word pairs as
negative samples, we argue that the zero entries in the co-occurrence matrix
also provide valuable information. We then design a Positive-Unlabeled Learning
(PU-Learning) approach to factorize the co-occurrence matrix and validate the
proposed approach on four different languages.
@misc{jiang2018learningword,
abstract = {Word embedding is a key component in many downstream applications in
natural language processing. Existing approaches often assume the existence of
a large collection of text for learning effective word embeddings. However, such
a corpus may not be available for some low-resource languages. In this paper,
we study how to effectively learn a word embedding model on a corpus with only
a few million tokens. In such a situation, the co-occurrence matrix is sparse,
as the co-occurrences of many word pairs are unobserved. In contrast to
existing approaches, which often sample only a few unobserved word pairs as
negative samples, we argue that the zero entries in the co-occurrence matrix
also provide valuable information. We then design a Positive-Unlabeled Learning
(PU-Learning) approach to factorize the co-occurrence matrix and validate the
proposed approach on four different languages.},
added-at = {2018-06-03T19:21:44.000+0200},
author = {Jiang, Chao and Yu, Hsiang-Fu and Hsieh, Cho-Jui and Chang, Kai-Wei},
biburl = {https://www.bibsonomy.org/bibtex/2894e5e70588f65c726a006c870612f73/albinzehe},
description = {[1805.03366] Learning Word Embeddings for Low-resource Languages by PU Learning},
interhash = {0b71b9415972453b66eabd13b9bd7fdc},
intrahash = {894e5e70588f65c726a006c870612f73},
keywords = {naacl2018 session6 wordembeddings},
note = {cite arxiv:1805.03366. Comment: Published in NAACL 2018},
timestamp = {2018-06-03T19:21:44.000+0200},
title = {Learning Word Embeddings for Low-resource Languages by PU Learning},
url = {http://arxiv.org/abs/1805.03366},
year = 2018
}