The word2vec model and application by Mikolov et al. have attracted a great
amount of attention in the past two years. The vector representations of words
learned by word2vec models have been shown to carry semantic meanings and are
useful in various NLP tasks. As an increasing number of researchers would like
to experiment with word2vec or similar techniques, I notice that there is a lack
of material that comprehensively explains the parameter learning process of word
embedding models in detail, thus preventing researchers who are non-experts
in neural networks from understanding the working mechanism of such models.
This note provides detailed derivations and explanations of the parameter
update equations of the word2vec models, including the original continuous
bag-of-word (CBOW) and skip-gram (SG) models, as well as advanced optimization
techniques, including hierarchical softmax and negative sampling. Intuitive
interpretations of the gradient equations are also provided alongside
mathematical derivations.
In the appendix, a review of the basics of neural networks and
backpropagation is provided. I also created an interactive demo, wevi, to
facilitate the intuitive understanding of the model.
%0 Generic
%1 rong2014word2vec
%A Rong, Xin
%D 2014
%K learning parameter word2vec
%T word2vec Parameter Learning Explained
%U http://arxiv.org/abs/1411.2738
%X The word2vec model and application by Mikolov et al. have attracted a great
amount of attention in recent two years. The vector representations of words
learned by word2vec models have been shown to carry semantic meanings and are
useful in various NLP tasks. As an increasing number of researchers would like
to experiment with word2vec or similar techniques, I notice that there lacks a
material that comprehensively explains the parameter learning process of word
embedding models in details, thus preventing researchers that are non-experts
in neural networks from understanding the working mechanism of such models.
This note provides detailed derivations and explanations of the parameter
update equations of the word2vec models, including the original continuous
bag-of-word (CBOW) and skip-gram (SG) models, as well as advanced optimization
techniques, including hierarchical softmax and negative sampling. Intuitive
interpretations of the gradient equations are also provided alongside
mathematical derivations.
In the appendix, a review on the basics of neuron networks and
backpropagation is provided. I also created an interactive demo, wevi, to
facilitate the intuitive understanding of the model.
@misc{rong2014word2vec,
  abstract      = {The word2vec model and application by Mikolov et al. have attracted a great
amount of attention in recent two years. The vector representations of words
learned by word2vec models have been shown to carry semantic meanings and are
useful in various NLP tasks. As an increasing number of researchers would like
to experiment with word2vec or similar techniques, I notice that there lacks a
material that comprehensively explains the parameter learning process of word
embedding models in details, thus preventing researchers that are non-experts
in neural networks from understanding the working mechanism of such models.
This note provides detailed derivations and explanations of the parameter
update equations of the word2vec models, including the original continuous
bag-of-word (CBOW) and skip-gram (SG) models, as well as advanced optimization
techniques, including hierarchical softmax and negative sampling. Intuitive
interpretations of the gradient equations are also provided alongside
mathematical derivations.
In the appendix, a review on the basics of neuron networks and
backpropagation is provided. I also created an interactive demo, wevi, to
facilitate the intuitive understanding of the model.},
  added-at      = {2016-06-10T09:35:05.000+0200},
  archiveprefix = {arXiv},
  author        = {Rong, Xin},
  biburl        = {https://www.bibsonomy.org/bibtex/2d49eb5ce192681816f96e2e6df368d4f/thoni},
  eprint        = {1411.2738},
  interhash     = {2254925bfa0cfc42fa403b528b5e47b0},
  intrahash     = {d49eb5ce192681816f96e2e6df368d4f},
  keywords      = {learning parameter word2vec},
  timestamp     = {2016-09-06T08:23:07.000+0200},
  title         = {{word2vec} Parameter Learning Explained},
  url           = {http://arxiv.org/abs/1411.2738},
  year          = {2014},
}