M. Kusner, Y. Sun, N. Kolkin, and K. Weinberger. Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37, page 957--966. JMLR.org, (2015)
Abstract
We present the Word Mover's Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local cooccurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover's Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.
%0 Conference Paper
%1 Kusner:2015:WED:3045118.3045221
%A Kusner, Matt J.
%A Sun, Yu
%A Kolkin, Nicholas I.
%A Weinberger, Kilian Q.
%B Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37
%D 2015
%I JMLR.org
%K anjali hierarchical similarity-based word-movers-distance word-vectors
%P 957--966
%T From Word Embeddings to Document Distances
%U http://proceedings.mlr.press/v37/kusnerb15.pdf
%X We present the Word Mover's Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local cooccurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover's Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.
@inproceedings{Kusner:2015:WED:3045118.3045221,
  abstract  = {We present the Word Mover's Distance (WMD), a novel distance function between text documents. Our work is based on recent results in word embeddings that learn semantically meaningful representations for words from local cooccurrences in sentences. The WMD distance measures the dissimilarity between two text documents as the minimum amount of distance that the embedded words of one document need to "travel" to reach the embedded words of another document. We show that this distance metric can be cast as an instance of the Earth Mover's Distance, a well studied transportation problem for which several highly efficient solvers have been developed. Our metric has no hyperparameters and is straight-forward to implement. Further, we demonstrate on eight real world document classification data sets, in comparison with seven state-of-the-art baselines, that the WMD metric leads to unprecedented low k-nearest neighbor document classification error rates.},
  acmid     = {3045221},
  added-at  = {2019-09-02T10:29:26.000+0200},
  author    = {Kusner, Matt J. and Sun, Yu and Kolkin, Nicholas I. and Weinberger, Kilian Q.},
  biburl    = {https://www.bibsonomy.org/bibtex/27efee61130797762b3143ab47fef05d3/ghagerer},
  booktitle = {Proceedings of the 32nd International Conference on International Conference on Machine Learning - Volume 37},
  interhash = {c5dcb8584655bdb835890936f89766f2},
  intrahash = {7efee61130797762b3143ab47fef05d3},
  keywords  = {anjali hierarchical similarity-based word-movers-distance word-vectors},
  location  = {Lille, France},
  numpages  = {10},
  pages     = {957--966},
  publisher = {JMLR.org},
  series    = {ICML'15},
  timestamp = {2020-04-21T13:28:00.000+0200},
  title     = {From Word Embeddings to Document Distances},
  url       = {http://proceedings.mlr.press/v37/kusnerb15.pdf},
  year      = {2015},
}