Based on n randomly drawn vectors in a separable Hilbert space, one may construct a k-means clustering scheme by minimizing an empirical squared error. We investigate the risk of such a clustering scheme, defined as the expected squared distance of a random vector X from the set of cluster centers. Our main result states that, for an almost surely bounded X, the expected excess clustering risk is O(√(1/n)). Since clustering in high (or even infinite)-dimensional spaces may lead to severe computational problems, we examine the properties of a dimension reduction strategy for clustering based on Johnson-Lindenstrauss-type random projections. Our results reflect a tradeoff between accuracy and computational complexity when one uses k-means clustering after random projection of the data to a low-dimensional space. We argue that random projections work better than other simplistic dimension reduction schemes.
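A minimal sketch of the scheme the abstract describes (k-means after a Johnson-Lindenstrauss-type random projection), assuming a Gaussian projection matrix and plain Lloyd iterations; the paper prescribes neither choice, and all names, dimensions, and parameters below are illustrative:

import numpy as np

rng = np.random.default_rng(0)

def random_projection(X, d):
    """Map n points from R^D to R^d with a Gaussian JL-type projection.

    Entries are i.i.d. N(0, 1/d), so pairwise squared distances are
    preserved up to a (1 +/- eps) factor with high probability when
    d is on the order of eps^-2 log n.
    """
    D = X.shape[1]
    R = rng.normal(scale=1.0 / np.sqrt(d), size=(D, d))
    return X @ R

def kmeans(X, k, n_iter=100):
    """Plain Lloyd iterations minimizing the empirical squared error."""
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # Assign each point to its nearest center.
        dists = ((X[:, None, :] - centers[None, :, :]) ** 2).sum(-1)
        labels = dists.argmin(axis=1)
        # Recompute each center as the mean of its cluster.
        for j in range(k):
            if (labels == j).any():
                centers[j] = X[labels == j].mean(axis=0)
    return centers, labels

# Illustrative sizes: n = 500 points in a high-dimensional space,
# projected down to d = 20 before clustering into k = 5 groups.
X = rng.normal(size=(500, 10_000))
Z = random_projection(X, d=20)
centers, labels = kmeans(Z, k=5)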
%0 Journal Article
%1 citeulike:11151499
%A Biau, G.
%A Devroye, L.
%A Lugosi, G.
%D 2008
%I IEEE
%J IEEE Transactions on Information Theory
%K 68p30-coding-and-information-theory 62h30-classification-discrimination-cluster-analysis 46c05-hilbert-and-prehilbert-spaces-geometry-topology 60g46-martingales-and-classical-analysis
%N 2
%P 781--790
%R 10.1109/tit.2007.913516
%T On the Performance of Clustering in Hilbert Spaces
%U http://dx.doi.org/10.1109/tit.2007.913516
%V 54
%X Based on n randomly drawn vectors in a separable Hilbert space, one may construct a k-means clustering scheme by minimizing an empirical squared error. We investigate the risk of such a clustering scheme, defined as the expected squared distance of a random vector X from the set of cluster centers. Our main result states that, for an almost surely bounded X, the expected excess clustering risk is O(√(1/n)). Since clustering in high (or even infinite)-dimensional spaces may lead to severe computational problems, we examine the properties of a dimension reduction strategy for clustering based on Johnson-Lindenstrauss-type random projections. Our results reflect a tradeoff between accuracy and computational complexity when one uses k-means clustering after random projection of the data to a low-dimensional space. We argue that random projections work better than other simplistic dimension reduction schemes.
@article{citeulike:11151499,
abstract = {{Based on n randomly drawn vectors in a separable Hilbert space, one may construct a k-means clustering scheme by minimizing an empirical squared error. We investigate the risk of such a clustering scheme, defined as the expected squared distance of a random vector X from the set of cluster centers. Our main result states that, for an almost surely bounded X, the expected excess clustering risk is $O(\sqrt{1/n})$. Since clustering in high (or even infinite)-dimensional spaces may lead to severe computational problems, we examine the properties of a dimension reduction strategy for clustering based on Johnson-Lindenstrauss-type random projections. Our results reflect a tradeoff between accuracy and computational complexity when one uses k-means clustering after random projection of the data to a low-dimensional space. We argue that random projections work better than other simplistic dimension reduction schemes.}},
added-at = {2017-06-29T07:13:07.000+0200},
author = {Biau, G. and Devroye, L. and Lugosi, G.},
biburl = {https://www.bibsonomy.org/bibtex/28deea17d73768bc98042dea4c6a90240/gdmcbain},
citeulike-article-id = {11151499},
citeulike-attachment-1 = {biau-devroye-lugosi-clustering2007.pdf; /pdf/user/gdmcbain/article/11151499/943107/biau-devroye-lugosi-clustering2007.pdf; 63f69ffd64583c289668adb3c274ce1d7efef97b},
citeulike-linkout-0 = {http://dx.doi.org/10.1109/tit.2007.913516},
citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4439834},
doi = {10.1109/tit.2007.913516},
file = {biau-devroye-lugosi-clustering2007.pdf},
institution = {LSTA \& LPMA, Univ. Pierre et Marie Curie-Paris VI, Paris, France},
interhash = {5ea9e0fd1edd47097314f932da064b31},
intrahash = {8deea17d73768bc98042dea4c6a90240},
issn = {0018-9448},
journal = {IEEE Transactions on Information Theory},
keywords = {68p30-coding-and-information-theory 62h30-classification-discrimination-cluster-analysis 46c05-hilbert-and-prehilbert-spaces-geometry-topology 60g46-martingales-and-classical-analysis},
month = feb,
number = 2,
pages = {781--790},
posted-at = {2014-01-23 22:34:12},
priority = {0},
publisher = {IEEE},
timestamp = {2021-07-19T06:57:36.000+0200},
title = {On the Performance of Clustering in {H}ilbert Spaces},
url = {http://dx.doi.org/10.1109/tit.2007.913516},
volume = 54,
year = 2008
}