Frequent itemset mining (FIM) is a useful tool for discovering frequently co-occurrent items. Since its inception, a number of significant FIM algorithms have been developed to speed up mining performance. Unfortunately, when the dataset size is huge, both the memory use and computational cost can still be prohibitively expensive. In this work, we propose to parallelize the FP-Growth algorithm (we call our parallel algorithm PFP) on distributed machines. PFP partitions computation in such a way that each machine executes an independent group of mining tasks. Such partitioning eliminates computational dependencies between machines, and thereby communication between them. Through empirical study on a large dataset of 802,939 Web pages and 1,021,107 tags, we demonstrate that PFP can achieve virtually linear speedup. Besides scalability, the empirical study demonstrates that PFP to be promising for supporting query recommendation for search engines.
Description
CiteULike: Pfp: parallel fp-growth for query recommendation
%0 Conference Paper
%1 li08pfp
%A Li, Haoyuan
%A Wang, Yi
%A Zhang, Dong
%A Zhang, Ming
%A Chang, Edward Y.
%B Proceedings of the 2008 ACM conference on Recommender systems
%C New York, NY, USA
%D 2008
%I ACM
%K RelatedWork
%P 107--114
%R 10.1145/1454008.1454027
%T Pfp: parallel fp-growth for query recommendation
%U http://dx.doi.org/10.1145/1454008.1454027
%X Frequent itemset mining (FIM) is a useful tool for discovering frequently co-occurrent items. Since its inception, a number of significant FIM algorithms have been developed to speed up mining performance. Unfortunately, when the dataset size is huge, both the memory use and computational cost can still be prohibitively expensive. In this work, we propose to parallelize the FP-Growth algorithm (we call our parallel algorithm PFP) on distributed machines. PFP partitions computation in such a way that each machine executes an independent group of mining tasks. Such partitioning eliminates computational dependencies between machines, and thereby communication between them. Through empirical study on a large dataset of 802,939 Web pages and 1,021,107 tags, we demonstrate that PFP can achieve virtually linear speedup. Besides scalability, the empirical study demonstrates that PFP to be promising for supporting query recommendation for search engines.
%@ 978-1-60558-093-7
@inproceedings{li08pfp,
abstract = {Frequent itemset mining ({FIM}) is a useful tool for discovering frequently co-occurrent items. Since its inception, a number of significant {FIM} algorithms have been developed to speed up mining performance. Unfortunately, when the dataset size is huge, both the memory use and computational cost can still be prohibitively expensive. In this work, we propose to parallelize the {FP}-Growth algorithm (we call our parallel algorithm {PFP}) on distributed machines. {PFP} partitions computation in such a way that each machine executes an independent group of mining tasks. Such partitioning eliminates computational dependencies between machines, and thereby communication between them. Through empirical study on a large dataset of 802,939 Web pages and 1,021,107 tags, we demonstrate that {PFP} can achieve virtually linear speedup. Besides scalability, the empirical study demonstrates that {PFP} to be promising for supporting query recommendation for search engines.},
added-at = {2013-04-15T15:02:00.000+0200},
address = {New York, NY, USA},
author = {Li, Haoyuan and Wang, Yi and Zhang, Dong and Zhang, Ming and Chang, Edward Y.},
biburl = {https://www.bibsonomy.org/bibtex/2e812fc8eba0bc5465ba622a766e75c27/macek},
booktitle = {Proceedings of the 2008 ACM conference on Recommender systems},
citeulike-article-id = {5739219},
citeulike-linkout-0 = {http://portal.acm.org/citation.cfm?id=1454008.1454027},
citeulike-linkout-1 = {http://dx.doi.org/10.1145/1454008.1454027},
description = {CiteULike: Pfp: parallel fp-growth for query recommendation},
doi = {10.1145/1454008.1454027},
interhash = {96e48726a8d9a10bb0fba548f9e2edb0},
intrahash = {e812fc8eba0bc5465ba622a766e75c27},
isbn = {978-1-60558-093-7},
keywords = {RelatedWork},
location = {Lausanne, Switzerland},
pages = {107--114},
posted-at = {2009-09-29 11:20:53},
priority = {2},
publisher = {ACM},
series = {RecSys '08},
timestamp = {2013-04-15T15:02:00.000+0200},
title = {Pfp: parallel fp-growth for query recommendation},
url = {http://dx.doi.org/10.1145/1454008.1454027},
year = 2008
}