As performance improvements are being increasingly sought via coarse-grained parallelism, established expectations of continued sequential performance increases are not being met. Current trends in computing point towards platforms seeking performance improvements through various degrees of parallelism, with coarse-grained parallelism features becoming commonplace in even entry-level systems. Yet the broad variety of multiprocessor configurations that will be available that differ in the number of processing elements will make it difficult to statically create a single parallel version of a program that performs well on the whole range of such hardware. As a result, there will soon be a vast number of multiprocessor systems that are significantly under-utilized for lack of software that harnesses their power effectively. This problem is exacerbated by the growing inventory of legacy programs in binary executable form with possibly unreachable source code. We present a system that improves the performance of optimized sequential binaries through dynamic recompilation. Leveraging observations made at runtime, a thin software layer recompiles executing code compiled for a uniprocessor and generates parallelized and/or vectorized code segments that exploit available parallel resources. Among the techniques employed are control speculation, loop distribution across several threads, and automatic parallelization of recursive routines. Our solution is entirely software-based and can be ported to existing hardware platforms that have parallel processing capabilities. Our performance results are obtained on real hardware without using simulation. In preliminary benchmarks on only modestly parallel (2-way) hardware, our system already provides speedups of up to 40% on SpecCPU benchmarks, and near-optimal speedups on more obviously parallelizable benchmarks.
Description
Dynamic parallelization and mapping of binary executables on hierarchical platforms
%0 Conference Paper
%1 1128040
%A Yardimci, Efe
%A Franz, Michael
%B CF '06: Proceedings of the 3rd conference on Computing frontiers
%C New York, NY, USA
%D 2006
%I ACM
%K Parallelization automatic
%P 127--138
%R http://doi.acm.org/10.1145/1128022.1128040
%T Dynamic Parallelization and Mapping of Binary Executables on Hierarchical Platforms
%U http://portal.acm.org/citation.cfm?id=1128022.1128040
%X As performance improvements are being increasingly sought via coarse-grained parallelism, established expectations of continued sequential performance increases are not being met. Current trends in computing point towards platforms seeking performance improvements through various degrees of parallelism, with coarse-grained parallelism features becoming commonplace in even entry-level systems. Yet the broad variety of multiprocessor configurations that will be available that differ in the number of processing elements will make it difficult to statically create a single parallel version of a program that performs well on the whole range of such hardware. As a result, there will soon be a vast number of multiprocessor systems that are significantly under-utilized for lack of software that harnesses their power effectively. This problem is exacerbated by the growing inventory of legacy programs in binary executable form with possibly unreachable source code. We present a system that improves the performance of optimized sequential binaries through dynamic recompilation. Leveraging observations made at runtime, a thin software layer recompiles executing code compiled for a uniprocessor and generates parallelized and/or vectorized code segments that exploit available parallel resources. Among the techniques employed are control speculation, loop distribution across several threads, and automatic parallelization of recursive routines. Our solution is entirely software-based and can be ported to existing hardware platforms that have parallel processing capabilities. Our performance results are obtained on real hardware without using simulation. In preliminary benchmarks on only modestly parallel (2-way) hardware, our system already provides speedups of up to 40% on SpecCPU benchmarks, and near-optimal speedups on more obviously parallelizable benchmarks.
%@ 1-59593-302-6
% Conference paper by Yardimci and Franz in the CF '06 proceedings (ACM, 2006).
% doi holds the bare DOI without a resolver prefix; address is the publisher's
% city and venue is the conference site.
@inproceedings{1128040,
  author      = {Yardimci, Efe and Franz, Michael},
  title       = {Dynamic Parallelization and Mapping of Binary Executables on Hierarchical Platforms},
  booktitle   = {CF '06: Proceedings of the 3rd conference on Computing frontiers},
  pages       = {127--138},
  publisher   = {ACM},
  address     = {New York, NY, USA},
  venue       = {Ischia, Italy},
  year        = {2006},
  isbn        = {1-59593-302-6},
  doi         = {10.1145/1128022.1128040},
  url         = {http://portal.acm.org/citation.cfm?id=1128022.1128040},
  keywords    = {Parallelization automatic},
  description = {Dynamic parallelization and mapping of binary executables on hierarchical platforms},
  abstract    = {As performance improvements are being increasingly sought via coarse-grained parallelism, established expectations of continued sequential performance increases are not being met. Current trends in computing point towards platforms seeking performance improvements through various degrees of parallelism, with coarse-grained parallelism features becoming commonplace in even entry-level systems. Yet the broad variety of multiprocessor configurations that will be available that differ in the number of processing elements will make it difficult to statically create a single parallel version of a program that performs well on the whole range of such hardware. As a result, there will soon be a vast number of multiprocessor systems that are significantly under-utilized for lack of software that harnesses their power effectively. This problem is exacerbated by the growing inventory of legacy programs in binary executable form with possibly unreachable source code. We present a system that improves the performance of optimized sequential binaries through dynamic recompilation. Leveraging observations made at runtime, a thin software layer recompiles executing code compiled for a uniprocessor and generates parallelized and/or vectorized code segments that exploit available parallel resources. Among the techniques employed are control speculation, loop distribution across several threads, and automatic parallelization of recursive routines. Our solution is entirely software-based and can be ported to existing hardware platforms that have parallel processing capabilities. Our performance results are obtained on real hardware without using simulation. In preliminary benchmarks on only modestly parallel (2-way) hardware, our system already provides speedups of up to 40\% on SpecCPU benchmarks, and near-optimal speedups on more obviously parallelizable benchmarks.},
  added-at    = {2008-10-21T10:15:06.000+0200},
  timestamp   = {2008-10-21T10:15:06.000+0200},
  biburl      = {https://www.bibsonomy.org/bibtex/273fcdbf89c6e65f1c7f7bfa680aab331/gron},
  interhash   = {681977b61c58cef5d2cfad5d15de3556},
  intrahash   = {73fcdbf89c6e65f1c7f7bfa680aab331},
}