Stable and repeatable measurements are essential for comparing the performance of different systems or applications, and benchmarks are used to ensure accuracy and replication. However, if the corresponding measurements are not stable and repeatable, wrong conclusions can be drawn. To facilitate the task of determining whether the measurements are similar, we used a data set of 586 micro-benchmarks to (i) analyze the data set itself, (ii) examine our previous approach, and (iii) propose and evaluate a heuristic. To evaluate the different approaches, we perform a peer review to assess the dissimilarity of the benchmark runs. Our results show that this task is challenging even for humans and that our heuristic exhibits a sensitivity of 92%.
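As an illustration of the kind of task the paper addresses, the sketch below (plain Python, not the paper's actual heuristic) flags a pair of benchmark runs as dissimilar by thresholding a simple distance measure between their measurement series, and then computes the sensitivity (true-positive rate) of such a detector against manual labels. The distance measure, the threshold value, and all names here are illustrative assumptions, not taken from the paper.

# Illustrative sketch only: the paper's heuristic and distance measures are not reproduced here.
import math

def normalized_euclidean(run_a, run_b):
    # Euclidean distance between two equal-length measurement series,
    # scaled by the series length so runs of different lengths stay comparable.
    assert len(run_a) == len(run_b)
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(run_a, run_b))) / len(run_a)

def runs_dissimilar(run_a, run_b, threshold=0.1):
    # Hypothetical rule: flag two runs as dissimilar if their distance exceeds a threshold.
    return normalized_euclidean(run_a, run_b) > threshold

def sensitivity(predicted, labeled):
    # Sensitivity = true positives / (true positives + false negatives),
    # where "positive" means a pair of runs labeled as dissimilar.
    tp = sum(1 for p, l in zip(predicted, labeled) if p and l)
    fn = sum(1 for p, l in zip(predicted, labeled) if not p and l)
    return tp / (tp + fn) if (tp + fn) else float("nan")

# Toy usage: one stable pair of runs and one drifting pair, with hand labels.
pairs = [([1.0, 1.1, 0.9], [1.0, 1.0, 1.0]), ([1.0, 1.5, 2.0], [1.0, 1.0, 1.0])]
labels = [False, True]
preds = [runs_dissimilar(a, b) for a, b in pairs]
print(sensitivity(preds, labels))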
%0 Conference Paper
%1 10.1145/3578245.3584693
%A Bauer, André
%A Straesser, Martin
%A Leznik, Mark
%A Beierlieb, Lukas
%A Hadry, Marius
%A Hudson, Nathaniel
%A Chard, Kyle
%A Kounev, Samuel
%A Foster, Ian
%B Companion of the 2023 ACM/SPEC International Conference on Performance Engineering
%C New York, NY, USA
%D 2023
%I Association for Computing Machinery
%K ICPE_data_challenge descartes distance_measures similarity t_short time_series myown
%P 95–99
%T Searching for the Ground Truth: Assessing the Similarity of Benchmarking Runs
%U https://doi.org/10.1145/3578245.3584693
%X Stable and repeatable measurements are essential for comparing the performance of different systems or applications, and benchmarks are used to ensure accuracy and replication. However, if the corresponding measurements are not stable and repeatable, wrong conclusions can be drawn. To facilitate the task of determining whether the measurements are similar, we used a data set of 586 micro-benchmarks to (i) analyze the data set itself, (ii) examine our previous approach, and (iii) propose and evaluate a heuristic. To evaluate the different approaches, we perform a peer review to assess the dissimilarity of the benchmark runs. Our results show that this task is challenging even for humans and that our heuristic exhibits a sensitivity of 92%.
@inproceedings{10.1145/3578245.3584693,
abstract = {Stable and repeatable measurements are essential for comparing the performance of different systems or applications, and benchmarks are used to ensure accuracy and replication. However, if the corresponding measurements are not stable and repeatable, wrong conclusions can be drawn. To facilitate the task of determining whether the measurements are similar, we used a data set of 586 micro-benchmarks to (i) analyze the data set itself, (ii) examine our previous approach, and (iii) propose and evaluate a heuristic. To evaluate the different approaches, we perform a peer review to assess the dissimilarity of the benchmark runs. Our results show that this task is challenging even for humans and that our heuristic exhibits a sensitivity of 92%.},
address = {New York, NY, USA},
author = {Bauer, Andr\'{e} and Straesser, Martin and Leznik, Mark and Beierlieb, Lukas and Hadry, Marius and Hudson, Nathaniel and Chard, Kyle and Kounev, Samuel and Foster, Ian},
booktitle = {Companion of the 2023 ACM/SPEC International Conference on Performance Engineering},
keywords = {ICPE_data_challenge descartes distance_measures similarity t_short time_series myown},
pages = {95--99},
publisher = {Association for Computing Machinery},
series = {ICPE '23 Companion},
title = {Searching for the Ground Truth: Assessing the Similarity of Benchmarking Runs},
url = {https://doi.org/10.1145/3578245.3584693},
year = 2023
}