As the field of Large Language Models (LLMs) evolves at an accelerated pace, the critical need to assess and monitor their performance emerges. We introduce a benchmarking framework focused on knowledge graph engineering (KGE) accompanied by three challenges addressing syntax and error correction, facts extraction and dataset generation. We show that while being a useful tool, LLMs are yet unfit to assist in knowledge graph generation with zero-shot prompting. Consequently, our LLM-KG-Bench framework provides automatic evaluation and storage of LLM responses as well as statistical data and visualization tools to support tracking of prompt engineering and model performance.
%0 Conference Paper
%1 Meyer2023DevelopingScalableBenchmark
%A Meyer, Lars-Peter
%A Frey, Johannes
%A Junghanns, Kurt
%A Brei, Felix
%A Bulert, Kirill
%A Gründer-Fahrer, Sabine
%A Martin, Michael
%B Proceedings of Poster Track of Semantics 2023
%D 2023
%K es frey group_aksw junghanns lpmeyer martin
%R 10.48550/ARXIV.2308.16622
%T Developing a Scalable Benchmark for Assessing Large Language Models in Knowledge Graph Engineering
%X As the field of Large Language Models (LLMs) evolves at an accelerated pace, the critical need to assess and monitor their performance emerges. We introduce a benchmarking framework focused on knowledge graph engineering (KGE) accompanied by three challenges addressing syntax and error correction, facts extraction and dataset generation. We show that while being a useful tool, LLMs are yet unfit to assist in knowledge graph generation with zero-shot prompting. Consequently, our LLM-KG-Bench framework provides automatic evaluation and storage of LLM responses as well as statistical data and visualization tools to support tracking of prompt engineering and model performance.
@inproceedings{Meyer2023DevelopingScalableBenchmark,
  abstract     = {As the field of Large Language Models (LLMs) evolves at an accelerated pace, the critical need to assess and monitor their performance emerges. We introduce a benchmarking framework focused on knowledge graph engineering (KGE) accompanied by three challenges addressing syntax and error correction, facts extraction and dataset generation. We show that while being a useful tool, LLMs are yet unfit to assist in knowledge graph generation with zero-shot prompting. Consequently, our LLM-KG-Bench framework provides automatic evaluation and storage of LLM responses as well as statistical data and visualization tools to support tracking of prompt engineering and model performance.},
  added-at     = {2024-03-04T14:15:46.000+0100},
  author       = {Meyer, Lars-Peter and Frey, Johannes and Junghanns, Kurt and Brei, Felix and Bulert, Kirill and Gr{\"u}nder-Fahrer, Sabine and Martin, Michael},
  biburl       = {https://www.bibsonomy.org/bibtex/2d5679b27f00cc37d3b827aa54f162513/aksw},
  booktitle    = {Proceedings of Poster Track of Semantics 2023},
  comment      = {Code: https://github.com/AKSW/LLM-KG-Bench
Results: https://github.com/AKSW/LLM-KG-Bench-Results/blob/main/2023-SEMANTICS_LLM-KGE-Bench-Results},
  doi          = {10.48550/ARXIV.2308.16622},
  eprint       = {2308.16622},
  eprinttype   = {arXiv},
  interhash    = {d67494f96887338345f3361ca57d82ac},
  intrahash    = {d5679b27f00cc37d3b827aa54f162513},
  keywords     = {es frey group_aksw junghanns lpmeyer martin},
  timestamp    = {2024-03-04T14:15:46.000+0100},
  title        = {Developing a Scalable Benchmark for Assessing {Large Language Models} in {Knowledge Graph Engineering}},
  year         = {2023}
}