{"links":{},"id":2005948,"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:02005948","sets":["6164:6165:6617:1762413724940"]},"path":["1762413724940"],"owner":"80578","recid":"2005948","title":["Lies, Damned Lies and Benchmarks: An Exploration of LLM Inference Benchmarks for Long Context Workloads"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2025-11-24"},"_buckets":{"deposit":"fffd72ec-273f-456d-a2bb-aa139ef0a9b6"},"_deposit":{"id":"2005948","pid":{"type":"depid","value":"2005948","revision_id":0},"owners":[80578],"status":"published","created_by":80578},"item_title":"Lies, Damned Lies and Benchmarks: An Exploration of LLM Inference Benchmarks for Long Context Workloads","author_link":[],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Lies, Damned Lies and Benchmarks: An Exploration of LLM Inference Benchmarks for Long Context Workloads","subitem_title_language":"ja"},{"subitem_title":"Lies, Damned Lies and Benchmarks: An Exploration of LLM Inference Benchmarks for Long Context Workloads","subitem_title_language":"en"}]},"item_type_id":"18","publish_date":"2025-11-24","item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_18_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"IBM Research―Tokyo"},{"subitem_text_value":"Delft University of Technology"}]},"item_18_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"IBM Research―Tokyo","subitem_text_language":"en"},{"subitem_text_value":"Delft University of Technology","subitem_text_language":"en"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/2005948/files/IPSJ-ComSys2025007.pdf","label":"IPSJ-ComSys2025007.pdf"},"date":[{"dateType":"Available","dateValue":"2027-11-24"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-ComSys2025007.pdf","filesize":[{"value":"2.1 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"11"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"e0d715aa-a6c6-4e00-b8bd-94cbfdaf4da0","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2025 by the Information Processing Society of Japan"}]},"item_18_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Valentijn Dymphnus,Van De Beek"}]},{"creatorNames":[{"creatorName":"Takeshi,Yoshimura"}]}]},"item_18_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Valentijn Dymphnus Van De Beek","creatorNameLang":"en"}]},{"creatorNames":[{"creatorName":"Takeshi Yoshimura","creatorNameLang":"en"}]}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_5794","resourcetype":"conference paper"}]},"item_18_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"After the introduction of the Transformer architecture in 2017, neural networks have seen widespread adoption across industry, academia, and the wider public. One notable aspect of Large Language Models (LLM) is the ability to develop emerging capabilities on tasks that it has not been trained for, such as prompt engineering, video generation, and multistep reasoning. This can be done by increasing the hardware, training data or context data available to the model. This has resulted in model providers steadily increasing the size of context from 8k tokens in 2023 to 10m tokens in 2025. Alongside this growth has been the publication of a large set of literature aimed at measuring the impact of these increases in context size. In this paper, we analyze 16 of these benchmarks published between 2023 and 2025 in terms of what they measure, what tasks they perform, and the distribution of various attributes. We found that the papers in the question do not consider attributes inherit to the benchmarks such as the token size distribution, the variance between the tasks or variance in prompts in the same task. Additionally, these attributes do have a significant impact on the accuracy of the benchmark. This makes it difficult to compare tasks within the benchmark and trust the accuracy reported in the same task. The amount of variance has increased significantly between generations of the models, while the median of token size has increased at a slower pace. The claimed increase of context size therefore seems to rely on the addition of outliers rather than increasing the overall size of tasks. Due to these attributes, the current set of long-context benchmarks is unsuitable for measuring results of systems research or evaluating large-scale cloud inferencing solutions.","subitem_description_type":"Other"}]},"item_18_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"After the introduction of the Transformer architecture in 2017, neural networks have seen widespread adoption across industry, academia, and the wider public. One notable aspect of Large Language Models (LLM) is the ability to develop emerging capabilities on tasks that it has not been trained for, such as prompt engineering, video generation, and multistep reasoning. This can be done by increasing the hardware, training data or context data available to the model. This has resulted in model providers steadily increasing the size of context from 8k tokens in 2023 to 10m tokens in 2025. Alongside this growth has been the publication of a large set of literature aimed at measuring the impact of these increases in context size. In this paper, we analyze 16 of these benchmarks published between 2023 and 2025 in terms of what they measure, what tasks they perform, and the distribution of various attributes. We found that the papers in the question do not consider attributes inherit to the benchmarks such as the token size distribution, the variance between the tasks or variance in prompts in the same task. Additionally, these attributes do have a significant impact on the accuracy of the benchmark. This makes it difficult to compare tasks within the benchmark and trust the accuracy reported in the same task. The amount of variance has increased significantly between generations of the models, while the median of token size has increased at a slower pace. The claimed increase of context size therefore seems to rely on the addition of outliers rather than increasing the overall size of tasks. Due to these attributes, the current set of long-context benchmarks is unsuitable for measuring results of systems research or evaluating large-scale cloud inferencing solutions.","subitem_description_type":"Other"}]},"item_18_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"70","bibliographic_titles":[{"bibliographic_title":"コンピュータシステム・シンポジウム論文集"}],"bibliographicPageStart":"59","bibliographicIssueDates":{"bibliographicIssueDate":"2025-11-24","bibliographicIssueDateType":"Issued"},"bibliographicVolumeNumber":"2025"}]},"relation_version_is_last":true,"weko_creator_id":"80578"},"created":"2025-11-18T08:00:24.734967+00:00","updated":"2025-11-26T00:37:31.042594+00:00"}