% Technical report entry cleaned up from a repository export.
% - author lists each of the three authors once, in Japanese script. The
%   romanized forms (family name first) are kept in yomi, the name-reading
%   field read by pBibTeX/jBibTeX styles; other styles ignore it.
% - The Japanese and English abstracts are stored in abstract / abstract_en
%   instead of note, so styles that print note do not dump them into the
%   reference list.
% - number replaces issue, and month uses the predefined sep macro.
% TODO(review): techreport entries require an institution field. The citation
%   key suggests the Information Processing Society of Japan - confirm and add.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00239732,
 author = {左高, 稜也 and 深沢, 圭一郎 and 岩下, 武史},
 yomi = {Sataka, Ryoya and Fukazawa, Keiichiro and Iwashita, Takeshi},
 title = {深層学習アプリケーションを利用した{Advanced Matrix Extension}（{AMX}）の性能評価},
 number = {3},
 month = sep,
 year = {2024},
 abstract = {現在の ML（機械学習）/ AI（人工知能）計算環境では GPU の利用が主流となっている中，Intel 第 4 世代 Xeon 以降に搭載されている CPU 内アクセラレータである AMX (Advanced Matrix Extensions) ではデータの移動に関しての優位点があると考えられる．そこで，本研究では，ML のベンチマーク（Python コード）を利用し，AMX の性能評価を行った．更にメモリ構成による性能の違いを評価するために，第 4 世代 Xeon の AMX+HBM と AMX+DRAM 構成という 2 つの環境を利用し，評価を実施した．評価の結果，AMX を利用することで AVX-512 を利用した場合の 2 倍以上の性能を確認できた．また，適切なアフィニティ設定することにより，より高い性能が達成でき，2CPU で 1GPU（A100）と同等の性能になることが明らかになった．更に，大規模データセットを利用することで，AMX と A100 の性能差が縮小する傾向があることが分かった．これらの結果は，CPU での ML/AI 計算においても AMX を利用することで，GPU に近い性能を達成できる可能性を示唆している．},
 abstract_en = {In the current ML (Machine Learning) / AI (Artificial Intelligence) computing environments, the use of GPUs is mainstream. However, Intel's 4th Generation Xeon CPUs, which feature the Advanced Matrix Extensions (AMX) accelerator, may offer advantages in terms of data movement. In this study, we evaluated the performance of AMX using ML benchmarks (Python code). To assess the performance differences based on memory configurations, we conducted evaluations using two environments: AMX+HBM and AMX+DRAM on 4th Generation Xeon processors. The evaluation results showed that using AMX achieved more than twice the performance compared to using AVX-512. Additionally, we found that higher performance could be achieved with appropriate affinity settings, reaching performance equivalent to 1 GPU (A100) with 2 CPUs. Furthermore, the performance gap between AMX and A100 tended to narrow with the use of large-scale datasets. These findings suggest that it is possible to achieve performance close to that of GPUs by utilizing AMX in ML/AI computing on CPUs.},
}