% Journal article record; the citation key is the OAI identifier carried by the
% original record (presumably a repository auto-export -- confirm the source).
% Field notes:
%   author      - the seven authors in Japanese script, "Family, Given" form.
%   author-en   - romanized forms of the same seven authors. This is not a
%                 standard field, so styles ignore it; it is kept only so the
%                 romanized names are not lost. They were previously appended
%                 to author, which doubled the author list.
%   number      - issue number within the volume (was stored as "issue").
%   month       - uses the predefined jul macro so styles can localize it.
%   abstract    - Japanese abstract; previously stored in "note", which most
%                 styles print verbatim in the bibliography.
%   abstract-en - English abstract (ignored by styles, kept for tooling).
%   title       - OpenCL and FPGA are braced to survive style recasing.
@article{oai:ipsj.ixsq.nii.ac.jp:00198580,
 author = {藤田, 典久 and 小林, 諒平 and 山口, 佳樹 and 朴, 泰祐 and 吉川, 耕司 and 安部, 牧人 and 梅村, 雅之},
 author-en = {Fujita, Norihisa and Kobayashi, Ryohei and Yamaguchi, Yoshiki and Boku, Taisuke and Yoshikawa, Kohji and Abe, Makito and Umemura, Masayuki},
 title = {宇宙輻射輸送コードにおける{OpenCL}による{FPGA}演算加速最適化},
 journal = {情報処理学会論文誌コンピューティングシステム（ACS）},
 volume = {12},
 number = {3},
 pages = {64--75},
 month = jul,
 year = {2019},
 abstract = {近年，High Performance Computing（HPC）におけるチャレンジの中の一つに，高い性能と低い消費電力を持つField Programmable Gate Array（FPGA）技術をどのようにして次世代のスーパーコンピュータに用いるかという問題がある．Graphics Processing Unit（GPU）がHPCにおけるアクセラレータとして最も広く用いられているが，均一な大量の並列計算が必要であり，これが性能上のボトルネックとなる場合がある．一方で，FPGAは再構成回路による柔軟さと効率さを持っており，様々な問題に適応できる可能性を持つ．しかしながら，ハードウェアの動作を記述することは複雑であり，アプリケーションの開発者がFPGA回路を実装することは容易ではない．近年のFPGAにおける開発環境の進歩により，OpenCL言語を用いた高位合成（HLS: High Level Synthesis）開発環境が一般的になってきている．我々のこれまでのOpenCLを用いたカーネル記述の経験より，FPGA向けにアプリケーション記述する際は“co-design”に基づくアグレッシブなプログラミング戦略が高い性能を達成するうえで必要であることが分かっている．本研究では，宇宙輻射輸送を解くプログラムで用いられているアルゴリズムであるAuthentic Radiation Transfer（ART）法をOpenCLで記述してFPGA向けに最適化を行う．OpenCLで記述されたアプリケーションに対してco-designに基づくFPGA向け最適化を適用し，CPU，GPU，FPGA間での性能比較を行った．マルチコアCPU実装と比べて最大4.9倍の高速化が達成され，GPU実装との比較ではGPUと同程度の性能を達成した．FPGA実装の性能はGPUと同程度であるが，FPGAの方が通信オーバヘッドはGPUと比べると小さく，並列計算を行う際の性能はGPUの性能を超えられると考えられることから，今後，並列FPGA計算の実装を行う予定である．},
 abstract-en = {One of the recent challenges faced by HPC is how to apply FPGA technology to accelerate a next-generation supercomputer as an efficient method of achieving high performance and low power consumption. GPU is the most commonly used accelerator for HPC supported by regularly executed high degree of parallel operations which causes performance bottleneck in some cases. On the other hand, there are great opportunities to flexibly and efficiently utilize FPGAs in reconfigurable circuits to fit various computing situations. However, it is not easy for application developers to implement FPGA logic circuits for their applications and algorithms, which generally require complicated hardware logic descriptions. Because of the progress made in the FPGA development environment in recent years, the HLS development environment using the OpenCL language has become popular. Based on our experience describing kernels using OpenCL, we found that a more aggressive programming strategy is necessary to realize true high performance based on a “co-design” concept to implement the necessary features and operations to fit the target application in an FPGA design. In this paper, we optimize the ART method used in space radiative transfer problems on an FPGA using OpenCL. Using a co-designed method for the optimized programming of a specific application with OpenCL for an FPGA, we achieved a performance that is 4.9 times faster than that of a multicore CPU implementation, and almost the same performance as a GPU implementation. Considering the current advanced FPGAs with interconnection features, we believe that their parallelized implementation with multiple FPGAs will achieve a higher performance than GPU.}
}