% Technical report: "Hierarchical Matrix Calculation for FPGA using SYCL" (2023).
% Cleaned-up repository export:
%   - each author is listed once, in "Family, Given" order;
%   - the abstract is stored once in the abstract field instead of twice in note,
%     so it is no longer printed in the reference list;
%   - issue was renamed to number, and month uses the predefined nov macro;
%   - the acronyms in the title are brace-protected against sentence-casing.
% NOTE(review): institution is inferred from the repository host embedded in the
% citation key (ipsj.ixsq.nii.ac.jp) -- confirm against the report itself.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00231073,
  author      = {Yu, Yijie and Hanawa, Toshihiro},
  title       = {Hierarchical Matrix Calculation for {FPGA} using {SYCL}},
  institution = {Information Processing Society of Japan},
  number      = {34},
  month       = nov,
  year        = {2023},
  abstract    = {In recent years, Field Programmable Gate Arrays (FPGAs) have been studied as a new accelerator of the HPC field since it enables custom hardware for application-specific functions. Compared to conventional OpenCL descriptions, SYCL simplifies managing codes because it does not require the separation of host and device code. We designed and optimized a computation kernel of HACApK, a hierarchical matrix library (H-matrix), using SYCL for the FPGA acceleration. H-matrix can reduce computation and memory usage by approximating submatrices of dense matrices with low-rank matrices and representing the original large dense matrix as a set of smaller dense matrices and low-rank approximation matrices and are considered suitable for FPGA implementation because of their relatively complex calculation patterns that mix dense and low-rank approximation matrices. In this study, the matrix-vector multiplication in HACApK was ported to SYCL and optimized using several techniques. As a result, almost the same performance as the single CPU core of the Intel Xeon Skylake could be achieved with the single pipeline.},
}