% IPSJ technical report, cleaned up from a repository auto-export:
%  - the author was listed twice with given/family names swapped; now one name
%  - the abstract was duplicated inside the note field; now a single copy in
%    the abstract field, which standard styles do not print
%  - the percent sign in the abstract is escaped so it cannot start a TeX comment
%  - NOTE(review): institution is inferred from the OAI identifier in the
%    citation key (ipsj.ixsq.nii.ac.jp); confirm against the repository record
%  - NOTE(review): number carries the value the export labelled "issue";
%    confirm it is the report number intended for citation
@techreport{oai:ipsj.ixsq.nii.ac.jp:00241688,
  author      = {Nakajima, Kengo},
  title       = {Effects of Process/Thread Allocation for Optimization of Communication-Computation Overlapping in Parallel Multigrid Methods},
  institution = {Information Processing Society of Japan},
  number      = {23},
  month       = dec,
  year        = {2024},
  abstract    = {Preconditioned iterative methods based on the Krylov subspace technique are widely employed in various scientific and technical computing. When utilizing large-scale parallel computing systems, the communication overhead tends to increase with the growth in the number of nodes, making its reduction a crucial challenge. In parallel FEM/FVM, halo communication and computation overlapping (CC-Overlapping) are commonly employed, often in conjunction with the dynamic loop scheduling feature of OpenMP. In the previous work, the author proposes a method to apply CC-Overlapping to the forward and backward substitutions of the IC(0) smoother of the parallel Conjugate Gradient method preconditioned by Multigrid (MGCG). Using up to 4,096 nodes on Wisteria/BDEC-01 (Odyssey) with A64FX, performance improvement of approximately 40+\% was achieved compared to the original implementation. In the present work, effects of process/thread allocation within a compute node in OpenMP/MPI Hybrid parallel programming model has been conducted for optimization of CC-Overlapping.},
}