% Technical report on the OhHelp load-balancing method for Particle-in-Cell simulation.
% The abstract lives in the `abstract` field so that bibliography styles do not print it.
% NOTE(review): `institution` is inferred from the IPSJ repository prefix of the
% citation key -- confirm against the report's front matter.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00061253,
  author      = {Nakashima, Hiroshi and Miyake, Yohei and Usui, Hideyuki and Omura, Yoshiharu},
  title       = {Performance Evaluation of {OhHelp'ed} {3D} Particle-in-Cell Simulation},
  institution = {Information Processing Society of Japan},
  number      = {14(2009-ARC-182)},
  month       = feb,
  year        = {2009},
  abstract    = {We proposed an efficient and scalable load balancing method named OhHelp for Particle-in-Cell (PIC) simulations. This method simply and equally partitions the space domain, in which charged particles are distributed nonuniformly in general, so that each computation node works on each partitioned primary subdomain. Load imbalance problem caused by the nonuniformity of the particle distribution is solved by making every but one node also work on another subdomain where particles densely populate as its secondary subdomain together with a part of particles in it. We applied the OhHelp method to a production level full-3D PIC simulator for space plasma and evaluated its performance on our T2K Open Supercomputer. As a result, we confirmed our simulator is not only efficient showing 150--190 speedup with 256 CPU cores compared to the sequential execution of a reference simulator, but also scalable in terms of both the space domain size and the number of particles as the break down of execution times evidences.},
}