% Preprint record. Per its own note field, the Journal of Information Processing (JIP)
% version should be cited instead of this preprint.
% NOTE(review): author names are written as "Family, Given" on the assumption that the
% exported form ("Zhengyang, Bai", "Tasuku, Hiraishi", ...) had the two parts swapped --
% confirm against the published article.
@article{oai:ipsj.ixsq.nii.ac.jp:00220215,
  author   = {Bai, Zhengyang and Hiraishi, Tasuku and Ida, Akihiro and Yasugi, Masahiro},
  title    = {Parallelization of Matrix Partitioning in Hierarchical Matrix Construction on Distributed Memory Systems},
  journal  = {情報処理学会論文誌プログラミング(PRO)},
  volume   = {15},
  number   = {4},
  month    = sep,
  year     = {2022},
  note     = {This is a preprint of an article intended for publication in Journal of Information Processing (JIP). This preprint should not be cited. This article should be cited as: Journal of Information Processing Vol.30 (2022) (online)},
  abstract = {A hierarchical matrix (H-matrix) is an approximated form that represents N × N correlations of N objects. H-matrix construction is achieved by dividing a matrix into submatrices (partitioning), followed by calculating these submatrices' element values (filling). Matrix partitioning consists of two steps: cluster tree (CT) construction, where objects are divided into clusters hierarchically; and block cluster tree (BCT) construction, which involves observing all cluster pairs at the same CT level that satisfies the admissibility condition. This study proposes two parallel implementation methods of partitioning operations on distributed memory systems (DMSs): distributed cluster tree construction (DCTC) and redundant cluster tree construction (RCTC). In DCTC, both CT and BCT constructions are parallelized using workers in all computing nodes. In RCTC, CT is constructed in every computing node redundantly by employing only intra-node work stealing. The BCT is then constructed in parallel using workers in all computing nodes. RCTC cannot achieve speedup using multiple computing nodes, but can eliminate the data exchange cost incurred by DCTC. We used the task-parallel language Tascell, which employs both intra- and inter-node work stealing, to handle arbitrary unbalanced tree construction and traversal on DMSs. Our RCTC implementations achieved a 1.11-1.20-fold speedup using up to 8 nodes × 36 workers in numerical experiments with 3D electric field analyses and N ≃ $10^{8}$.},
}