@techreport{oai:ipsj.ixsq.nii.ac.jp:00198145,
 author = {Oyama, Yosuke and Maruyama, Naoya and Dryden, Nikoli and Harrington, Peter and Balewski, Jan and Matsuoka, Satoshi and Snir, Marc and Nugent, Peter and Van Essen, Brian},
 number = {8},
 month = {Jul},
 note = {We report our preliminary work on large-scale training of a 3D convolutional neural network model for cosmological analyses of dark matter distributions. Previous work showed promising results for predicting cosmological parameters using CNNs trained on a large-scale parallel computing platform. However, due to its weak-scaling nature, there is a trade-off between training performance and prediction accuracy. This paper extends the existing work for better prediction accuracy and performance by exploiting finer-grained parallelism in distributed convolutions. We show significant improvements on the latest complex cosmological dataset with a huge model that was previously infeasible due to its memory pressure. We achieve 1.42 PFlop/s on a single training task with a mini-batch size of 128 by using 512 Tesla V100 GPUs. Our results imply that this state-of-the-art deep learning case study can be further advanced with HPC-based algorithms.},
 title = {Toward Training a Large 3D Cosmological CNN with Hybrid Parallelization},
 year = {2019}
}
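
The abstract above centers on exploiting finer-grained spatial parallelism in distributed convolutions. As a rough, non-authoritative illustration of that idea (not the paper's actual implementation, which uses hybrid parallelism across 512 Tesla V100 GPUs), the following single-process Python sketch splits a 3D volume along one spatial axis across hypothetical "ranks", emulates the halo exchange each rank would need from its neighbors, convolves each chunk locally, and checks that the stitched result matches a global convolution. All names here (n_ranks, radius, the toy volume) are illustrative assumptions.

    # Single-process sketch of spatial decomposition for a 3D convolution.
    # Each "rank" owns a slab of the volume along axis 0, receives a halo of
    # width `radius` from its neighbors, convolves locally, and keeps only
    # its interior. The concatenated result equals the global convolution.
    import numpy as np
    from scipy.ndimage import convolve

    rng = np.random.default_rng(0)
    volume = rng.standard_normal((32, 32, 32))   # toy density cube
    kernel = rng.standard_normal((3, 3, 3))      # 3x3x3 filter
    radius = kernel.shape[0] // 2                # halo width per side

    # Reference: convolve the whole volume on one "device".
    reference = convolve(volume, kernel, mode="constant")

    n_ranks = 4                                  # pretend GPUs along axis 0
    chunks = np.array_split(volume, n_ranks, axis=0)

    outputs = []
    start = 0
    for chunk in chunks:
        stop = start + chunk.shape[0]
        # Emulated halo exchange: pull `radius` slices from each neighbor,
        # zero-padding at the domain boundary to match mode="constant".
        lo = max(start - radius, 0)
        hi = min(stop + radius, volume.shape[0])
        padded = volume[lo:hi]
        pad_lo = radius - (start - lo)
        pad_hi = radius - (hi - stop)
        padded = np.pad(padded, ((pad_lo, pad_hi), (0, 0), (0, 0)))
        # Local convolution; keep only the slab owned by this rank.
        local = convolve(padded, kernel, mode="constant")
        outputs.append(local[radius:radius + chunk.shape[0]])
        start = stop

    assert np.allclose(np.concatenate(outputs, axis=0), reference)
    print("spatially decomposed convolution matches the global result")

In a real distributed setting the halo slices would be exchanged between GPUs (e.g., via MPI or NCCL) rather than read from a shared array, but the decomposition and stitching logic is the same; splitting the spatial domain, rather than only the batch, is what lets a model too large for one GPU's memory be trained at all.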