{"updated":"2025-01-19T07:34:33.208326+00:00","links":{},"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00241698","sets":["1164:2240:11467:11814"]},"path":["11814"],"owner":"44499","recid":"241698","title":["Asynchronous Decentralized Distributed K-FAC : Enhancing Training Efficiency and Load Balancing in Heterogeneous Environments (unreferred)"],"pubdate":{"attribute_name":"公開日","attribute_value":"2024-12-09"},"_buckets":{"deposit":"65699593-d461-4537-85cb-13be5b94b182"},"_deposit":{"id":"241698","pid":{"type":"depid","value":"241698","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"Asynchronous Decentralized Distributed K-FAC : Enhancing Training Efficiency and Load Balancing in Heterogeneous Environments (unreferred)","author_link":["665870","665868","665869","665867"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Asynchronous Decentralized Distributed K-FAC : Enhancing Training Efficiency and Load Balancing in Heterogeneous Environments (unreferred)"},{"subitem_title":"Asynchronous Decentralized Distributed K-FAC : Enhancing Training Efficiency and Load Balancing in Heterogeneous Environments (unreferred)","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"並列計算","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2024-12-09","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"Graduate School of Science and Technology, University of Tsukuba"},{"subitem_text_value":"Center for Computational Sciences, University of Tsukuba"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Graduate School of Science and Technology, University of Tsukuba","subitem_text_language":"en"},{"subitem_text_value":"Center for Computational Sciences, University of Tsukuba","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/241698/files/IPSJ-HPC24197005.pdf","label":"IPSJ-HPC24197005.pdf"},"date":[{"dateType":"Available","dateValue":"2026-12-09"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-HPC24197005.pdf","filesize":[{"value":"1.8 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"14"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"8ad867c9-f4c9-45e7-b227-aaa674d18414","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2024 by the Information Processing Society of Japan"}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Mingzhe, Yu"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Osamu, Tatebe"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Mingzhe, Yu","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Osamu, Tatebe","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10463942","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8841","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"We propose AD-KFAC, an asynchronous decentralized framework extending the K-FAC optimizer for large-scale distributed deep learning in heterogeneous and unstable environments. Our method incorporates dynamic load balancing and fault tolerance mechanisms to enhance scalability and robustness. By combining an asynchronous K-FAC computation module, an RPC communication module, and dynamic task allocation based on the Raft consensus algorithm, our framework mitigates the impact of straggler nodes and network instability. Experimental results on a 16-node cluster using ResNet-18 on the CIFAR-10 dataset demonstrate that AD-KFAC outperforms the baseline integrated with PyTorch Distributed Data Parallel under varying computation and communication delays, as well as in the presence of node crashes or disconnections. Our method maintains stable convergence and achieves comparable or better final accuracy, showcasing its effectiveness.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"We propose AD-KFAC, an asynchronous decentralized framework extending the K-FAC optimizer for large-scale distributed deep learning in heterogeneous and unstable environments. Our method incorporates dynamic load balancing and fault tolerance mechanisms to enhance scalability and robustness. By combining an asynchronous K-FAC computation module, an RPC communication module, and dynamic task allocation based on the Raft consensus algorithm, our framework mitigates the impact of straggler nodes and network instability. Experimental results on a 16-node cluster using ResNet-18 on the CIFAR-10 dataset demonstrate that AD-KFAC outperforms the baseline integrated with PyTorch Distributed Data Parallel under varying computation and communication delays, as well as in the presence of node crashes or disconnections. Our method maintains stable convergence and achieves comparable or better final accuracy, showcasing its effectiveness.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"10","bibliographic_titles":[{"bibliographic_title":"研究報告ハイパフォーマンスコンピューティング(HPC)"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2024-12-09","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"5","bibliographicVolumeNumber":"2024-HPC-197"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":241698,"created":"2025-01-19T01:46:27.400283+00:00"}