{"id":29195,"updated":"2025-01-22T17:41:34.482638+00:00","links":{},"created":"2025-01-18T22:59:06.710146+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00029195","sets":["1164:2240:2268:2270"]},"path":["2270"],"owner":"1","recid":"29195","title":["自律的な通信回復を行う Fault Tolerant MPI の実装と評価"],"pubdate":{"attribute_name":"公開日","attribute_value":"2003-08-04"},"_buckets":{"deposit":"5d736f45-bc0f-4d40-b38b-67165752d53b"},"_deposit":{"id":"29195","pid":{"type":"depid","value":"29195","revision_id":0},"owners":[1],"status":"published","created_by":1},"item_title":"自律的な通信回復を行う Fault Tolerant MPI の実装と評価","author_link":["0","0"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"自律的な通信回復を行う Fault Tolerant MPI の実装と評価"},{"subitem_title":"Implementation and Evaluation of a Fault Tolerant MPI with Reliable TCP/IP Sockets","subitem_title_language":"en"}]},"item_type_id":"4","publish_date":"2003-08-04","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"東京工業大学"},{"subitem_text_value":"東京工業大学"},{"subitem_text_value":"東京工業大学/国立情報学研究所"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Tokyo Institute of Technology","subitem_text_language":"en"},{"subitem_text_value":"Tokyo Institute of Technology","subitem_text_language":"en"},{"subitem_text_value":"Tokyo Institute of Technology/National Institute of Informatics","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/29195/files/IPSJ-HPC03095026.pdf"},"date":[{"dateType":"Available","dateValue":"2005-08-04"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-HPC03095026.pdf","filesize":[{"value":"154.0 kB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"14"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"4ec728ba-d14c-4591-b09a-86c66e01d1e4","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2003 by the Information Processing Society of Japan"}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"實本, 英之"},{"creatorName":"高宮, 安仁"},{"creatorName":"松岡, 聡"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Hideyuki, Jitsumoto","creatorNameLang":"en"},{"creatorName":"Yasuhito, Takamiya","creatorNameLang":"en"},{"creatorName":"Satoshi, Matsuoka","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10463942","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"クラスタシステムでは、ノード数の増大によりシステム全体の障害発生の潜在的可能性が高い。そのため、長時間にわたる計算を安定して行うには、耐故障性を持ったミドルウェアが必要になる。本研究では、耐故障性を持った MPI の実装と評価を行った。この MPI は逐次プロセスのチェックポインタと耐故障性通信路により MPI プロセスのチェックポインティング/リスタートを行う。ベースとしてMPICHを用い、通信路の耐故障性はRocks ライブラリ、チェックポインタはckpt ライブラリを用いた。32プロセスを用いたNPB-CGの結果、本実装では、オーバーヘッドがオリジナルMPICHの高々8%程度に抑えられることを確認した。","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"On cluster systems, failure rates tend to be high due to a large number of constituents. Therefore, to perform stable long-time computation on cluster systems, middleware support for fault-tolerancy is inevitably required. We implemented a fault-tolerant MPI prototype system and measured the overhead of the system. Our MPI system implements coordinated checkpointing and recovery protocol on MPICH using a single process checkpointer called ckpt and a reliable network called Rocks. Preliminary evaluation using NPB-CG with 32 processes showed the overhead posed by Rocks stayed within just 8% .","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"154","bibliographic_titles":[{"bibliographic_title":"情報処理学会研究報告ハイパフォーマンスコンピューティング(HPC)"}],"bibliographicPageStart":"149","bibliographicIssueDates":{"bibliographicIssueDate":"2003-08-04","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"83(2003-HPC-095)","bibliographicVolumeNumber":"2003"}]},"relation_version_is_last":true,"weko_creator_id":"1"}}