{"id":29363,"updated":"2025-01-22T17:36:03.101967+00:00","links":{},"created":"2025-01-18T22:59:13.988351+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00029363","sets":["1164:2240:2278:2280"]},"path":["2280"],"owner":"1","recid":"29363","title":["ユーザ透過な耐故障性を実現するMPIへ向けて"],"pubdate":{"attribute_name":"公開日","attribute_value":"2001-07-25"},"_buckets":{"deposit":"3a4c40a9-f206-4aaa-855d-3f6ccae2a1da"},"_deposit":{"id":"29363","pid":{"type":"depid","value":"29363","revision_id":0},"owners":[1],"status":"published","created_by":1},"item_title":"ユーザ透過な耐故障性を実現するMPIへ向けて","author_link":["0","0"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"ユーザ透過な耐故障性を実現するMPIへ向けて"},{"subitem_title":"Towards MPI with user - transparent fault tolerance","subitem_title_language":"en"}]},"item_type_id":"4","publish_date":"2001-07-25","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"東京工業大学"},{"subitem_text_value":"東京工業大学/科学技術振興事業団"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Tokyo Institute of Technology","subitem_text_language":"en"},{"subitem_text_value":"Tokyo Institute of Technology/JST","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/29363/files/IPSJ-HPC01087023.pdf"},"date":[{"dateType":"Available","dateValue":"2003-07-25"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-HPC01087023.pdf","filesize":[{"value":"469.5 kB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"14"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"3aad2a3e-f6ad-4c83-baec-c9558c3c39a9","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2001 by the Information Processing Society of Japan"}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"高宮, 安仁"},{"creatorName":"松岡, 聡"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Yasuhito, Takamiya","creatorNameLang":"en"},{"creatorName":"Satoshi, Matsuoka","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10463942","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"コモディティクラスタリングシステムにおける,ノード数規模の拡大,計算実行時間およびメモリ空間の急激なスケールアップに伴い,アプリケーションおよびシステムの障害発生の潜在的可能性への対処が急務となっている.しかし,クラスタ等の並列計算分野では,これまでこうした耐故障性についてのソフトウェア開発が重視されておらず,十分ではなかった.また,信頼性 ユーザ透過性 実行時オーバヘッドの兼ね合いをユーザが指定することのできる,柔軟な耐故障性機構が求められているが,従来のクラスタ向け耐故障性システムでは,単一のポリシ/機構専用のものがほとんどであった.加えて,実アプリケーションを用いた場合のオーバヘッドも明らかではなかった.本稿では,耐故障性機構をもつ MPI である,Parakeet システムを提案する.Parakeet システムを用いることによって,ユーザは性能を損ねることなく,容易に耐故障性,リカバリのポリシ/機構を指定できる.本稿では予備段階として,ユーザレベルチェックポインタ,プロセスマイグレーション Coordinated Checkpointing を MPICH 上にユーザ透過に実装した.予備的な評価の結果,Parakeet システムは移植性を保ちつつ効率的であり,本研究の将来的な目標であるプラグアンドプレイクラスタリングの基礎技術として有用であることがわかった.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"Rapid increase in the number of nodes as well as the massive scale of computing in terms of both time and memory space for commodity clustering is mandating the handling the potential failure of applications and system as the norm,while inherent fault tolerance and recovery have not been integral part of software tools being developed for parallel computing on such clusters.Moreover,flexible fault tolerance mechanisms in which the user could manage the balance of reliability vs.transparency vs.execution overhead would be vital,but most previous work on cluster fault tolerance have made available only a single policy and/or mechanism, and moreover, their overhead have not been exactly measured for practical applications. Instead, we propose a new fault tolerant MPI system called Parakeet which allows various fault tolerance and recovery mechanism could be easily specified by the user, while retaining the efficiency.As a preliminary basis,we have implemented a user-level,coordinated checkpointing and migration protocol on top of MPICH in a user-transparent fashion. By specifying new protocols based on the underlying Parakeet mechanism, one could achieve Plug-and-Play management of large-scale clusters.Preliminary benchmarks show that Parakeet is portable and efficient,and could well serve as a basis for Plug-and-Play clustering.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"134","bibliographic_titles":[{"bibliographic_title":"情報処理学会研究報告ハイパフォーマンスコンピューティング(HPC)"}],"bibliographicPageStart":"129","bibliographicIssueDates":{"bibliographicIssueDate":"2001-07-25","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"77(2001-HPC-087)","bibliographicVolumeNumber":"2001"}]},"relation_version_is_last":true,"weko_creator_id":"1"}}