{"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00207671","sets":["6164:6165:6210:10402"]},"path":["10402"],"owner":"44499","recid":"207671","title":["二人零和マルコフゲームにおけるオフ方策評価のためのQ学習"],"pubdate":{"attribute_name":"公開日","attribute_value":"2020-11-06"},"_buckets":{"deposit":"44ccd413-bc40-43ab-9c35-825dd0fc69ce"},"_deposit":{"id":"207671","pid":{"type":"depid","value":"207671","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"二人零和マルコフゲームにおけるオフ方策評価のためのQ学習","author_link":["518907","518904","518906","518905"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"二人零和マルコフゲームにおけるオフ方策評価のためのQ学習"},{"subitem_title":"Q-Learning for Off-Policy Evaluation in Two-Player Zero-Sum Markov Games","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"Off-Policy Evaluation","subitem_subject_scheme":"Other"},{"subitem_subject":"Multi-Agent Reinforcement Learning","subitem_subject_scheme":"Other"},{"subitem_subject":"Causal Inference","subitem_subject_scheme":"Other"}]},"item_type_id":"18","publish_date":"2020-11-06","item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_18_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"株式会社サイバーエージェント"},{"subitem_text_value":"株式会社サイバーエージェント"}]},"item_18_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"CyberAgent, Inc.","subitem_text_language":"en"},{"subitem_text_value":"CyberAgent, Inc.","subitem_text_language":"en"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/207671/files/IPSJ-GPWS2020027.pdf","label":"IPSJ-GPWS2020027.pdf"},"date":[{"dateType":"Available","dateValue":"2020-11-06"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-GPWS2020027.pdf","filesize":[{"value":"6.0 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"5"},{"tax":["include_tax"],"price":"0","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"18"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"1dd5734e-339a-427c-8818-94400c03eedd","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2020 by the Information Processing Society of Japan"}]},"item_18_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"阿部, 拳之"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"金子, 雄祐"}],"nameIdentifiers":[{}]}]},"item_18_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Kenshi, Abe","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yusuke, Kaneko","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_5794","resourcetype":"conference paper"}]},"item_18_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"オフ方策評価は,ある方策から取得した履歴データを使用してオフラインで新しい方策を評価する問題である.本研究では,二人零和マルコフゲームにおけるオフ方策評価のために,新しいQ 学習アルゴリズムであるBest Response (BR) Q-learning を提案する.BR 
Q-learning は,二人零和マルコフゲームにおける履歴データを用いて,与えられた戦略に対する最適反応戦略の状態行動価値関数を推定する.本論文では,BR Q-learning によって更新される状態行動価値関数が,最適反応戦略の状態行動価値関数へと確率1 で収束することを証明する. さらに,BR Q-learning を用いることで,与えられた戦略プロファイルのexploitability を推定する手法を提案し,推定されたexploitability が, 真のexploitability に確率1 で収束することを示す. また,実験によってBR Q-learning の有効性を確認する.","subitem_description_type":"Other"}]},"item_18_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"Off-policy evaluation (OPE) is the problem of evaluating new policies using historical data obtained from a different policy. In this study, we propose a novel Q-learning algorithm, called Best Response (BR) Q-learning, for OPE in two-player zero-sum Markov games. BR Q-learning estimates the state-action value of the best response to the given strategy. We prove that BR Q-learning converges the state-value of the best response with probability one. Further, we propose the novel off-policy estimator for exploitability using BR Q-learning. Then, we show that the estimated exploitability converges to the true exploitability with probability one. Finally, we demonstrate the effectiveness and performance of BR Q-learning through experiments.","subitem_description_type":"Other"}]},"item_18_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"174","bibliographic_titles":[{"bibliographic_title":"ゲームプログラミングワークショップ2020論文集"}],"bibliographicPageStart":"169","bibliographicIssueDates":{"bibliographicIssueDate":"2020-11-06","bibliographicIssueDateType":"Issued"},"bibliographicVolumeNumber":"2020"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"updated":"2025-01-19T19:05:46.469132+00:00","created":"2025-01-19T01:09:17.562665+00:00","links":{},"id":207671}
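For orientation, the abstract's description of BR Q-learning can be made concrete with a small sketch. Everything below is an assumption-laden illustration based only on the abstract in this record: the tabular setting, the transition format, the importance weighting over the opponent's logged actions, and all names (br_q_learning, pi_opponent, mu_opponent, history) are hypothetical and not the authors' published algorithm.

import numpy as np
from collections import defaultdict

def br_q_learning(history, pi_opponent, mu_opponent, n_actions,
                  gamma=0.99, alpha=0.1):
    # Sketch (assumed form): estimate the state-action value function of
    # player 1's best response to a fixed opponent strategy pi_opponent,
    # using logged transitions (s, a1, a2, r, s_next, done) collected
    # off-policy under a behavior profile.
    Q = defaultdict(lambda: np.zeros(n_actions))
    for s, a1, a2, r, s_next, done in history:
        # Importance weight re-targets the opponent's logged action a2
        # from the behavior strategy mu_opponent to the evaluated strategy.
        w = pi_opponent[s][a2] / max(mu_opponent[s][a2], 1e-8)
        # Q-learning target: maximizing over player 1's own actions is what
        # makes the fixed point a best-response value function.
        target = r + (0.0 if done else gamma * float(Q[s_next].max()))
        Q[s][a1] += alpha * w * (target - Q[s][a1])
    return Q

Under this reading, an exploitability estimate for a strategy profile would run the same routine once per player (swapping roles) and combine each player's best-response value at the initial state; the abstract's probability-one convergence claims are what would justify plugging the estimated Q into that computation.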