{"created":"2025-11-07T02:45:08.649409+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:02005559","sets":["1:11818:11916"]},"path":["11916"],"owner":"13","recid":"2005559","title":["5分で分かる!? 有名論文ナナメ読み:Matt Deitke, Christopher Clark, Sangho Lee, et al. : Molmo and PixMo : Open Weights and Open Data for State-of-the-Art Vision-Language Models"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2025-11-15"},"_buckets":{"deposit":"99a9da1c-3c80-4e8e-b8d2-309699074f26"},"_deposit":{"id":"2005559","pid":{"type":"depid","value":"2005559","revision_id":0},"owner":"13","owners":[13],"status":"published","created_by":13},"item_title":"5分で分かる!? 有名論文ナナメ読み:Matt Deitke, Christopher Clark, Sangho Lee, et al. : Molmo and PixMo : Open Weights and Open Data for State-of-the-Art Vision-Language Models","author_link":[],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"5分で分かる!? 有名論文ナナメ読み:Matt Deitke, Christopher Clark, Sangho Lee, et al. : Molmo and PixMo : Open Weights and Open Data for State-of-the-Art Vision-Language Models","subitem_title_language":"ja"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"連載","subitem_subject_scheme":"Other"}]},"item_type_id":"30","publish_date":"2025-11-15","item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_30_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"サイバーエージェント"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/2005559/files/IPSJ-MGN661208.pdf","label":"IPSJ-MGN661208.pdf"},"date":[{"dateType":"Available","dateValue":"2027-11-15"}],"billing":["billing_file"],"filename":"IPSJ-MGN661208.pdf","filesize":[{"value":"456.0 KB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"0","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"6e3075f2-7c5c-4957-b4e3-ca5ed6fa2be3","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2025 by the Information Processing Society of Japan"}]},"item_30_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"大谷,まゆ"}]}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_6501","resourcetype":"article"}]},"item_30_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN00116625","subitem_source_identifier_type":"NCID"}]},"item_30_publisher_14":{"attribute_name":"公開者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"item_30_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"近年,画像と言語を統合的に取り扱うマルチモーダルAIの性能が飛躍的に向上している一方で,その多くは非公開モデルであり,学習データやモデル構造が不明であることが研究の進展を妨げている.本記事では,こうした状況に対して透明性を重視し,モデル構造・学習データ・重みをすべて公開した視覚言語モデル「Molmo」と,その学習を支えるデータセット「PixMo」を解説する.まず,視覚言語モデルMolmoで採用された工夫を紹介し,次にPixMoの音声入力や大規模言語モデルを活用したデータ収集手法を概説する.最後に,これらの取り組みがもたらすAI研究への意義と今後の展望について考察する.","subitem_description_type":"Other"}]},"item_30_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"563","bibliographic_titles":[{"bibliographic_title":"情報処理"}],"bibliographicPageStart":"562","bibliographicIssueDates":{"bibliographicIssueDate":"2025-11-15","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"12","bibliographicVolumeNumber":"66"}]},"relation_version_is_last":true,"item_30_identifier_registration":{"attribute_name":"ID登録","attribute_value_mlt":[{"subitem_identifier_reg_text":"10.20729/0002005559","subitem_identifier_reg_type":"JaLC"}]},"weko_creator_id":"13"},"links":{},"id":2005559,"updated":"2025-12-17T01:54:28.491032+00:00"}