{"links":{},"id":2001903,"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:02001903","sets":["1164:4961:1739501972671:1744768172009"]},"path":["1744768172009"],"owner":"80578","recid":"2001903","title":["マルチモーダル大規模言語モデルを活用したOCRシステムの構築"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2025-05-10"},"_buckets":{"deposit":"7ba246f8-4386-45d6-9369-f0b7d53d0432"},"_deposit":{"id":"2001903","pid":{"type":"depid","value":"2001903","revision_id":0},"owners":[80578],"status":"published","created_by":80578},"item_title":"マルチモーダル大規模言語モデルを活用したOCRシステムの構築","author_link":[],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"マルチモーダル大規模言語モデルを活用したOCRシステムの構築","subitem_title_language":"ja"},{"subitem_title":"Building an OCR System Leveraging Multimodal Large Language Models","subitem_title_language":"en"}]},"item_type_id":"4","publish_date":"2025-05-10","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"名古屋大学"},{"subitem_text_value":"桜美林大学"},{"subitem_text_value":"東京大学"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Nagoya University","subitem_text_language":"en"},{"subitem_text_value":"J.F. Oberlin University","subitem_text_language":"en"},{"subitem_text_value":"Tokyo University","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/2001903/files/IPSJ-CH25138016.pdf","label":"IPSJ-CH25138016.pdf"},"date":[{"dateType":"Available","dateValue":"2027-05-10"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-CH25138016.pdf","filesize":[{"value":"528.5 KB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"24"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"598f874d-ffd0-4b68-aca5-56d0909464dd","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2025 by the Information Processing Society of Japan"}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"岩田,直也"}]},{"creatorNames":[{"creatorName":"田中,一孝"}]},{"creatorNames":[{"creatorName":"小川,潤"}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Naoya Iwata","creatorNameLang":"en"}]},{"creatorNames":[{"creatorName":"Ikko Tanaka","creatorNameLang":"en"}]},{"creatorNames":[{"creatorName":"Jun Ogawa","creatorNameLang":"en"}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN1010060X","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8957","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"近年，マルチモーダル大規模言語モデル（mLLM）は，画像とテキストを複合的に理解・処理する能力をもち，OCR（光学文字認識）用途にも高い有効性を発揮することが報告されている．とりわけGoogleが提供する最新のLLM（Gemini 2.0 Flash）は，高度な画像認識精度を備え，研究用途としても利用可能な価格帯で提供されるようになったが，実際の運用に際しては，プログラミングに関する知識やAPI操作の経験が必要になるという課題が存在する．本研究では，このような最新の商用マルチモーダルLLMをバックエンドで利用しつつ，自然言語による簡便な指示（プロンプト入力）で柔軟に出力内容を調整可能なOCRシステムを開発した．本システムでは，特に学術研究分野で求められる複雑なレイアウトや注釈要素の除外に特化したユーザーインターフェースと，抽出テキストの自動校正フローを設計し，ノーコードで高精度なOCR処理を実現できる．本発表では，システムの設計思想とワークフロー，さらに古典文献を題材にした具体的な精度検証の結果を報告し，人文情報学分野におけるマルチモーダルLLMを活用したOCRシステムの可能性と課題を論じる．","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"5","bibliographic_titles":[{"bibliographic_title":"研究報告人文科学とコンピュータ（CH）"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2025-05-10","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"16","bibliographicVolumeNumber":"2025-CH-138"}]},"relation_version_is_last":true,"weko_creator_id":"80578"},"created":"2025-04-30T01:52:40.667914+00:00","updated":"2025-04-30T01:52:45.527965+00:00"}