{"created":"2025-01-19T01:13:02.368189+00:00","updated":"2025-01-19T17:37:53.546399+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00211922","sets":["934:1022:10454:10606"]},"path":["10606"],"owner":"44499","recid":"211922","title":["Effects and Mitigation of Out-of-vocabulary in Universal Language Models"],"pubdate":{"attribute_name":"公開日","attribute_value":"2021-07-13"},"_buckets":{"deposit":"307e490c-c3e7-40dc-92eb-455e1eb883b7"},"_deposit":{"id":"211922","pid":{"type":"depid","value":"211922","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"Effects and Mitigation of Out-of-vocabulary in Universal Language Models","author_link":["539432","539431","539430","539429"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"Effects and Mitigation of Out-of-vocabulary in Universal Language Models"},{"subitem_title":"Effects and Mitigation of Out-of-vocabulary in Universal Language Models","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"[研究論文] Natural language processing, Machine learning, Transfer learning, Language models","subitem_subject_scheme":"Other"}]},"item_type_id":"3","publish_date":"2021-07-13","item_3_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"Tokyo Institute of Technology／Presently with Odd Concepts Inc."},{"subitem_text_value":"Tokyo Institute of Technology"}]},"item_3_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Tokyo Institute of Technology / Presently with Odd Concepts Inc.","subitem_text_language":"en"},{"subitem_text_value":"Tokyo Institute of Technology","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"eng"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/211922/files/IPSJ-TOD1403003.pdf","label":"IPSJ-TOD1403003.pdf"},"date":[{"dateType":"Available","dateValue":"2023-07-13"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-TOD1403003.pdf","filesize":[{"value":"980.4 kB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"5"},{"tax":["include_tax"],"price":"0","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"13"},{"tax":["include_tax"],"price":"0","billingrole":"39"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"fffb04d7-0ac7-484e-a735-9030bbe52476","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2021 by the Information Processing Society of Japan"}]},"item_3_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Sangwhan, Moon"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Naoaki, Okazaki"}],"nameIdentifiers":[{}]}]},"item_3_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Sangwhan, Moon","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Naoaki, Okazaki","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_3_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AA11464847","subitem_source_identifier_type":"NCID"}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_6501","resourcetype":"journal article"}]},"item_3_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"1882-7799","subitem_source_identifier_type":"ISSN"}]},"item_3_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"One of the most important recent natural language processing (NLP) trends is transfer learning - using representations from language models implemented through a neural network to perform other tasks. While transfer learning is a promising and robust method, downstream task performance in transfer learning depends on the robustness of the backbone model's vocabulary, which in turn represents both the positive and negative characteristics of the corpus used to train it. With subword tokenization, out-of-vocabulary (OOV) is generally assumed to be a solved problem. Still, in languages with a large alphabet such as Chinese, Japanese, and Korean (CJK), this assumption does not hold. In our work, we demonstrate the adverse effects of OOV in the context of transfer learning in CJK languages, then propose a novel approach to maximize the utility of a pre-trained model suffering from OOV. Additionally, we further investigate the correlation of OOV to task performance and explore if and how mitigation can salvage a model with high OOV.\n------------------------------\nThis is a preprint of an article intended for publication Journal of\nInformation Processing(JIP). This preprint should not be cited. This\narticle should be cited as: Journal of Information Processing Vol.29(2021) (online)\n------------------------------","subitem_description_type":"Other"}]},"item_3_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"One of the most important recent natural language processing (NLP) trends is transfer learning - using representations from language models implemented through a neural network to perform other tasks. While transfer learning is a promising and robust method, downstream task performance in transfer learning depends on the robustness of the backbone model's vocabulary, which in turn represents both the positive and negative characteristics of the corpus used to train it. With subword tokenization, out-of-vocabulary (OOV) is generally assumed to be a solved problem. Still, in languages with a large alphabet such as Chinese, Japanese, and Korean (CJK), this assumption does not hold. In our work, we demonstrate the adverse effects of OOV in the context of transfer learning in CJK languages, then propose a novel approach to maximize the utility of a pre-trained model suffering from OOV. Additionally, we further investigate the correlation of OOV to task performance and explore if and how mitigation can salvage a model with high OOV.\n------------------------------\nThis is a preprint of an article intended for publication Journal of\nInformation Processing(JIP). This preprint should not be cited. This\narticle should be cited as: Journal of Information Processing Vol.29(2021) (online)\n------------------------------","subitem_description_type":"Other"}]},"item_3_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographic_titles":[{"bibliographic_title":"情報処理学会論文誌データベース（TOD）"}],"bibliographicIssueDates":{"bibliographicIssueDate":"2021-07-13","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"3","bibliographicVolumeNumber":"14"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":211922,"links":{}}