{"created":"2025-01-19T01:33:27.083431+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00232530","sets":["1164:5159:11541:11549"]},"path":["11549"],"owner":"44499","recid":"232530","title":["CTC音声認識モデルにおけるビームサーチデコーディング内での暗黙的言語情報の置換"],"pubdate":{"attribute_name":"公開日","attribute_value":"2024-02-22"},"_buckets":{"deposit":"2c685bf7-8753-4ea2-9d2d-f1d0eea383c5"},"_deposit":{"id":"232530","pid":{"type":"depid","value":"232530","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"CTC音声認識モデルにおけるビームサーチデコーディング内での暗黙的言語情報の置換","author_link":["629603","629607","629605","629608","629604","629606","629601","629602"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"CTC音声認識モデルにおけるビームサーチデコーディング内での暗黙的言語情報の置換"},{"subitem_title":"Substitution of Implicit Linguistic Information in Beam Search Decoding Using CTC-based Speech Recognition Models","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"ポスターセッション2 SP/SLP","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2024-02-22","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"豊橋技術科学大学"},{"subitem_text_value":"豊橋技術科学大学"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"豊橋技術科学大学"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Toyohashi University of Technology","subitem_text_language":"en"},{"subitem_text_value":"Toyohashi University of Technology","subitem_text_language":"en"},{"subitem_text_value":"NTT Corporation","subitem_text_language":"en"},{"subitem_text_value":"Toyohashi University of Technology","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/232530/files/IPSJ-SLP24151060.pdf","label":"IPSJ-SLP24151060.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP24151060.pdf","filesize":[{"value":"1.1 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"9700ac0e-e3ed-449b-b668-577554bf4fb7","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2024 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"高城, 巽成"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"若林, 佑幸"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"小川, 厚徳"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"北岡, 教英"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Tatsunari, Takagi","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yukoh, Wakabayashi","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Atsunori, Ogawa","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Norihide, Kitaoka","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"自動音声認識の分野ではニューラルネットワークの台頭により,音声認識モデルの精度が向上し,字幕生成や議事録作成など様々な分野で利用されている.しかしながら,音声認識モデルの学習と推論に用いるデータのドメインが異なる場合,認識精度が低下する問題がある.この問題を解決するドメイン適応手法として,Shallow FusionやDensity Ratio Approach (DRA) が提案されており,音声認識モデルの再学習なしにドメイン適応が可能であるため,導入のコストが低く,より実用的な手法となっている.我々の先行研究では,CTC デコーダを用いた音声認識モデルにおけるDRA の適用を検討したが,ビームサーチと併用した場合の DRA の適用については検討されていない.そこで本研究では,CTC デコーダを用いた音声認識モデルにおけるビームサーチ内での DRA の適用を検討した.ビームサーチ内で DRA を適用させるためにアルゴリズムを拡張した.実験において,加算用と減算用の言語モデルについてさまざまな種類の言語モデルを組み合わせ,実験を行った.実験の結果として,ビームサーチ内での DRA の適用により,1-gram 言語モデルから 6-gram 言語モデルを用いた様々な組み合わせで認識精度が向上することを示した.特に,減算用言語モデルに 1-gram 言語モデルを用いた場合に最も認識精度が向上することを示した.これは CTC デコーダを用いた音声認識モデルにおける暗黙の言語情報が 1-gram 言語モデルに近い言語情報を持っており,正しく言語情報を置換できたことで認識精度が向上したと考えられる.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"The rise of neural networks in the field of automatic speech recognition has notably improved the accuracy of speech recognition models, making them widely applicable in areas such as subtitle generation and meeting transcription. However, a challenge arises when there is a mismatch between the domains of data used for training and inference, leading to reduced accuracy. To address this issue, domain adaptation techniques like Shallow Fusion and Density Ratio Approach (DRA) have been proposed. These methods enable domain adaptation without the need to retrain the speech recognition model, making them cost-effective and practical. In our prior research, we explored the application of DRA in speech recognition models using a Connectionist Temporal Classification (CTC) decoder. However, the use of DRA in conjunction with beam search had not been examined. Therefore, our current study investigates the application of DRA within beam search in CTC decoder-based speech recognition models. We expanded the algorithm to integrate DRA within beam search. Our experiments involved various combinations of additive and subtractive language models. The results demonstrated that applying DRA within beam search improved recognition accuracy across different combinations of language models, from 1-gram to 6-gram. Notably, the use of a 1-gram model as the subtractive language model showed the most significant improvement in accuracy. This suggests that the implicit linguistic information in CTC decoder-based speech recognition models is closely aligned with a 1-gram language model, and the correct replacement of linguistic information through DRA led to enhanced recognition accuracy.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理(SLP)"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2024-02-22","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"60","bibliographicVolumeNumber":"2024-SLP-151"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":232530,"updated":"2025-01-19T10:25:01.217196+00:00","links":{}}