% Technical report on "Language Model Replacement" for end-to-end ASR (2021).
% Cleaned up from a repository export:
%   - author holds the five authors once, in Japanese script to match the title;
%     the romanized forms are kept in the ignored field author_en, corrected to
%     "Family, Given" order.
%   - the abstract was moved out of note (which styles print) into abstract /
%     abstract_en; the text itself is unchanged.
%   - issue was renamed to number; month uses the predefined macro.
% NOTE(review): institution is required for this entry type and is absent.
%   The OAI prefix in the key suggests the IPSJ repository -- confirm the
%   issuing body and add an institution field.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00211592,
  author      = {森, 大輝 and 太田, 健吾 and 西村, 良太 and 小川, 厚徳 and 北岡, 教英},
  author_en   = {Mori, Daiki and Ohta, Kengo and Nishimura, Ryota and Ogawa, Atsunori and Kitaoka, Norihide},
  title       = {End-to-end音声認識モデルにおける暗黙的言語情報の置換法},
  number      = {17},
  month       = jun,
  year        = {2021},
  abstract    = {近年，End-to-end 音声認識が従来の DNN-HMM 音声認識と比べ，高速かつ簡潔であることから注目されている．さらに大量のテキストデータによって学習された言語モデルを併用することで，認識精度が向上すると報告されている．本稿では，音声認識モデルと言語モデルの一般的な統合方法とされる Shallow Fusion を応用した新しい言語モデルの統合方法である Language Model Replacement を提案する．提案法では，事前学習済み音声認識モデルと事前学習済み言語モデルを用いる．提案法ではベイズ則に基づき，音声認識モデルに暗黙的に含まれる言語情報を差し替えることが可能となっている．我々の実験では，学術講演音声データを使用して学習された音声認識モデル内部の言語情報を，模擬講演テキストデータで学習した言語モデルによって差し替えた．模擬講演ドメインにおける提案法の CER は Shallow Fusion での認識精度と比較して，1.3 ポイント上回った．},
  abstract_en = {Recently, end-to-end speech recognition has attracted much attention because it is faster and more concise than conventional DNN-HMM speech recognition. It has also been reported that recognition performance is improved by employing a language model trained with a large amount of text data. Based on these observations, we propose a new language model integration method which we call Language Model Replacement. In our proposed method, we use a pre-trained speech recognition model and a pre-trained language model. In contrast to the Shallow Fusion method, our proposed method can replace the linguistic information implied in the ASR model with independently trained model based on Bayes' rule. In our experiments, the ASR linguistic information implicitly trained using the Japanese language Academic Presentation Speech corpus is replaced with the language model trained using the Japanese language Simulated Public Speech corpus. We then compare ASR performance for Japanese speech recognition tasks using the Character Error Rate (CER). Our proposed Language Model Replacement method achieved 1.3 percent lower CER in comparison to the Shallow Fusion method.},
}