% Technical report (2016): DNN-based voice activity detection made robust to
% background speech by phone-entropy frame rejection.
% Maintenance notes for this entry:
%  - author lists each of the two authors once (Japanese form, matching the
%    title language); the romanized names are kept in the non-standard field
%    author_en, which standard styles ignore.
%  - The Japanese and English abstracts live in abstract / abstract_en rather
%    than note, so they are not printed as part of the citation. Percent signs
%    are escaped because a bare one would start a LaTeX comment in the output.
%  - NOTE(review): this entry type requires an institution field and the
%    exported record does not provide one. Presumably IPSJ, judging only from
%    the repository host in the citation key; confirm and add.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00169880,
 author = {藤田, 悠哉 and 磯, 健一},
 author_en = {Fujita, Yuya and Iso, Ken-ichi},
 title = {音素エントロピーを利用した背景発話に頑健な{DNN}に基づく音声区間検出},
 number = {9},
 month = jul,
 year = {2016},
 abstract = {DNN に基づく音声区間検出に音素エントロピーによる棄却を追加することで背景発話の誤検出を抑制する方法を提案する．我々が運用している音声認識サービスでは，DNN に基づく音声区間検出を採用している．音声区間検出の誤りを観察したところ，そのほとんどが TV またはラジオや周囲の人の会話に由来する背景発話の誤検出だった．本稿ではそのような誤検出を抑制するために，DNN 音響モデルの音素事後確率のエントロピーに基づく信頼度スコアを導入する．背景発話はユーザーが音声認識サービスの利用を意図して行う発話よりもマイクロフォンとの距離が遠いことが多く，ノイズや残響の影響を受けやすい．従って背景発話音声は音素事後確率のエントロピーが大きな値を持つと考えられる．そこで，DNN に基づく音声区間検出により音声と判定されたフレームのうち，音素事後確率のエントロピーが閾値以上のフレームを棄却し，背景発話による誤検出を抑制する．実験により，音声認識サービスの文誤り率が 10\%以上削減できることを確認した．},
 abstract_en = {We propose a DNN-based voice activity detector augmented by entropy based frame rejection. DNN-based VAD classifies a frame into speech or non-speech and achieves significantly higher VAD performance compared to conventional statistical model-based VAD. We observed that many of the remaining errors are false alarms caused by background human speech, such as TV / radio or surrounding peoples' conversations. In order to reject such background speech frames, we introduce an entropy-based confidence measure using the phone posterior probability output by a DNN-based acoustic model. Compared to the target speaker's voice background speech tends to have relatively unclear pronunciation or is contaminated by other types of noises so its entropy becomes larger than audio signals with only the target speaker's voice. Combining DNN-based VAD and the entropy criterion, we reject speech frames classified by the DNN-based VAD as having an entropy larger than a threshold value. We have evaluated the proposed approach and confirmed greater than 10\% reduction in Sentence Error Rate.}
}