% Journal article on throat-microphone speech recognition (Japanese-language paper).
% `author` holds the names as printed in the paper's language; the romanized
% names and both abstracts live in non-printing fields (`author_en`, `abstract`,
% `abstract_en`) so they do not leak into the formatted reference list.
@article{oai:ipsj.ixsq.nii.ac.jp:00211654,
  author      = {鈴木, 貴仁 and 緒方, 淳 and 綱川, 隆司 and 西田, 昌史 and 西村, 雅史},
  author_en   = {Suzuki, Takahito and Ogata, Jun and Tsunakawa, Takashi and Nishida, Masafumi and Nishimura, Masafumi},
  title       = {咽喉マイクを用いた大語彙音声認識のための特徴マッピングによるデータ拡張と知識蒸留},
  journal     = {情報処理学会論文誌},
  volume      = {62},
  number      = {6},
  pages       = {1373--1381},
  year        = {2021},
  month       = jun,
  abstract    = {咽喉マイクは接話マイクのような一般的なマイクよりも外部雑音に頑健であるが,一般的なマイクとの音響ミスマッチが大きく,通常の音声認識システムでは認識精度が低下する.また,大量の音声データが利用可能という状況にもない.本研究では接話マイクと咽喉マイクで同時収録した小規模パラレルデータを活用した咽喉マイク音声認識のための学習手法を提案する.提案手法では,まず既存の大規模音声データベースから抽出した接話マイク特徴量を咽喉マイクの特徴量空間にマッピングし,咽喉マイク用音響モデル(DNN-HMM)の学習データを拡張する.このとき特徴マッピングはパラレルデータを用いてLSTMによって学習する.続いて,特徴マッピングによって得た特徴量でDNN-HMMを初期学習し,これを生徒モデルとする.そして,大量の接話マイク特徴量で学習したDNN-HMMを教師モデルとし,知識蒸留に基づき生徒モデルの再学習を行う.読み上げ音声を用いた評価の結果,提案法は咽喉マイク音声のみで学習したDNN-HMMと比べて約36.5\%の文字誤り率の削減を達成した.},
  abstract_en = {Throat microphones are more robust against external noise than conventional acoustic microphones such as close-talk. However, automatic speech recognition (ASR) performance is degraded when throat microphone speech signals are simply input to a general (clean) ASR system due to large acoustic mismatches. Moreover, the amount of throat microphone speech data is not enough to train accurate ASR systems. In this study, we propose a training approach for throat microphone ASR utilizing a small parallel corpus simultaneously recorded by close-talk and throat microphones. As a data-augmentation process, existing large-amount close-talk microphone features are transformed to a throat microphone feature space with the LSTM-based feature mapping which is trained from the parallel corpus. The DNN-HMM is then pre-trained with the mapped features, and fine-tuned by knowledge distillation from a DNN-HMM trained with a large amount of close-talk microphone speech data. Experimental results using read speech data showed that the proposed approach achieved 36.5\% relative improvement of character error rate compared to the DNN-HMM trained only with throat microphone speech data.},
}