% Technical report entry; the report number carries the series tag 2008-SLP-073.
% Each author is listed twice: first in Japanese script, then romanized
% (both in "Family, Given" order so BibTeX parses the surname correctly).
% The bilingual abstract lives in `abstract` (not `note`) so that styles do not
% print it in the reference list; literal percent signs are escaped for LaTeX.
% NOTE(review): `institution` is inferred from the citation key's host
% (ipsj.ixsq.nii.ac.jp) -- confirm against the published report.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00056623,
  author      = {福田, 隆 and 市川, 治 and 西村, 雅史 and Fukuda, Takashi and Ichikawa, Osamu and Nishimura, Masafumi},
  title       = {長時間スペクトル変動情報と調波構造特徴量を併用した発話区間検出法},
  institution = {Information Processing Society of Japan},
  number      = {102(2008-SLP-073)},
  month       = oct,
  year        = {2008},
  abstract    = {高精度な発話区間検出 (VAD) の実現は,音声認識性能に直結する重要な課題であるが,高騒音下ではいまだ性能が乏しい.本報告では,スペクトルの長時間変動に着目し,低 S/N 環境下における VAD 性能の改善を目指す.提案手法は,平均音素長以上の区間から長時間変動を抽出することにより,検出性能を大幅に改善することを示す.続いて,低 S/N 環境における過剰な湧き出し誤りの削減のため,音声の調波構造に基づく特徴量を VAD システムに導入する.この特徴量は,基本周波数 (F0) の明示的な推定を必要とせず,スペクトルの長時間変動情報と併用することで高い性能を実現する.提案システムは,CENSREC-1-C を用いた評価実験において,雑音環境下での性能を顕著に改善し,標準化手法である ETSI AFE-VAD に対して 77.7\% の誤り削減を達成した., Accurate voice activity detection (VAD) is important for robust automatic speech recognition (ASR) systems. However the VAD system can often fail to detect speech present segments in low S/N environments. This paper first proposes a noise-robust VAD system using long-term temporal information in speech. Long-term temporal information has been an ASR focus recently, but has not been investigated sufficiently for VAD. This paper describes an attempt to incorporate long-term temporal information into a feature parameter set by using a longer window length than average phoneme duration. Next, harmonic structure-based feature extraction is applied to the VAD system in order to reduce false alerts in low S/N environments. The proposed feature extraction doesn’t need an explicit fundamental frequency estimation. The VAD system combining long-term features with harmonic structure-based features led to considerable improvements in noisy environments and had 77.7\% error reduction as compared to the standardized ETSI AFE-VAD.},
}