% Journal article in 情報処理学会論文誌, vol. 58, no. 9 (September 2017), pp. 1555--1564.
% `author` lists the four authors in Japanese script, each as "Family, Given".
% `author_en` is a non-standard field (ignored by bibliography styles) that keeps
% the romanized names of the same four authors, also as "Family, Given".
% `abstract` holds the Japanese abstract followed by its English version, verbatim
% as exported; it is kept out of `note` so that styles do not print it.
@article{oai:ipsj.ixsq.nii.ac.jp:00183616,
  author    = {金川, 裕紀 and 太刀岡, 勇気 and 渡部, 晋治 and 石井, 純},
  author_en = {Kanagawa, Hiroki and Tachioka, Yuuki and Watanabe, Shinji and Ishii, Jun},
  title     = {音声認識のための回帰木に基づく複数の変換行列の重み付けによる特徴量空間の適応},
  journal   = {情報処理学会論文誌},
  volume    = {58},
  number    = {9},
  pages     = {1555--1564},
  year      = {2017},
  month     = sep,
  abstract  = {音声認識では適応が重要である．特徴量空間での適応（fMLLR）は，特徴量ベクトル系列に単一の変換行列を乗算することで実現されるため，デコーディング処理とは独立な，特徴量に関する前処理として実装できる．このためガウス混合分布（GMM）と同様にディープ・ニューラルネットワーク（DNN）の音響モデルに対しても適用できる．一方でモデル空間の適応は，回帰木に基づき複数の変換行列を用いることで，単一の変換行列を用いるfMLLRよりも高い精度で適応が可能である．しかしこの手法には2つの課題がある．1つ目は適応とデコードに同じ生成モデル（例：GMM）の音響モデルを共有しなければならず，DNNの音響モデルには適用できないこと，2つ目は変換行列の数が多くなると，変換行列の推定が過学習しやすいことである．本論文では，1パスの状態アラインメント情報を用いてフレームごとに対応する複数の変換行列を対応付け，それらを用いて重み付け線形和で表現される変換行列で特徴量変換を行う手法を提案する．さらに2つ目の課題に対し，構造的な事前確率の導入により変換行列をMAP推定する，特徴量空間における構造的事後確率最大線形（fSMAPLR）を提案する．実験より，提案するfSMAPLRはfMLLRの性能を上回った．, In automatic speech recognition, an adaptation is important. Feature-space maximum-likelihood linear regression (fMLLR) transforms acoustic features to adapted ones by a multiplication operation with a single transformation matrix. This property realizes an efficient adaptation performed within a pre-precessing, which is independent of a decoding process, and this type of adaptation can be applied to deep neural network (DNN). On the other hand, model-space adaptations (i.e., CMLLR) improve the performance of fMLLR because it can use multiple transformation matrices based on a regression tree. However, there are two problems in the model-space adaptations: first, these types of adaptation cannot be applied to DNN because adaptation and decoding must share the same generative model, i.e., Gaussian mixture model (GMM). Second, transformation matrices tend to be over-estimated when the number of transformation matrices is large. This paper proposes to use multiple transformation matrices within a feature-space adaptation framework. The proposed method first estimates multiple transformation matrices in the GMM framework according to the first-pass decoding results and the alignments, and then takes a weighted sum of these matrices to obtain a single feature transformation matrix frame-by-frame. 
In addition, to address the second problem, we propose feature-space structural maximum a posteriori linear regression (fSMAPLR), which introduces hierarchal prior distributions to regularize the MAP estimation. Experimental results show that the proposed fSMAPLR outperformed fMLLR.},
}