% Technical report (2013) on voice conversion using speaker-dependent
% conditional restricted Boltzmann machines (CRBMs).
% `author` lists the three authors in Japanese script; the same three names in
% romanized "Family, Given" form are kept in the non-standard field
% `author-romanized`, which standard styles ignore.
% The bilingual (Japanese, then English) abstract is stored in `abstract` so
% that styles do not print it in the bibliography the way they print `note`.
% NOTE(review): the `institution` field required for a techreport is missing;
% judging by the citation key it is presumably the IPSJ -- confirm and add.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00096748,
  author           = {中鹿, 亘 and 滝口, 哲也 and 有木, 康雄},
  author-romanized = {Nakashika, Toru and Takiguchi, Tetsuya and Ariki, Yasuo},
  title            = {話者依存型 Conditional Restricted {Boltzmann} Machine による声質変換},
  number           = {14},
  month            = dec,
  year             = {2013},
  abstract         = {本研究では,元の音響特徴量空間よりも音韻性や時間変化性を抑え,話者性を強調させることによって,より入力話者音声の声質を出力話者のものへと変換しやすい話者依存空間を形成することを目的として,話者ごとに conditional restricted Boltzmann machine (CRBM) を用いた声質変換法を提案する.提案手法ではまず初めに,話者ごとに用意した学習データ (パラレルデータである必要は無い) を用いて,入力話者,出力話者の CRBM を独立に学習させる.次に,少量のパラレルデータの音響特徴量を,それぞれの CRBM を通して話者依存高次元空間へ写像 (CRBM の前方推論) し,その高次特徴量同士を Neural Network (NN) を用いて変換させる.NN の変換で得られた特徴量は,CRBM の後方推論によって元の音響特徴量へ逆変換することが可能である.評価実験では,従来の GMM や NN,DBN を用いた声質変換法に比べて,主観的にも客観的にも良い精度が得られたことを確認した., In this paper, we present a voice conversion (VC) method that utilizes conditional restricted Boltzmann machines (CRBMs) for each speaker to obtain time-invariant speaker-independent spaces where voice features are converted more easily than those in an original acoustic feature space. First, we train two CRBMs for a source and target speaker independently using speaker-dependent training data (without the need to parallelize the training data). Then, a small number of parallel data are fed into each CRBM and the high-order features produced by the CRBMs are used to train a concatenating neural network (NN) between the two CRBMs. Finally, the entire network (the two CRBMs and the NN) is fine-tuned using the acoustic parallel data. Through voice-conversion experiments, we confirmed the high performance of our method in terms of objective and subjective evaluations, comparing it with conventional GMM, NN, and speaker-dependent DBN approaches.},
}