% NOTE(review): @techreport requires an `institution` field, which this
% exported record lacks. The OAI host in the key suggests the IPSJ
% repository -- confirm the issuing body (and report number) and add them.
% The abstract was exported into `note`, which styles print in the
% bibliography; it is stored in `abstract` below, text unchanged.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00194516,
  author   = {郡山, 知樹 and 高道, 慎之介 and 小林, 隆夫},
  title    = {{GMMN}に基づく音声合成におけるグラム行列のスパース近似の検討},
  year     = {2019},
  month    = feb,
  issue    = {1},
  abstract = {人間の音声生成のように発話間変動を持つ音声合成の実現を目標とし，我々はこれまでに，生成的モーメントマッチングネットワーク (GMNN) に基づく音声パラメータのランダム生成手法を提案している．GMMN では分布間の距離を表す条件付き maximum mean discrepancy (CMMD) を最小にするようにニューラルネットワークを学習する．音声合成のように学習データのサイズが大きい場合，CMMD を直接求めることは計算量の観点から非現実的であり何らかの近似を行う必要があったが，これまで近似手法について十分な検討が行われていなかった．本研究では CMMD の計算手法として，変数同士の類似度を表すグラム行列に random Fourier features (RFF) を用いる近似手法を提案し従来のブロック対角近似手法との比較を行う．またミニバッチの選択手法して，従来のランダム選択の代わりに K-means クラスタリングを用いて，類似した入力変数を同じミニバッチとする手法を検討する．主観評価実験では提案法が従来法に比べ，発話間変動が知覚されやすいという結果を得た．, To realize human-like synthetic speech, synthetic speech samples should change every time even if the same sentences is spoken. In this context, we have proposed a technique of random sampling of synthetic speech parameters based on generative moment matching network (GMMN). GMMN is a neural network whose parameters are trained using conditional maximum mean discrepancy (CMMD) which represents the distance of two distributions. An issue of GMMN is that CMMD is computationally infeasible for a large amount of data, including speech synthesis database. In this report, we propose an approximation method based on random Fourier features and minibatch selection technique using K-means clustering. In the subjective evaluations, the proposed method outperformed the conventional one in the perception of inter-speech diversity.},
}