{"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00213239","sets":["1164:5159:10515:10722"]},"path":["10722"],"owner":"44499","recid":"213239","title":["大規模音声コーパスを用いたDNN-HSMM音声合成のためのモデル学習の検討"],"pubdate":{"attribute_name":"公開日","attribute_value":"2021-10-12"},"_buckets":{"deposit":"2fbfc062-37e6-4ae4-80ca-a1b03e0605eb"},"_deposit":{"id":"213239","pid":{"type":"depid","value":"213239","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"大規模音声コーパスを用いたDNN-HSMM音声合成のためのモデル学習の検討","author_link":["545294","545296","545293","545295"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"大規模音声コーパスを用いたDNN-HSMM音声合成のためのモデル学習の検討"},{"subitem_title":"A Study on model training for DNN-HSMM-based speech synthesis using a large-scale speech corpus","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"H/SP/SLP","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2021-10-12","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"株式会社KDDI 総合研究所"},{"subitem_text_value":"株式会社KDDI 総合研究所"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"KDDI Research, Inc.","subitem_text_language":"en"},{"subitem_text_value":"KDDI Research, Inc.","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/213239/files/IPSJ-SLP21138011.pdf","label":"IPSJ-SLP21138011.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP21138011.pdf","filesize":[{"value":"1.7 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"24434b6f-1fde-4218-b71b-70f496a31d93","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2021 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"西澤, 信行"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"服部, 元"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Nobuyuki, Nishizawa","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Gen, Hattori","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"本研究では,接続合成用に収集された大規模音声コーパスを用いて,DNN-HSMM 音声合成のためのモデル学習を行った.従来の HSMM 音声合成では入力である言語情報に対応する HSMM のパラメータを決定木で予測していたが,DNN-HSMM 音声合成はこの予測に DNN を用いており,より高い合成音品質が期待できる.しかし,HSMM の状態継続長分布のパラメータを同時に DNN で推定するため,モデル学習の初期段階では学習データに対して HSMM の状態のアラインメントを適切に行うことができず,確率的勾配法によるモデル学習が進まない可能性がある.特に DNN に LSTM(long short-term memory)を用いた RNN を用いた場合の学習時の挙動については充分な検討が行われていない.そこで本研究では大規模な音声学習セットを用いて,LSTM を用いた場合のモデル学習時の挙動について調べた.実験の結果,オプティマイザの学習率を適切に設定することで,パラメータをランダムに設定した初期状態からモデル学習が可能なこと,また,各層が 2048 セルの LSTM  で構成される 3 層の双方向 RNN を用いた場合,推定誤差性能が飽和する学習データサイズは 20.6 時間以上であることが分かった.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"In this study, an investigation into model training for DNN-HSMM-based speech synthesis using a large speech corpus collected for connection synthesis was conducted. While conventional HSMM-based speech synthesis uses decision trees to predict the HSMM parameters corresponding to the linguistic information, DNN-HSMM-based speech synthesis uses DNNs for this prediction. Thus, it is expected to synthesize higher quality sounds by the method. However, since the parameters of the state duration distributions of the HSMMs are simultaneously estimated by the training, the training by the stochastic gradient method may not properly progress in the early stage of model training where the states cannot be appropriately aligned with training data yet. In particular, the behavior of training of RNNs using LSTM (long short-term memory) for DNN-HSMM-based speech synthesis has not yet been sufficiently studied. The experimental results show that the model can be trained from the randomly initialized states by setting the learning rate of the optimizer appropriately, and the training data size at which performance of the prediction saturates is more than 20.6 hours where using a three-layer bidirectional RNN where each layer consists of 2048-cell LSTMs.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理(SLP)"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2021-10-12","bibliographicIssueDateType":"Issued"},"bibliographicVolumeNumber":"2021-SLP-138"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":213239,"updated":"2025-01-19T17:13:45.329577+00:00","links":{},"created":"2025-01-19T01:14:07.582363+00:00"}