{"id":226368,"updated":"2025-01-19T12:29:56.046274+00:00","links":{},"created":"2025-01-19T01:25:49.317731+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00226368","sets":["1164:5064:11199:11282"]},"path":["11282"],"owner":"44499","recid":"226368","title":["x-vector と音声認識コーパスを用いた多様な発話スタイルに対応する複数話者テキスト音声合成の評価"],"pubdate":{"attribute_name":"公開日","attribute_value":"2023-06-16"},"_buckets":{"deposit":"c17ed132-33cb-4113-99d4-54053fbfa3ca"},"_deposit":{"id":"226368","pid":{"type":"depid","value":"226368","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"x-vector と音声認識コーパスを用いた多様な発話スタイルに対応する複数話者テキスト音声合成の評価","author_link":["600863","600862","600855","600858","600859","600860","600854","600856","600857","600852","600853","600861"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"x-vector と音声認識コーパスを用いた多様な発話スタイルに対応する複数話者テキスト音声合成の評価"},{"subitem_title":"Evaluation of multi-speaker text-to-speech synthesis using a corpus for speech recognition with x-vectors for various speech styles","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"一般発表","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2023-06-16","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"和歌山大学／情報通信研究機構"},{"subitem_text_value":"情報通信研究機構"},{"subitem_text_value":"和歌山大学"},{"subitem_text_value":"情報通信研究機構"},{"subitem_text_value":"名古屋大学／情報通信研究機構"},{"subitem_text_value":"情報通信研究機構"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Wakayama University / National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"Wakayama University","subitem_text_language":"en"},{"subitem_text_value":"National Institute of 
Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"Nagoya University / National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"National Institute of Information and Communications Technology","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/226368/files/IPSJ-MUS23137064.pdf","label":"IPSJ-MUS23137064.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-MUS23137064.pdf","filesize":[{"value":"1.3 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"21"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"4ece0033-634e-43bc-91d1-7ec72dae5ec9","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2023 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"日田, 光紀"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"岡本, 拓磨"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"西村, 竜一"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"大谷, 大和"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"戸田, 智基"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"河井, 
恒"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Koki, Hida","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Takuma, Okamoto","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Ryuichi, Nishimura","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yamato, Ohtani","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tomoki, Toda","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Hisashi, Kawai","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10438388","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8752","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"これまでに，合成時に対応できる話者数の増加を目的として，音声認識用コーパスである CSJ を用いて，また話者埋め込みとして x-vector を用いた複数話者テキスト音声合成を最新の End-to-End モデルである JETS を用いて実装した．本研究においては，未知話者として日本語話者 10 名 (CSJ とJVS)，英語話者 4 名 (CMU ARCTIC)，発話スタイルとして裏声およびささやき声 (JVS) を目的話者とすることにより，本モデルの実用性について検証する．また，感情音声 (JECS) とボーカロイド音声 (初音ミク) に関しても，目的話者として検証する．分析として，t-SNE を用いて次元削減した x-vector 
の比較を行った．これにより，本モデルによる合成音声が目的話者の話者性を再現可能であるかを検討した．","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"We have implemented multi-speaker end-to-end text-to-speech synthesis based on JETS using x-vectors as speaker embeddings and the CSJ automatic speech recognition corpus for synthesizing various kinds of speakers. In this study, we investigate the capacity of the model by using 10 Japanese speakers (CSJ and JVS) and 4 English speakers (CMU ARCTIC) as unknown speakers, and speech styles such as falsetto and whispered speech (JVS) as target speakers. In addition, emotional voice (JECS) and vocaloid voice (Hatsune Miku) are also tested as target speakers. We compared x-vectors with dimensionality reduction based on t-SNE. Based on this comparison, we examined whether speech synthesized by the model is able to reproduce the target speaker’s characteristics.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音楽情報科学（MUS）"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2023-06-16","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"64","bibliographicVolumeNumber":"2023-MUS-137"}]},"relation_version_is_last":true,"weko_creator_id":"44499"}}