{"created":"2025-01-19T01:25:52.570146+00:00","updated":"2025-01-19T12:28:48.403550+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00226425","sets":["1164:5159:11151:11283"]},"path":["11283"],"owner":"44499","recid":"226425","title":["全結合層型アップサンプリングを導入した高速ニューラル波形生成モデル"],"pubdate":{"attribute_name":"公開日","attribute_value":"2023-06-16"},"_buckets":{"deposit":"516558ed-2c56-40cc-8824-20aaacfeb985"},"_deposit":{"id":"226425","pid":{"type":"depid","value":"226425","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"全結合層型アップサンプリングを導入した高速ニューラル波形生成モデル","author_link":["601123","601129","601121","601130","601124","601131","601122","601132","601133","601126","601127","601134","601125","601128"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"全結合層型アップサンプリングを導入した高速ニューラル波形生成モデル"},{"subitem_title":"Fast Neural Waveform Generation Model With Fully Connected Upsampling","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"一般発表","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2023-06-16","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"神戸大学／情報通信研究機構"},{"subitem_text_value":"情報通信研究機構"},{"subitem_text_value":"神戸大学"},{"subitem_text_value":"情報通信研究機構"},{"subitem_text_value":"神戸大学"},{"subitem_text_value":"名古屋大学"},{"subitem_text_value":"情報通信研究機構"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Kobe University / National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"Kobe University","subitem_text_language":"en"},{"subitem_text_value":"National Institute of Information and Communications Technology","subitem_text_language":"en"},{"subitem_text_value":"Kobe 
University","subitem_text_language":"en"},{"subitem_text_value":"Nagoya University","subitem_text_language":"en"},{"subitem_text_value":"National Institute of Information and Communications Technology","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/226425/files/IPSJ-SLP23147054.pdf","label":"IPSJ-SLP23147054.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP23147054.pdf","filesize":[{"value":"1.8 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"56255c0e-3bf9-480f-94f7-810013db9631","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2023 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"山下, 陽生"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"岡本, 拓磨"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"高島, 遼一"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"大谷, 大和"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"滝口, 哲也"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"戸田, 智基"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"河井, 恒"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Haruki, 
Yamashita","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Takuma, Okamoto","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Ryoichi, Takashima","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yamato, Ohtani","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tetsuya, Takiguchi","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tomoki, Toda","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Hisashi, Kawai","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"近年，VITS や JETS といったモデルを用いることで高速かつ高品質なテキスト音声合成 (Text-to-Speech: TTS)が可能になった．しかし１つの CPU での Real Time Factor (RTF) は 1 弱であり，その品質を保ったまま更に推論速度を向上させることが求められている．ここでボコーダである HiFi-GAN が推論速度のボトルネックになっていることが知られており，HiFi-GAN の高速化モデルとして，Multi-stream (MS) HiFi-GAN，iSTFTNet，MS-iSTFT-HiFiGAN，などが提案されてきた．しかし本研究において，iSTFTNet の推論する中間特徴量は STFT 結果のスペクトログラムとは全く違う形になっており，特徴量を効率的に扱えていないことが分かった．そこで本稿では，iSTFT 構造を全結合層 (Fully Connected: FC) に変更した FC 構造を iSTFTNet と MS-iSTFT-HiFiGAN に適応した FC-HiFi-GAN と MS-FC-HiFiGANを提案する．FC 構造を用いたモデルは iSTFT 構造を用いるよりも中間特徴量を効率よく扱うことができ，VITS，JETSを用いた TTS 
による合成品質の向上が確認された．","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"In recent years, in text-to-speech synthesis, it is required to improve the inference speed while keeping the quality. Multi-stream (MS) iSTFT-HiFiGAN was proposed as a high-speed model of HiFi-GAN, a vocoder capable of inferring waveforms on single CPU. In the TTS task using VITS, although there was some deterioration in sound quality, the speed was increased by about 4 times. In this paper, we propose a MS-FC-HiFiGAN in which the inverse short-time Fourier transform (iSTFT) part is changed to trainable fully connected layer for the purpose of improving the synthesis quality of the MS-iSTFT-HiFiGAN. As for the inference speed, RTF was 0.15 on 1 CPU, which is the same as MS-iSTFT-HiFiGAN. Synthesis quality was inferior to that of MS-iSTFT-HiFiGAN in TTS task, but was superior to that in analysis/synthesis task.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理（SLP）"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2023-06-16","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"54","bibliographicVolumeNumber":"2023-SLP-147"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":226425,"links":{}}