% Technical report entry; the citation key has the form of an OAI identifier
% from ipsj.ixsq.nii.ac.jp and is kept verbatim so existing citations resolve.
% Cleanup relative to the raw export:
%   - author: the export listed every person twice (Japanese + romanized, the
%     latter with family/given names swapped). Japanese names stay in author;
%     the romanized names, now in Family, Given order, live in author_en
%     (a non-standard field that styles ignore).
%   - abstract / abstract_en: moved out of note, which styles would print
%     verbatim in the reference list.
%   - month uses the predefined macro; issue became number.
% NOTE(review): institution is required for techreport but is absent from the
%   export; presumably IPSJ given the repository host in the key. Confirm and add.
% NOTE(review): number = 2 is the issue value from the export; verify it is the
%   intended report number.
% Non-ASCII field values need a Unicode-aware backend (e.g. Biber or upBibTeX).
@techreport{oai:ipsj.ixsq.nii.ac.jp:00224399,
  author      = {山下, 陽生 and 岡本, 拓磨 and 高島, 遼一 and 滝口, 哲也 and 戸田, 智基 and 河井, 恒},
  author_en   = {Yamashita, Haruki and Okamoto, Takuma and Takashima, Ryoichi and Takiguchi, Tetsuya and Toda, Tomoki and Kawai, Hisashi},
  title       = {{MS-FC-HiFiGAN} : 学習可能な軽量アップサンプリングを用いた高速ニューラル波形生成モデル},
  number      = {2},
  month       = feb,
  year        = {2023},
  abstract    = {近年テキスト音声合成 (Text-to-Speech: TTS) では品質を保ったまま推論速度を向上することが求められており，そのためニューラルボコーダの高速化が研究されている．Multi-Stream (MS) iSTFT-HiFi-GAN は 1CPU でも音声波形を推論可能なボコーダである HiFi-GAN の高速モデルとして提案され，VITS を用いた TTS タスクにおいて若干の音質の劣化があったものの約 4 倍の高速化がなされた．そこで本稿では，MS-iSTFT-HiFiGAN の合成品質向上を目的として逆短時間フーリエ変換 (iSTFT) 部を学習可能な全結合層へと変更した MS-FC-HiFi-GAN を提案する．このモデルについて，分析合成タスクとテキスト音声合成タスクの 2 つのタスクにおいて推論速度，合成品質を既存のHiFi-GAN の高速モデルと比較を行った．実験の結果，分析合成タスクにおける提案モデルの推論速度は 1CPU において 0.15 の Real Time Factor となり，MS-iSTFT-HiFiGAN と同程度であることが確認された．また提案モデルの合成品質は，TTS タスクではMS-iSTFT-HiFiGAN に劣ったものの分析合成では上回る結果となった．},
  abstract_en = {In recent years, in text-to-speech synthesis, it is required to improve the inference speed while keeping the quality. Multi-stream(MS) iSTFT-HiFiGAN was proposed as a high-speed model of HiFi-GAN, a vocoder capable of inferring waveforms on single CPU. In the TTS task using VITS, although there was some deterioration in sound quality, the speed was increased by about 4 times. In this paper, we propose a MS-FC-HiFi-GAN in which the inverse short-time Fourier transform (iSTFT) part is changed to trainable fully connected layer for the purpose of improving the synthesis quality of the MS-iSTFT-HiFiGAN. As for the inference speed, RTF was 0.15 on 1 CPU, which is the same as MS-iSTFT-HiFiGAN. Synthesis quality was inferior to that of MS-iSTFT-HiFiGAN in TTS task, but was superior to that in analysis/synthesis task.},
}