% Technical report exported from a repository record (cleaned up by hand).
% Changes relative to the raw export:
%   - the author list was present twice (native-script and romanised) and is now
%     a single six-person list in "Family, Given" form;
%   - the abstract was stored twice inside the note field and is now stored once
%     in the abstract field, with the percent sign escaped;
%   - month uses the predefined macro; issue was renamed to number.
% NOTE(review): the export gave "Lester, Phillip Violeta"; the family name is
%   taken to be Violeta here -- confirm against the report itself.
% NOTE(review): techreport requires an institution field, which the export did
%   not supply. The citation key points at the IPSJ repository, so the issuing
%   body is presumably IPSJ -- confirm and add it.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00228438,
  author        = {Violeta, Lester Phillip and Huang, Wen-Chin and Ma, Ding and Yamamoto, Ryuichi and Kobayashi, Kazuhiro and Toda, Tomoki},
  author-native = {Violeta, Lester Phillip and Huang, Wen-Chin and Ma, Ding and 山本, 龍一 and 小林, 和弘 and 戸田, 智基},
  title         = {言語表現による喉頭摘出者のための音声強調システム},
  number        = {8},
  month         = oct,
  year          = {2023},
  abstract      = {Although pretraining and fine-tuning approaches have proven to work well in speech
                   intelligibility enhancement, various mismatches, such as the speech type mismatch or
                   a speaker mismatches between the datasets used in each stage, can deteriorate the
                   conversion performance of this framework. We propose a linguistic encoder robust
                   enough to project both EL and typical speech in the same latent space, while still
                   being able to extract accurate linguistic information, creating a unified
                   representation to reduce the speech type mismatch. Furthermore, we introduce HuBERT
                   output features to the proposed framework for reducing the speaker mismatch. Such a
                   framework makes it possible to effectively use a large-scale parallel dataset during
                   pretraining. We show that compared to the conventional framework using
                   mel-spectrogram input and output features, using the proposed framework enables the
                   model to synthesize more intelligible and naturally sounding speech, as shown by a
                   significant 16\% improvement in character error rate and 0.83 improvement in
                   naturalness score.},
}