% Technical report on a speech enhancement system for laryngectomees (Japanese title, English abstract).
% Cleaned from a repository export: the author list and the abstract each appeared twice.
% - author: one list of six people in "Family, Given" form, in the script of the Japanese-language record.
% - author_romanized: non-standard field (ignored by styles) that keeps the romanized names from the export.
% - abstract: moved out of the printed note field, de-duplicated, PDF ligatures normalised, percent sign escaped.
% NOTE(review): techreport entries require an institution field; the exported record has none.
%   The citation key suggests the IPSJ repository - confirm the issuing body and add it.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00228438,
 author = {Violeta, Lester Phillip and Huang, Wen-Chin and Ma, Ding and 山本, 龍一 and 小林, 和弘 and 戸田, 智基},
 author_romanized = {Violeta, Lester Phillip and Huang, Wen-Chin and Ma, Ding and Yamamoto, Ryuichi and Kobayashi, Kazuhiro and Toda, Tomoki},
 title = {言語表現による喉頭摘出者のための音声強調システム},
 number = {8},
 month = oct,
 year = {2023},
 abstract = {Although pretraining and fine-tuning approaches have proven to work well in speech intelligibility enhancement, various mismatches, such as the speech type mismatch or a speaker mismatches between the datasets used in each stage, can deteriorate the conversion performance of this framework. We propose a linguistic encoder robust enough to project both EL and typical speech in the same latent space, while still being able to extract accurate linguistic information, creating a unified representation to reduce the speech type mismatch. Furthermore, we introduce HuBERT output features to the proposed framework for reducing the speaker mismatch. Such a framework makes it possible to effectively use a large-scale parallel dataset during pretraining. We show that compared to the conventional framework using mel-spectrogram input and output features, using the proposed framework enables the model to synthesize more intelligible and naturally sounding speech, as shown by a significant 16\% improvement in character error rate and 0.83 improvement in naturalness score.}
}