@techreport{oai:ipsj.ixsq.nii.ac.jp:00184864,
  author = {Lorenzo-Trueba, Jaime and Henter, Gustav Eje and Takaki, Shinji and Yamagishi, Junichi},
  title  = {Analyzing the impact of including listener perception annotations in RNN-based emotional speech synthesis},
  issue  = {8},
  month  = {Dec},
  year   = {2017},
  note   = {This paper investigates simultaneous modeling of multiple emotions in DNN-based expressive speech synthesis, and how to represent the emotional labels, such as emotional class and strength, for this task. Our goal is to answer two questions: First, what is the best way to annotate speech data with multiple emotions? Second, how should the emotional information be represented as labels for supervised DNN training? We evaluate on a large-scale corpus of emotional speech from a professional actress, additionally annotated with perceived emotional labels from crowd-sourced listeners. By comparing DNN-based speech synthesizers that utilize different emotional representations, we assess the impact of these representations and design decisions on human emotion recognition rates.}
}