{"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00214117","sets":["1164:5159:10515:10760"]},"path":["10760"],"owner":"44499","recid":"214117","title":["VQVAEによって獲得されたキャラクター演技スタイルに基づく多話者オーディオブック音声合成"],"pubdate":{"attribute_name":"公開日","attribute_value":"2021-11-24"},"_buckets":{"deposit":"34c1b770-61da-48df-90a8-e220f3327dab"},"_deposit":{"id":"214117","pid":{"type":"depid","value":"214117","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"VQVAEによって獲得されたキャラクター演技スタイルに基づく多話者オーディオブック音声合成","author_link":["548729","548728","548723","548732","548730","548733","548727","548724","548734","548736","548726","548725","548735","548731"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"VQVAEによって獲得されたキャラクター演技スタイルに基づく多話者オーディオブック音声合成"},{"subitem_title":"Multi-speaker Audiobook Speech Synthesis using Discrete Character Acting Styles Acquired by VQVAE","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"音声合成","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2021-11-24","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":" 東京大学"},{"subitem_text_value":"東京大学"},{"subitem_text_value":"東京大学"},{"subitem_text_value":"東京大学"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"東京大学"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"The University of Tokyo","subitem_text_language":"en"},{"subitem_text_value":"The University of Tokyo","subitem_text_language":"en"},{"subitem_text_value":"The University of Tokyo","subitem_text_language":"en"},{"subitem_text_value":"The University of Tokyo","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"The University of Tokyo","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/214117/files/IPSJ-SLP21139023.pdf","label":"IPSJ-SLP21139023.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP21139023.pdf","filesize":[{"value":"1.4 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"1db9a2a6-d448-4455-b60e-918480d716f2","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2021 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"中田, 亘"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"郡山, 知樹"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"高道, 慎之介"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"齋藤, 佑樹"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"井島, 
勇祐"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"増村, 亮"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"猿渡, 洋"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Wataru, Nakata","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tomoki, Koriyama","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Shinnosuke, Takamichi","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yuki, Saito","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Yusuke, Ijima","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Ryo, Masumura","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Hiroshi, Saruwatari","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"本研究では,Vector Quantized Variational AutoEncoder (VQVAE) を用いたキャラクター演技スタイルの抽出,及びそれを用いた多話者オーディオブック音声合成を提案する.声優によるオーディオブック音声では,登場人物の属性などにより異なるキャラクター演技スタイルが含まれため,オーディオブック音声合成においても異なるキャラクター演技スタイルを実現することが望まれる.一方で,テキスト情報のみから登場人物の属性と対応するキャラクター演技スタイルを推測することは困難である.そこで本研究では,音声からキャラクター演技スタイルを抽出しそれに基づく多話者オーディオブック音声合成を提案する.主観評価では,提案法を用いることにより,より原音声に近いキャラクター演技スタイルが実現できることが確認された.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"In this paper, we propose a method of extracting discrete character acting styles using vector quantized variational autoencoder (VQVAE) and multi-speaker audiobook speech synthesis based on extracted character acting styles. In audiobook corpora uttered by voice talents, the speech utterances contain acting depending on the character’s attributes. Such acting should also be contained in synthesized audiobooks. However, predicting proper acting style and character attributes is still a hard challenge. To this end, we propose a method for extracting character acting styles from audiobook speech and conditioning TTS models by the extracted character acting styles to synthesize speech with character acting. 
The subjective evaluation shows that the proposed method achieves a character acting style closer to that of the ground-truth speech.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理(SLP)"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2021-11-24","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"23","bibliographicVolumeNumber":"2021-SLP-139"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":214117,"updated":"2025-01-19T16:53:44.384096+00:00","links":{},"created":"2025-01-19T01:14:57.002277+00:00"}
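The abstract above describes extracting discrete character acting styles with a VQVAE and conditioning a multi-speaker TTS model on them. The following is a minimal sketch, not the authors' implementation, of the vector-quantization step such a model could use to map a continuous style embedding taken from reference speech onto one of a small set of discrete acting-style codes; the codebook size, embedding width, commitment weight, and all identifiers are illustrative assumptions. The quantized embedding would then condition the TTS decoder alongside speaker and text features.

import torch
import torch.nn as nn
import torch.nn.functional as F

class VectorQuantizer(nn.Module):
    """Nearest-neighbour codebook lookup with a straight-through estimator,
    following the standard VQVAE objective (codebook + commitment losses).
    Hypothetical sizes: 8 acting-style codes, 64-dimensional embeddings."""

    def __init__(self, num_codes: int = 8, dim: int = 64, beta: float = 0.25):
        super().__init__()
        self.codebook = nn.Embedding(num_codes, dim)  # discrete acting-style codes
        self.codebook.weight.data.uniform_(-1.0 / num_codes, 1.0 / num_codes)
        self.beta = beta  # commitment-loss weight

    def forward(self, z_e: torch.Tensor):
        # z_e: (batch, dim) continuous style embeddings from a reference encoder
        distances = torch.cdist(z_e, self.codebook.weight)  # (batch, num_codes)
        indices = distances.argmin(dim=-1)                  # nearest code per utterance
        z_q = self.codebook(indices)                        # quantized style embeddings
        # Codebook loss pulls codes toward encoder outputs; commitment loss does the reverse.
        vq_loss = F.mse_loss(z_q, z_e.detach()) + self.beta * F.mse_loss(z_e, z_q.detach())
        # Straight-through estimator: copy gradients from z_q back to z_e.
        z_q = z_e + (z_q - z_e).detach()
        return z_q, indices, vq_loss

if __name__ == "__main__":
    vq = VectorQuantizer()
    z_e = torch.randn(4, 64)          # dummy style embeddings for 4 utterances
    z_q, idx, loss = vq(z_e)
    print(idx.tolist(), float(loss))  # chosen style codes and the VQ loss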