{"created":"2025-01-19T01:11:03.843529+00:00","updated":"2025-01-19T18:23:41.370153+00:00","metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:00209776","sets":["1164:5159:10515:10530"]},"path":["10530"],"owner":"44499","recid":"209776","title":["オーディオビジュアル目的話者抽出の 実環境動作に向けたattention機構の検討"],"pubdate":{"attribute_name":"公開日","attribute_value":"2021-02-24"},"_buckets":{"deposit":"7a530f68-cb65-4e0f-8573-d2d922257d71"},"_deposit":{"id":"209776","pid":{"type":"depid","value":"209776","revision_id":0},"owners":[44499],"status":"published","created_by":44499},"item_title":"オーディオビジュアル目的話者抽出の 実環境動作に向けたattention機構の検討","author_link":["529752","529754","529751","529747","529743","529744","529745","529750","529753","529746","529749","529748"],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"オーディオビジュアル目的話者抽出の 実環境動作に向けたattention機構の検討"},{"subitem_title":"Evaluation of Attention Fusion based Audio-Visual Target Speaker Extraction on Real Recordings","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"SP2","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2021-02-24","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"},{"subitem_text_value":"日本電信電話株式会社"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone 
Corporation","subitem_text_language":"en"},{"subitem_text_value":"Nippon Telegraph and Telephone Corporation","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/209776/files/IPSJ-SLP21136038.pdf","label":"IPSJ-SLP21136038.pdf"},"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP21136038.pdf","filesize":[{"value":"3.4 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_login","version_id":"208b8615-7257-415e-a7b3-ac6c9aeefcc7","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2021 by the Institute of Electronics, Information and Communication Engineers This SIG report is only available to those in membership of the SIG."}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"佐藤, 宏"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"落合, 翼"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"木下, 慶介"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"デルクロア, マーク"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"中谷, 
智広"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"荒木, 章子"}],"nameIdentifiers":[{}]}]},"item_4_creator_6":{"attribute_name":"著者名(英)","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"Hiroshi, Sato","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tsubasa, Ochiai","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Keisuke, Kinoshita","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Marc, Delcroix","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Tomohiro, Nakatani","creatorNameLang":"en"}],"nameIdentifiers":[{}]},{"creatorNames":[{"creatorName":"Shoko, Araki","creatorNameLang":"en"}],"nameIdentifiers":[{}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"目的話者抽出技術とは,目的話者に関する手がかりを用いて混合音からその話者の音声を抽出する技術である.従来目的話者手がかりとして,事前に登録した目的話者の音声(audio 手がかり) を用いる手法と,目的話者の顔の動画(visual手がかり) を用いる手法が提案されてきた.さらに近年ではより頑健に抽出が可能な手法としてaudio手がかりとvisual手がかり両者を用いるaudio-visual目的話者抽出が検討されている.これまでにaudio-visual目的話者抽出は,単一モーダルの手がかりを用いる手法と比較して頑健に動作することがシミュレーションデータに対して示されている.しかしその実環境への適用に関する検討はまだ十分なされていなかった.audio-visual 目的話者抽出の実環境適応における課題の 1 つは手がかりの欠損である.例えば撮影された 
visual手がかりが話者の手をはじめとする遮蔽物に隠された場合など,実環境において手がかりは必ずしも高い信頼性で得られるとは限らない.本研究では信頼性の低い手がかりの悪影響を低減し,より頑健な動作を実現するために,異なるモーダルから得られる手がかりの情報を信頼性に基づいて重みづけて統合する新たな attention機構とその学習方法を提案した.シミュレーションデータを用いた評価実験の結果,提案法は従来手法と比較して SDRを1.0 dB改善することが確認された. 加えて本研究では実収録データを作成し,提案法を用いた audio-visual目的話者抽出が実収録データに対しても動作することを示した.","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"The audio-visual target speech extraction, which aims at extracting a target speaker’s voice from a mixture with audio and visual clues, has received much interest. In previous works, the audio-visual target speaker extraction has shown more stable performance than single modality methods for simulated data. However, its adaptation towards realistic situations has not been fully explored as well as evaluations on real recorded mixtures. Especially, we focus on the clue corruption problem that occurs often in real recordings. In this work, we propose a novel attention mechanism for multi-modal fusion and its training methods that enable the selective use of more reliable clues. We record an audio-visual dataset of simultaneous speech with realistic visual clue corruption, and show that audio-visual target speech extraction with our proposals successfully works on real data as well as on simulated data.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"6","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理(SLP)"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2021-02-24","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"38","bibliographicVolumeNumber":"2021-SLP-136"}]},"relation_version_is_last":true,"weko_creator_id":"44499"},"id":209776,"links":{}}