{"created":"2026-02-18T10:49:53.527613+00:00","links":{},"updated":"2026-02-18T10:49:57.566616+00:00","id":2007606,"metadata":{"_oai":{"id":"oai:ipsj.ixsq.nii.ac.jp:02007606","sets":["1164:5159:1771204180543:1771204240542"]},"path":["1771204240542"],"owner":"80578","recid":"2007606","title":["音トークンのクロスドメイン変動分析：音声・音楽・環境音間の比較"],"pubdate":{"attribute_name":"PubDate","attribute_value":"2026-02-24"},"_buckets":{"deposit":"6e0dc0f0-1254-47b3-8736-55129f97c557"},"_deposit":{"id":"2007606","pid":{"type":"depid","value":"2007606","revision_id":0},"owners":[80578],"status":"published","created_by":80578},"item_title":"音トークンのクロスドメイン変動分析：音声・音楽・環境音間の比較","author_link":[],"item_titles":{"attribute_name":"タイトル","attribute_value_mlt":[{"subitem_title":"音トークンのクロスドメイン変動分析：音声・音楽・環境音間の比較","subitem_title_language":"ja"},{"subitem_title":"Cross-Domain Variation of Discrete Tokens: Comparative Analysis of Speech, Music, and Environmental Sounds","subitem_title_language":"en"}]},"item_keyword":{"attribute_name":"キーワード","attribute_value_mlt":[{"subitem_subject":"SLP","subitem_subject_scheme":"Other"}]},"item_type_id":"4","publish_date":"2026-02-24","item_4_text_3":{"attribute_name":"著者所属","attribute_value_mlt":[{"subitem_text_value":"NTT株式会社"},{"subitem_text_value":"NTT株式会社"},{"subitem_text_value":"NTT株式会社"},{"subitem_text_value":"NTT株式会社"},{"subitem_text_value":"NTT株式会社"}]},"item_4_text_4":{"attribute_name":"著者所属(英)","attribute_value_mlt":[{"subitem_text_value":"NTT, Inc.","subitem_text_language":"en"},{"subitem_text_value":"NTT, Inc.","subitem_text_language":"en"},{"subitem_text_value":"NTT, Inc.","subitem_text_language":"en"},{"subitem_text_value":"NTT, Inc.","subitem_text_language":"en"},{"subitem_text_value":"NTT, 
Inc.","subitem_text_language":"en"}]},"item_language":{"attribute_name":"言語","attribute_value_mlt":[{"subitem_language":"jpn"}]},"item_publisher":{"attribute_name":"出版者","attribute_value_mlt":[{"subitem_publisher":"情報処理学会","subitem_publisher_language":"ja"}]},"publish_status":"0","weko_shared_id":-1,"item_file_price":{"attribute_name":"Billing file","attribute_type":"file","attribute_value_mlt":[{"url":{"url":"https://ipsj.ixsq.nii.ac.jp/record/2007606/files/IPSJ-SLP26159037.pdf","label":"IPSJ-SLP26159037.pdf"},"date":[{"dateType":"Available","dateValue":"2028-02-24"}],"format":"application/pdf","billing":["billing_file"],"filename":"IPSJ-SLP26159037.pdf","filesize":[{"value":"1.6 MB"}],"mimetype":"application/pdf","priceinfo":[{"tax":["include_tax"],"price":"660","billingrole":"5"},{"tax":["include_tax"],"price":"330","billingrole":"6"},{"tax":["include_tax"],"price":"0","billingrole":"22"},{"tax":["include_tax"],"price":"0","billingrole":"44"}],"accessrole":"open_date","version_id":"5d030d04-513e-414a-bc88-7168b2844263","displaytype":"detail","licensetype":"license_note","license_note":"Copyright (c) 2026 by the Information Processing Society of Japan"}]},"item_4_creator_5":{"attribute_name":"著者名","attribute_type":"creator","attribute_value_mlt":[{"creatorNames":[{"creatorName":"芦原,孝典"}]},{"creatorNames":[{"creatorName":"デルクロア,マーク"}]},{"creatorNames":[{"creatorName":"落合,翼"}]},{"creatorNames":[{"creatorName":"松浦,孝平"}]},{"creatorNames":[{"creatorName":"堀口,翔太"}]}]},"item_4_source_id_9":{"attribute_name":"書誌レコードID","attribute_value_mlt":[{"subitem_source_identifier":"AN10442647","subitem_source_identifier_type":"NCID"}]},"item_4_textarea_12":{"attribute_name":"Notice","attribute_value_mlt":[{"subitem_textarea_value":"SIG Technical Reports are nonrefereed and hence may later appear in any journals, conferences, symposia, 
etc."}]},"item_resource_type":{"attribute_name":"資源タイプ","attribute_value_mlt":[{"resourceuri":"http://purl.org/coar/resource_type/c_18gh","resourcetype":"technical report"}]},"item_4_source_id_11":{"attribute_name":"ISSN","attribute_value_mlt":[{"subitem_source_identifier":"2188-8663","subitem_source_identifier_type":"ISSN"}]},"item_4_description_7":{"attribute_name":"論文抄録","attribute_value_mlt":[{"subitem_description":"ニューラルオーディオコーデックや自己教師あり学習モデルに基づく離散的な音表現(音トークン)は，その高い圧縮性に加え，テキストトークンと同じ離散空間上で音情報を取り扱うことが出来るため，近年注目を集めている．このような音トークンは，これまで音声・音楽・環境音といったドメインごとに個別に検討されてきたが，ドメイン横断的な特性はまだ十分に明らかになっていない．そこで本稿では，音トークンの振る舞いについて基礎的なクロスドメイン分析を行う．分析結果から，順位-頻度分布およびパープレキシティから推定される統計的特性および確率的な予測可能性は，ドメイン間で概ね一致していた．一方で，音トークンの使用分布はドメイン毎に異なっていた．このような知見は複数ドメインを一体的に処理可能な音声言語モデルの有効性を裏付けるとともに，ドメイン固有のトークン使用パターンをより適切に捉えることで更に性能改善し得ることを示唆している．","subitem_description_type":"Other"}]},"item_4_description_8":{"attribute_name":"論文抄録(英)","attribute_value_mlt":[{"subitem_description":"Techniques for discrete audio representation, which convert an audio signal into a sequence of audio tokens using neural audio codecs or self-supervised speech models, have gained attention for offering the possibility of modeling audio with large language models (LM) efficiently. While these audio tokens have been studied in various domains (e.g., speech, music, and general sound), their encoding properties across domains remain unclear. This paper examines several audio token types to analyze cross-domain variations. Our major findings include that audio tokens exhibit consistent statistical structures and probabilistic predictability deduced from rank-frequency distribution and perplexity, regardless of the domain. However, the token usage pattern is somewhat domain-dependent. 
This result underpins the steady success of the versatile audio LM, while also suggesting that domain-aware LM could further optimize performance by better capturing domain-specific token usage distributions.","subitem_description_type":"Other"}]},"item_4_biblio_info_10":{"attribute_name":"書誌情報","attribute_value_mlt":[{"bibliographicPageEnd":"7","bibliographic_titles":[{"bibliographic_title":"研究報告音声言語情報処理（SLP）"}],"bibliographicPageStart":"1","bibliographicIssueDates":{"bibliographicIssueDate":"2026-02-24","bibliographicIssueDateType":"Issued"},"bibliographicIssueNumber":"37","bibliographicVolumeNumber":"2026-SLP-159"}]},"relation_version_is_last":true,"weko_creator_id":"80578"}}