% Technical report on condensing speech-recognition output for real-time
% captioning of lectures (July 2016).
% The citation key is kept exactly as exported so existing citations resolve.
% Fields ending in -en are not known to standard styles and are ignored by
% them; they preserve the romanized author names and the English abstract
% without printing a duplicated author list.
% NOTE(review): techreport requires an institution field, which the exported
% record did not carry -- add it once confirmed against the original source.
% NOTE(review): the value in number came from an exported "issue" field;
% confirm it is the report number intended for display.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00169883,
  author      = {大田, 健翔 and 秋田, 祐哉 and 河原, 達也},
  author-en   = {Ota, Kensho and Akita, Yuya and Kawahara, Tatsuya},
  title       = {講演のリアルタイム字幕付与のための音声認識結果の簡約},
  number      = {12},
  month       = jul,
  year        = {2016},
  abstract    = {本研究では,聴覚障がい者への情報保障のために,講演に対する音声認識を用いたリアルタイムの字幕付与を扱う.話し言葉を音声認識で書き起こす際には,冗長な語句も認識結果として出力されるため文字数が増えて読みにくくなる.そこで本研究では,文意を保存しつつ冗長な語句を削減する簡約処理を検討する.具体的には,講演内容を理解するにあたって必要な単語 (内容語) とそうでない単語 (付属語) に分類し,原則として後者を削除し前者のみを残して字幕として提示する.この原則にあてはまらないものがあるので,内容語で削除するものをアノテーション頻度の比率に基づいて決定し,付属語で復元するものをアノテーション頻度の比率,N-gram による言語尤度比較,機械学習を用いる方法で決定する.講演音声の書き起こしに対して簡約処理を行った結果,正解率 78\%・圧縮率 64\%で文を圧縮することができた.},
  abstract-en = {We have been investigating a real-time captioning framework using automatic speech recognition (ASR) technology for hearing-impaired audience. Since an ASR system transcribes all of speech input, including redundant spoken expressions, resulting captions are very long and thus hard to read and understand. To solve this problem, we propose a “condensation” method, which reduces unnecessary expressions in ASR results as much as possible while retaining key meaning of the utterances. Specifically, each word in ASR results is classified into a content word or a dependent word. Basically, the latter is deleted, while the former is retained for captions. However, there are exceptions in this principle, thus we further introduce refinement process. Redundant content words to be deleted are determined using occurrence counts in annotated training data. On the other hand, for recovery of dependent words, we investigate three methods: occurrence counts in annotated training data, linguistic likelihood measure calculated by an N-gram language model, and a machine learning framework. In an experiment over real lecture transcriptions, word-based compression rate of 64\% and accuracy of 78\% was obtained.},
}