% Technical report record; the bilingual (Japanese, then English) abstract is kept
% in `abstract` so that bibliography styles do not print it as a note.
% NOTE(review): @techreport requires an `institution` field, which this record lacks.
% Presumably the Information Processing Society of Japan (IPSJ), judging by the
% repository host in the citation key -- confirm against the source before adding.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00239002,
  author   = {黒川, 博生 and 宮森, 恒},
  title    = {大規模言語モデルを用いた文書補強とリランキングによる統計データ検索},
  number   = {13},
  month    = sep,
  year     = {2024},
  abstract = {統計データは，政府等が保有するオープンデータの一種であり，近年，社会問題となっているフェイクニュースに対処するための事実確認（ファクトチェック）への活用をはじめ，有効活用するためのアドホック検索基盤の重要性が高まっている．しかし，従来の統計データ検索では，文書長の短さや表記ゆれなどによる検索漏れ等により，十分なランキング性能を達成できていない．そこで，本稿では大規模言語モデルを用いた文書補強とリランキングによる統計データのアドホック検索手法を提案する．提案手法では，まず統計データから抽出された見出し，行名，列名，値に基づき，その内容説明を大規模言語モデルで生成することでメタデータを補強した文書を作成する．次に，補強された文書を利用してランキングを行い，最後に大規模言語モデルを用いて意味内容の類似に基づくリランキングを行う．評価実験の結果，提案手法は，文書補強およびリランキングを行わないベースラインと比較して, ランキング性能が nDCG@10 で 0.133 改善することを確認した．
    Statistical data is a kind of open data held by governments and others, and its use for fact-checking to address the social problem such as fake news. Therefore, the importance of ad-hoc search infrastructure for effective use of data is increasing. However, traditional statistical data search systems have not achieved sufficient ranking performance due to omission in a search caused by short document length and spelling variations. In this paper, we propose a statistical data ad-hoc search method using document augmentation and re-ranking with large language model. First, we create a document with augmented metadata by generating descriptions of the contents on the basis of extracted headers, row names, column names, and values from the statistical data using a large language model. Next, we rank the augmented documents and finally re-rank them based on semantic similarity using a large language model. As a result of the evaluation experiment, we confirmed that the proposed method improves the ranking performance by 0.133 in nDCG@10 compared to the baseline that does not perform document augmentation and re-ranking.},
}