% Conference paper in the proceedings "じんもんこん2020論文集" (publisher: 情報処理学会).
% The citation key is the repository OAI identifier and is kept unchanged so
% that existing \cite commands continue to resolve.
% `author-en` and `abstract` are not printed by standard styles; they hold the
% romanized author names and the Japanese + English abstracts that the export
% had fused into `author` and `note` respectively.
@inproceedings{oai:ipsj.ixsq.nii.ac.jp:00208705,
  author    = {浅原, 正幸 and 加藤, 祥},
  author-en = {Asahara, Masayuki and Kato, Sachi},
  title     = {『日本語歴史コーパス』の文脈化単語埋め込みに基づく意味空間},
  booktitle = {じんもんこん2020論文集},
  publisher = {情報処理学会},
  volume    = {2020},
  pages     = {241--246},
  year      = {2020},
  month     = dec,
  abstract  = {内省が効かない古典語について研究を進めるにあたり,統語・語義的に類似用例を提示する技術が求められている.近年,自然言語処理の分野で単語埋め込みの研究が盛んになり,単語の出現毎に異なるベクトルを付与することにより統語・語義的類似度を計量する「文脈化単語埋め込み」の技術が確立した.本研究では220億語規模の現代語の『国語研日本語ウェブコーパス』の事前学習モデルを語彙素に基づき構築し,共通の語彙素が付与されている『日本語歴史コーパス』に文脈化単語埋め込みを付与した.本稿では,文脈化単語埋め込みに基づく意味空間により,古典語に対してどのような研究ができるかについて検討する.

    Because introspection is not effective for the analysis of ancient languages, a technique to syntactically and semantically present the word similarities is required. Recently, researches on word embeddings have been conducted in the field of natural language processing, and the technique of “contextual word embeddings” has been established to assign a different word vector for each word token. The contextual word embeddings enable us to calculate the cosine between two word (or sentence) tokens that define syntactic and semantic similarities. We developed a pre-training model of BERT based on lexemes from the 22 billion token “NINJAL Web Japanese Corpus” and assigned contextual word vectors on the “Corpus of Historical Japanese” using common lexeme standards. This study explored the effect of contextual word embeddings on historical linguistic studies.},
}