@techreport{oai:ipsj.ixsq.nii.ac.jp:00222039,
  author        = {Wang, Tianqi and Takagi, Teruhiko and Ito, Tetsuro and Takagi, Masanori},
  title         = {Application and Evaluation of Language Model Based Methods for Test Item Similarity Calculation},
  institution   = {Information Processing Society of Japan},
  number        = {9},
  month         = oct,
  year          = {2022},
  note          = {In education, it is crucial to characterize student learning processes to determine learners' efficiency and success in acquiring new knowledge. Creating a test consisting of different items for a specific target knowledge is necessary to assess and quantify knowledge gain, but doing so can burden educators. In previous research, we proposed a method to calculate similarity from extracted target content to retrieve similar test items from a dataset, thereby helping educators create tests for target knowledge. However, that method ignores semantic features that may be an important clue for similarity. Meanwhile, large-scale language models are now well developed and have recently become proficient in many natural language processing tasks. The performance of large-scale language models in test item similarity tasks remains unexplored. In this paper, we build on previous research to explore the performance of methods based on pretrained language models to calculate the similarity between test items. We apply the methods to the Japanese history question dataset. Experimental results show that pretrained language models help capture semantic similarity between words but do not improve the overall performance as expected.},
  internal-note = {Cleaned OAI export: deduplicated authors and abstract, normalized names to Last-First, issue->number, month macro; institution inferred from the IPSJ repository identifier -- verify against the published report},
}