% Conference paper by Aoike and Kinoshita in the 2025 proceedings published by
% IPSJ, on structuring existing OCR text data with a lightweight layout
% recognition model.
% Cleanup notes:
%   - the author list holds each person once (Japanese script); the romanized
%     spellings are kept in the ignored field author-en for reference;
%   - the abstract lives in the abstract field (a single copy) so that styles
%     do not print it in the bibliography the way they print note;
%   - the proceedings title is in booktitle, the field inproceedings requires.
@inproceedings{oai:ipsj.ixsq.nii.ac.jp:02006234,
  author    = {青池, 亨 and 木下, 貴文},
  author-en = {Aoike, Toru and Kinoshita, Takafumi},
  title     = {軽量なレイアウト認識モデルを活用した 大規模な{OCR}テキストデータの構造化及び成果物の分析},
  booktitle = {じんもんこん2025論文集},
  volume    = {2025},
  pages     = {431--436},
  publisher = {情報処理学会},
  year      = {2025},
  month     = dec,
  abstract  = {The National Diet Library (NDL) has made a considerable effort
               both to use optical character recognition (OCR) in converting
               its collection into digital text and to develop OCR technology.
               Given the sheer volume of the materials that must be handled,
               even as more advanced methods that yielded more sophisticated
               results became available, the difficulty of performing
               large-scale reprocessing on materials that had already
               undergone OCR processing was a significant challenge. The
               results of this study, which was made using materials for which
               copyright protection had expired, show that the usability of
               large volumes of existing OCR text data can be improved in a
               fast and resource-efficient manner by applying post-processing
               with a lightweight layout recognition model. In addition, the
               results of this study were applied in the development of an
               experimental feature that has been added to the Next Digital
               Library in the form of a text mode that displays only the
               structured text data.},
}