% Journal entry for the Centaurus parallel-parser generator presentation.
% Author names use "Family, Given" order so that BibTeX sorts and labels by family name.
% The abstract is kept in the `abstract` field (ignored by standard styles) rather than
% `note`, which styles print verbatim in the bibliography.
@article{oai:ipsj.ixsq.nii.ac.jp:00202966,
  author   = {Sato, Shigeyuki and Ihara, Hiroka and Taura, Kenjiro},
  title    = {{Centaurus}: A Just-in-time Parallel-parser Generator for Ad Hoc Data Processing},
  journal  = {情報処理学会論文誌プログラミング(PRO)},
  volume   = {13},
  number   = {1},
  pages    = {18},
  month    = jan,
  year     = {2020},
  abstract = {It is important to handle data in text formats such as XML, JSON, and CSV because these data very often appear in the context of data exchange. Only parts of these data are typically used afterwards so that it is not worth ingesting the whole of them into databases. It is therefore desired to match and extract the concerned part in a lightweight ad hoc manner. Classically used for such a purpose are linewise regular expression tools such as grep, sed, and awk. These are, however, not powerful enough for text formats commonly used for data exchange because they cannot recognize nested structures in general. To support a lightweight ad hoc data processing, we present Centaurus, a just-in-time parallel-parser generator library. By generating native scannerless LL(*) parsers dynamically, our library enables us to process input data in parallel merely by calling Python functions with LL(*) grammars and Python actions. This presentation gives the design and implementation of Centaurus and reports its experimental performance on data filtering.},
}