@comment{
  Conference paper in the Computer Security Symposium 2023 proceedings.
  - author:    the six authors, Japanese script, "Family, Given" form.
  - author_en: romanized names of the same six authors; this is a non-standard
               field that standard styles ignore, kept for reference only.
  - abstract:  Japanese abstract followed by the English abstract, kept
               verbatim from the original record (not printed by standard styles).
}
@inproceedings{oai:ipsj.ixsq.nii.ac.jp:00228725,
  author    = {芳村, 涼介 and 外谷, 渉 and 赤木, 雅弥 and 岡本, 健 and 堀江, 則之 and 小笠原, 恒雄},
  author_en = {Yoshimura, Ryosuke and Sotoya, Sho and Akagi, Masaya and Okamoto, Takeshi and Horie, Noriyuki and Ogasawara, Tsuneo},
  title     = {{LightGBM}を用いた公開リポジトリのエクスプロイト判定手法},
  booktitle = {コンピュータセキュリティシンポジウム2023論文集},
  pages     = {821--827},
  publisher = {情報処理学会},
  month     = oct,
  year      = {2023},
  abstract  = {ソフトウェア製品の脆弱性対策では，公開リポジトリの中に該当のエクスプロイトコードがあるか確認することが重要である．この調査により，どの脆弱性を先に対応するべきかといった知見が得られる．我々はこれまでGitHubのリポジトリに存在するエクスプロイトコードについて，CVE識別番号に着目して実態を調査してきた．この取り組みにより，セキュリティパッチや偽のエクスプロイトコードといった誤検知を誘発するリポジトリが多く検出された．判定の精度をあげるためには，人為的に調査するという方法が考えられるものの，日々増加するリポジトリの調査をすることは現実的ではない．そこで本研究では，機械学習を用いてGitHubからエクスプロイトコードを検出する手法を提案する．提案手法では最初にGitHubのリポジトリを対象とし，ファイル形式の解析と構文解析，外部アーカイブの検索を行った．その後，各種解析結果のデータ列に対しLightGBMを用いて，対象リポジトリがエクスプロイトコードであるか否かを判定した．また，本提案手法の有効性を検証するため，我々は実証実験を試みた．実験ではGitHubからランダムにリポジトリを選定し，エクスプロイトコードか否かを判定したところ，F値0.89という結果を得た．これらの手法により，エクスプロイトコードを高精度で識別することができ，20％を超えるノイズを排除することができた．, In order to clarify countermeasures priority of any vulnerability in software products, it is necessary to assess whether the vulnerability related exploit code has actually existed in the related public accessible repositories in the internet. We have been assessing and examining recent actual state of related vulnerability's exploit code that exists in the GitHub repository by cross-referencing on their CVE identification number, yet the effort found many repositories that triggered false positives in its detection, such as triggered by similar code written in related vulnerability's security patches or in bogus exploit code. Focusing on improving the accuracy of the detection, the method of artificially performing assessment on exploit codes shared in those repositories is conceivable, yet it is not realistic to investigate all of them by this method considering the vast increasing number of released repositories every day. In this paper, we propose a method to detect exploit codes from GitHub using machine learning to detect exploit code from GitHub repositories. The method is conducting various techniques, from performing file format analysis, syntax parsing, and searches of external archives. The parsed data to then is filtered with LightGBM to determine whether the target repository contains targeted exploit code or not.
In addition, we tried to demonstrate an experiment to verify the effectiveness of proposed method where repositories were randomly selected from GitHub repositories entries, and their contents were assessed and tested for its presence of desired exploit codes. In machine learning model the precision factor is known as F-score (also known as the F1 score or F-measure), in our case, after making efforts in tuning and testing, the final result of F-measure value is 0.89.},
}