% Technical report on predicting PRDM9 binding sites with a CNN (IPSJ repository export, cleaned up).
% Changes relative to the raw export:
%   - each author is listed once (the export repeated the whole author list);
%   - the abstract is stored once, in the abstract field, instead of twice in note,
%     so styles no longer print it inside the formatted reference;
%   - month uses the standard nov macro; issue is stored as number.
% NOTE(review): names are written as "Family, Given" on the assumption that Nakamura, Endo
%   and Osada are the family names (the export had the two parts swapped) -- confirm
%   against the repository record.
% NOTE(review): institution is inferred from the OAI host in the citation key
%   (ipsj.ixsq.nii.ac.jp) -- confirm the exact series/institution name.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00214037,
  author      = {Nakamura, Takahiro and Endo, Toshinori and Osada, Naoki},
  title       = {Predicting {PRDM9} binding sites by a convolutional neural network and verification using genetic recombination map},
  institution = {Information Processing Society of Japan},
  number      = {1},
  month       = nov,
  year        = {2021},
  abstract    = {PR domain-containing 9 (PRDM9) is a zinc-finger protein that binds to specific DNA motifs and induces the crossing-over between chromosomes, resulting in a high recombination rate around binding sites. In this study, we developed a strategy to evaluate the prediction accuracy of PRDM9 binding site by examining the correlation with local recombination rate to avoid the effect of overfitting to one type of data. We compared the methods using position-specific weight matrix (PWM), which has been commonly used in previous studies, and convolutional network (CNN), which has recently performed well. Approximately 170,000 genomic DNA fragments of humans (301 bp each) containing the Chromatin Immuno-Precipitation with high-throughput sequencing (ChIP-seq) peak of PRDM9 of B-allele in the HEK293T cell line were used for constructing PWM and positive data to train CNN. We found that CNN outperformed PWM in terms of area under the curve, and the prediction scores of CNN correlated more strongly with the local recombination rate than PWM. We also investigated the potential PRDM9 binding sites missed by the ChIP-seq experiments but labeled as positive in CNN and discuss the reason for the difference in performances.},
}