% Technical report "Webコーパスの提案" (2003), two authors.
% The author field holds the names as written in Japanese; the romanized
% forms of the same two people are kept in the ignored field author-en so
% that styles do not print four authors. The abstract field holds the
% Japanese and English abstracts exactly as exported (joined by ", ").
% NOTE(review): techreport entries require an institution field, which this
% record lacks; presumably the Information Processing Society of Japan
% (IPSJ), judging by the citation key -- confirm and add.
% NOTE(review): number is kept exactly as exported; it looks like issue 98
% combined with report id 2003-NL-157 -- confirm the preferred form.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00048268,
  author    = {関口, 洋一 and 山本, 和英},
  author-en = {Sekiguchi, Youichi and Yamamoto, Kazuhide},
  title     = {Webコーパスの提案},
  number    = {98(2003-NL-157)},
  month     = sep,
  year      = {2003},
  abstract  = {Webをコーパスの情報源としたWebコーパスの構築手法を提案する．一般的に用いられている新聞コーパスの量やそれに伴う用例の少なさは否めない．そこで，我々はWebに着目した．Webを用いることで量的な問題を解決できるが，そのまま用いたのでは表現そのものや，文の構造に問題がある．そこでコーパスを質の面から検討を行う．質改善の手法として，HTMLタグや日本語文章の書法を用いて改善を試みる外面的質の考慮を挙げる．さらに記号を多用した文や話しことばの崩れた文を削除し，文字種の割合を示す字面比を用いて文を削除する等の内面的質を考慮する手法を提案する．構築したWebコーパスに対して2種類の実験を行った．1つめは，異なり単語数やシソーラスを用いて単語の特徴を観察した．2つめは，有用性を調査するため，格フレームを用いて調査を行った．その結果，異なり単語数，格フレーム数ともに新聞や未処理のWebテキストを上回るコーパスを構築できた．, We present a method for construction of a Web corpus.  There is a quantity issue in a newspaper corpus as we use it as a text corpus for natural language processing.  We use a collection of Web pages so that we can solve lack of resource amount.  However, some of the Web texts have a low quality.  We then propose some methods to reduce some of these texts out of the Web corpus.  The methods include sentence determination using a part of HTML tags, and filtering out-of-range sentences by proportions of each character type.  We have confirmed that our Web corpus outperformed a newspaper corpus, in terms of number of words and case frames.  We also show that our Web corpus is also superior to unprocessed Web texts.},
}