Node.jsを使って形態素解析をする

Contents

はじめに
kuromoji.jsとは
環境
kuromoji.jsを追加と実行
終わりに

はじめに

以前、「PythonでWebページからテキストデータを抽出し、形態素解析器を使って形態素に分けてみた」という記事を書きましたが、Pythonではなく、JavaScript(Node.js)でも同様のことができないかと、ライブラリを探していました。その結果、kuromoji.jsが使えそうだったので、試しに使ってみました。

kuromoji.jsとは

kuromoji.jsは、もともとJavaで実装された形態素解析器Kuromojiを、@takuya_aさんがJavaScript向けに移植したもののようです。

早速ですが、動かしてみたいと思います。

環境

実験した環境は以下の通りです。

$ node -v
v10.22.0

$ yarn -v
1.22.4

kuromoji.jsのバージョンは、package.jsonの中身で確認します。

{


・・・（省略）
  "dependencies": {
    "kuromoji": "^0.1.2"
  }
}

kuromoji.jsを追加と実行

kuromojiを追加します。npmではinstallですが、yarnではaddを使います。

$ yarn add kuromoji

次にコードを書きます。sample.jsという名前で書きました。

// sample.js: tokenize one Japanese sentence with kuromoji.js and print the tokens.
const kuromoji = require("kuromoji");

// Build a tokenizer from the dictionary files shipped inside the package.
// NOTE: dicPath is a relative path, so run this script from the project root
// (the directory that contains node_modules).
kuromoji.builder({
    dicPath: "node_modules/kuromoji/dict"
}).build((err, tokenizer) => {
    // If the dictionary could not be loaded, report the real error instead of
    // failing later with a TypeError on an undefined tokenizer.
    if (err) {
        console.error(err);
        process.exitCode = 1;
        return;
    }

    // tokenize() returns an array of token objects (surface_form, pos, reading, ...).
    const path = tokenizer.tokenize("隣の客はよく柿食う客だ。");
    console.log(path);
});

書き終わったら、保存して、nodeで実行します。

$ node sample.js
[ { word_id: 124720,     
    word_type: 'KNOWN',  
    word_position: 1,    
    surface_form: '隣',  
    pos: '名詞',
    pos_detail_1: '一般',
    pos_detail_2: '*',   
    pos_detail_3: '*',   
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: '隣',
    reading: 'トナリ',
    pronunciation: 'トナリ' },
  { word_id: 93100,
    word_type: 'KNOWN',
    word_position: 2,
    surface_form: 'の',
    pos: '助詞',
    pos_detail_1: '連体化',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: 'の',
    reading: 'ノ',
    pronunciation: 'ノ' },
  { word_id: 1296480,
    word_type: 'KNOWN',
    word_position: 3,
    surface_form: '客',
    pos: '名詞',
    pos_detail_1: '一般',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: '客',
    reading: 'キャク',
    pronunciation: 'キャク' },
  { word_id: 93010,
    word_type: 'KNOWN',
    word_position: 4,
    surface_form: 'は',
    pos: '助詞',
    pos_detail_1: '係助詞',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: 'は',
    reading: 'ハ',
    pronunciation: 'ワ' },
  { word_id: 105260,
    word_type: 'KNOWN',
    word_position: 5,
    surface_form: 'よく',
    pos: '副詞',
    pos_detail_1: '一般',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: 'よく',
    reading: 'ヨク',
    pronunciation: 'ヨク' },
  { word_id: 868270,
    word_type: 'KNOWN',
    word_position: 7,
    surface_form: '柿',
    pos: '名詞',
    pos_detail_1: '一般',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: '柿',
    reading: 'カキ',
    pronunciation: 'カキ' },
  { word_id: 2916190,
    word_type: 'KNOWN',
    word_position: 8,
    surface_form: '食う',
    pos: '動詞',
    pos_detail_1: '自立',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '五段・ワ行促音便',
    conjugated_form: '基本形',
    basic_form: '食う',
    reading: 'クウ',
    pronunciation: 'クウ' },
  { word_id: 1296480,
    word_type: 'KNOWN',
    word_position: 10,
    surface_form: '客',
    pos: '名詞',
    pos_detail_1: '一般',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: '客',
    reading: 'キャク',
    pronunciation: 'キャク' },
  { word_id: 23680,
    word_type: 'KNOWN',
    word_position: 11,
    surface_form: 'だ',
    pos: '助動詞',
    pos_detail_1: '*',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '特殊・ダ',
    conjugated_form: '基本形',
    basic_form: 'だ',
    reading: 'ダ',
    pronunciation: 'ダ' },
  { word_id: 90940,
    word_type: 'KNOWN',
    word_position: 12,
    surface_form: '。',
    pos: '記号',
    pos_detail_1: '句点',
    pos_detail_2: '*',
    pos_detail_3: '*',
    conjugated_type: '*',
    conjugated_form: '*',
    basic_form: '。',
    reading: '。',
    pronunciation: '。' } ]