MeCab ライブラリ

MeCabはC/C++のライブラリを提供しています。また、SWIGを通して Perl/Ruby/Python から利用することも可能です。

シングルスレッド環境で単純な形態素解析を行う場合は、MeCab::Tagger クラスのみでほとんどのことが行えます。マルチスレッド環境で1つの辞書を共有しながら形態素解析を行いたい場合や、 MeCabの辞書を解析中にアップデートするなど、高度な応用には MeCab::Tagger に加えて、 MeCab::Model, MeCab::Lattice クラスを使用します。

C++のAPIセットのドキュメントはこちらを参照ください。 CのAPIセットはこちらを参照ください。

C++ サンプルコード

シングルスレッド環境 (MeCab::Tagger)

Taggerオブジェクトを生成すると、1つの辞書オブジェクトが生成されます。

#include <iostream>
#include <mecab.h>

// CHECK(eval): bail out of the enclosing function when |eval| is false.
//
// Expects a pointer named |tagger| to be in scope. On failure it prints the
// tagger's own error message (or the library-wide error when |tagger| is
// null), deletes |tagger| and returns -1 from the enclosing function.
// |eval| is parenthesized so that compound expressions such as
// CHECK(a || b) are negated as a whole.
#define CHECK(eval) if (!(eval)) { \
   const char *e = tagger ? tagger->what() : MeCab::getTaggerError(); \
   std::cerr << "Exception:" << e << std::endl; \
   delete tagger; \
   return -1; }

// Sample of MeCab::Tagger class.
//
// Parses a fixed sentence with a single Tagger and prints, in order: the
// best result, the 3-best results as one string, the N-best results one at a
// time, every node of the best path, and the dictionary information.
// Returns 0 on success, or -1 (through CHECK) on the first failure.
int main (int argc, char **argv) {
  char input[1024] = "太郎は次郎が持っている本を花子に渡した。";

  MeCab::Tagger *tagger = MeCab::createTagger("");
  CHECK(tagger);

  // Gets tagged result in string format.
  const char *result = tagger->parse(input);
  CHECK(result);
  std::cout << "INPUT: " << input << std::endl;
  std::cout << "RESULT: " << result << std::endl;

  // Gets N best results in string format.
  result = tagger->parseNBest(3, input);
  CHECK(result);
  std::cout << "NBEST: " << std::endl << result;

  // Gets N best results in sequence.
  CHECK(tagger->parseNBestInit(input));
  for (int i = 0; i < 3; ++i) {
    // mecab.h documents next() as returning NULL once no more results are
    // available; streaming a null char pointer is undefined behavior, so
    // stop as soon as the candidates run out.
    const char *candidate = tagger->next();
    if (!candidate) {
      break;
    }
    std::cout << i << ":" << std::endl << candidate;
  }

  // Gets Node object.
  const MeCab::Node* node = tagger->parseToNode(input);
  CHECK(node);
  for (; node; node = node->next) {
    std::cout << node->id << ' ';
    if (node->stat == MECAB_BOS_NODE)
      std::cout << "BOS";
    else if (node->stat == MECAB_EOS_NODE)
      std::cout << "EOS";
    else
      // surface is written with an explicit length rather than as a C string.
      std::cout.write (node->surface, node->length);

    // The two offsets are the begin/end byte positions of the surface
    // relative to the start of |input|.
    std::cout << ' ' << node->feature
              << ' ' << static_cast<int>(node->surface - input)
              << ' ' << static_cast<int>(node->surface - input + node->length)
              << ' ' << node->rcAttr
              << ' ' << node->lcAttr
              << ' ' << node->posid
              << ' ' << static_cast<int>(node->char_type)
              << ' ' << static_cast<int>(node->stat)
              << ' ' << static_cast<int>(node->isbest)
              << ' ' << node->alpha
              << ' ' << node->beta
              << ' ' << node->prob
              << ' ' << node->cost << std::endl;
  }

  // Dictionary info: walk the linked list of dictionaries.
  const MeCab::DictionaryInfo *d = tagger->dictionary_info();
  for (; d; d = d->next) {
    std::cout << "filename: " <<  d->filename << std::endl;
    std::cout << "charset: " <<  d->charset << std::endl;
    std::cout << "size: " <<  d->size << std::endl;
    std::cout << "type: " <<  d->type << std::endl;
    std::cout << "lsize: " <<  d->lsize << std::endl;
    std::cout << "rsize: " <<  d->rsize << std::endl;
    std::cout << "version: " <<  d->version << std::endl;
  }

  delete tagger;

  return 0;
}

C++ サンプルコード

マルチスレッド環境 (MeCab::Tagger, MeCab::Model, MeCab::Lattice)

  • MeCab::createModel() を使い、Modelオブジェクトを生成します
  • model->createTagger() を使い、Taggerオブジェクトを生成します。Taggerはスレッド毎に複数作成しても、同一のmodelを共有します。Taggerがアクティブの間は、modelを削除してはなりません。
  • model->createLattice() もしくは MeCab::createLattice() を使い、Latticeオブジェクトを作成します。Latticeオブジェクトは解析に必要なすべてのローカル変数を含んでいます。必ずスレッド毎に1つのオブジェクトを作成してください。
  • model->swap(another_model) を呼ぶと、model から生成されたすべてのTaggerオブジェクトのモデルを another_modelに置き換えます。この操作はスレッドセーフです。
#include <iostream>
#include <mecab.h>

// CHECK(eval): bail out of the enclosing function when |eval| is false.
//
// Expects a pointer named |tagger| to be in scope. On failure it prints the
// tagger's own error message (or the library-wide error when |tagger| is
// null), deletes |tagger| and returns -1 from the enclosing function.
// Only |tagger| is released here; any other object the caller owns is not.
// |eval| is parenthesized so that compound expressions such as
// CHECK(a || b) are negated as a whole.
#define CHECK(eval) if (!(eval)) { \
   const char *e = tagger ? tagger->what() : MeCab::getTaggerError(); \
   std::cerr << "Exception:" << e << std::endl; \
   delete tagger; \
   return -1; }

int main (int argc, char **argv) {
  char input[1024] = "太郎は次郎が持っている本を花子に渡した。";

  // Create model object.
  MeCab::Model *model = MeCab::createModel(argc, argv);

  // Create Tagger
  // All taggers generated by Model::createTagger() method share
  // the same model/dictionary.
  MeCab::Tagger *tagger = model->createTagger();
  CHECK(tagger);

  // Create lattice object per thread.
  MeCab::Lattice *lattice = model->createLattice();

  // Gets tagged result in string
  lattice->set_sentence(input);

  // this method is thread safe, as long as |lattice| is thread local.
  CHECK(tagger->parse(lattice));
  std::cout << lattice->toString() << std::endl;

  // Gets node object.
  const MeCab::Node* node = lattice->bos_node();
  CHECK(node);
  for (; node; node = node->next) {
    std::cout << node->id << ' ';
    if (node->stat == MECAB_BOS_NODE)
      std::cout << "BOS";
    else if (node->stat == MECAB_EOS_NODE)
      std::cout << "EOS";
    else
      std::cout.write (node->surface, node->length);

    std::cout << ' ' << node->feature
	      << ' ' << (int)(node->surface - input)
	      << ' ' << (int)(node->surface - input + node->length)
	      << ' ' << node->rcAttr
	      << ' ' << node->lcAttr
	      << ' ' << node->posid
	      << ' ' << (int)node->char_type
	      << ' ' << (int)node->stat
	      << ' ' << (int)node->isbest
	      << ' ' << node->alpha
	      << ' ' << node->beta
	      << ' ' << node->prob
	      << ' ' << node->cost << std::endl;
  }


  // begin_nodes/end_nodes
  const size_t len = lattice->size();
  for (int i = 0; i <= len; ++i) {
    MeCab::Node *b = lattice->begin_nodes(i);
    MeCab::Node *e = lattice->end_nodes(i);
    for (; b; b = b->bnext) {
      printf("B[%d] %s\t%s\n", i, b->surface, b->feature);
    }
    for (; e; e = e->enext) {
      printf("E[%d] %s\t%s\n", i, e->surface, e->feature);
    }
  }

  // N best results
  lattice->set_request_type(MECAB_NBEST);
  lattice->set_sentence(input);
  CHECK(tagger->parse(lattice));
  for (int i = 0; i < 10; ++i) {
    std::cout << "NBEST: " << i << std::endl;
    std::cout << lattice->toString();
    if (!lattice->next()) {
      // No more results
      break;
    }
  }

  // Marginal probabilities
  lattice->remove_request_type(MECAB_NBEST);
  lattice->set_request_type(MECAB_MARGINAL_PROB);
  lattice->set_sentence(input);
  CHECK(tagger->parse(lattice));
  std::cout << lattice->theta() << std::endl;
  for (const MeCab::Node *node = lattice->bos_node();
       node; node = node->next) {
    std::cout.write(node->surface, node->length);
    std::cout << "\t" << node->feature;
    std::cout << "\t" << node->prob << std::endl;
  }

  // Dictionary info
  const MeCab::DictionaryInfo *d = model->dictionary_info();
  for (; d; d = d->next) {
    std::cout << "filename: " <<  d->filename << std::endl;
    std::cout << "charset: " <<  d->charset << std::endl;
    std::cout << "size: " <<  d->size << std::endl;
    std::cout << "type: " <<  d->type << std::endl;
    std::cout << "lsize: " <<  d->lsize << std::endl;
    std::cout << "rsize: " <<  d->rsize << std::endl;
    std::cout << "version: " <<  d->version << std::endl;
  }

  // Swap model atomically.
  MeCab::Model *another_model = MeCab::createModel("");
  model->swap(another_model);

  delete lattice;
  delete tagger;
  delete model;

  return 0;
}

C サンプルコード

シングルスレッド環境

#include <mecab.h>
#include <stdio.h>

// CHECK(eval): bail out of the enclosing function when |eval| is false.
//
// Expects a variable named |mecab| to be in scope. On failure it prints the
// message from mecab_strerror(mecab), destroys |mecab| and returns -1 from
// the enclosing function. |eval| is parenthesized so that compound
// expressions such as CHECK(a || b) are negated as a whole.
#define CHECK(eval) if (!(eval)) { \
    fprintf (stderr, "Exception:%s\n", mecab_strerror (mecab)); \
    mecab_destroy(mecab); \
    return -1; }

// Sample of the single-threaded C API.
//
// Creates a tagger from the command-line arguments, then prints the best
// result, the 3-best results as one string, the N-best results one at a
// time, the surface/feature of every normal or unknown node, and the
// dictionary information.
// Returns 0 on success, or -1 (through CHECK) on the first failure.
int main (int argc, char **argv)  {
  char input[] = "太郎は次郎が持っている本を花子に渡した。";
  mecab_t *mecab;
  const mecab_node_t *node;
  const mecab_dictionary_info_t *d;
  const char *result;
  int i;

  // Create tagger object
  mecab = mecab_new(argc, argv);
  CHECK(mecab);

  // Gets tagged result in string.
  result = mecab_sparse_tostr(mecab, input);
  CHECK(result);
  printf ("INPUT: %s\n", input);
  printf ("RESULT:\n%s", result);

  // Gets N best results
  result = mecab_nbest_sparse_tostr (mecab, 3, input);
  CHECK(result);
  fprintf (stdout, "NBEST:\n%s", result);

  CHECK(mecab_nbest_init(mecab, input));
  for (i = 0; i < 3; ++i) {
    // A NULL result means there are no further candidates; passing NULL to
    // "%s" is undefined behavior, so stop instead of printing it.
    result = mecab_nbest_next_tostr (mecab);
    if (!result) {
      break;
    }
    printf ("%d:\n%s", i, result);
  }

  // Gets node object
  node = mecab_sparse_tonode(mecab, input);
  CHECK(node);
  for (; node; node = node->next) {
    // Skip the BOS/EOS sentinels; only real tokens are printed.
    if (node->stat == MECAB_NOR_NODE || node->stat == MECAB_UNK_NODE) {
      fwrite (node->surface, sizeof(char), node->length, stdout);
      printf("\t%s\n", node->feature);
    }
  }

  // Dictionary info
  // size/lsize/rsize are unsigned int in mecab.h, hence "%u".
  d = mecab_dictionary_info(mecab);
  for (; d; d = d->next) {
    printf("filename: %s\n", d->filename);
    printf("charset: %s\n", d->charset);
    printf("size: %u\n", d->size);
    printf("type: %d\n", d->type);
    printf("lsize: %u\n", d->lsize);
    printf("rsize: %u\n", d->rsize);
    printf("version: %d\n", d->version);
  }

  mecab_destroy(mecab);

  return 0;
}

C サンプルコード

マルチスレッド環境

#include <mecab.h>
#include <stdio.h>

// CHECK(eval): bail out of the enclosing function when |eval| is false.
//
// Expects a variable named |mecab| to be in scope; it is read even when the
// failing expression is unrelated to the tagger, so it must hold either a
// valid tagger or NULL at every use. On failure it prints the message from
// mecab_strerror(mecab), destroys |mecab| and returns -1 from the enclosing
// function. Only |mecab| is released here. |eval| is parenthesized so that
// compound expressions such as CHECK(a || b) are negated as a whole.
#define CHECK(eval) if (!(eval)) { \
    fprintf (stderr, "Exception:%s\n", mecab_strerror (mecab)); \
    mecab_destroy(mecab); \
    return -1; }

int main (int argc, char **argv)  {
  char input[] = "太郎は次郎が持っている本を花子に渡した。";
  mecab_model_t *model, *another_model;
  mecab_t *mecab;
  mecab_lattice_t *lattice;
  const mecab_node_t *node;
  const char *result;
  int i;
  size_t len;

  model = mecab_model_new(argc, argv);
  CHECK(model);

  mecab = mecab_model_new_tagger(model);
  CHECK(mecab);

  lattice = mecab_model_new_lattice(model);
  CHECK(lattice);

  mecab_lattice_set_sentence(lattice, input);
  mecab_parse_lattice(mecab, lattice);

  printf("RESULT: %s\n", mecab_lattice_tostr(lattice));

  node = mecab_lattice_get_bos_node(lattice);
  for (;  node; node = node->next) {
    printf("%d ", node->id);

    if (node->stat == MECAB_BOS_NODE)
      printf("BOS");
    else if (node->stat == MECAB_EOS_NODE)
      printf("EOS");
    else
      fwrite (node->surface, sizeof(char), node->length, stdout);

    printf(" %s %d %d %d %d %d %d %d %d %f %f %f %ld\n",
	   node->feature,
	   (int)(node->surface - input),
	   (int)(node->surface - input + node->length),
	   node->rcAttr,
	   node->lcAttr,
	   node->posid,
	   (int)node->char_type,
	   (int)node->stat,
	   (int)node->isbest,
	   node->alpha,
	   node->beta,
	   node->prob,
	   node->cost);
  }

  len = mecab_lattice_get_size(lattice);
  for (i = 0; i <= len; ++i) {
    mecab_node_t *b, *e;
    b = mecab_lattice_get_begin_nodes(lattice, (size_t)i);
    e = mecab_lattice_get_end_nodes(lattice, (size_t)i);
    for (; b; b = b->bnext) {
        printf("B[%d] %s\t%s\n", i, b->surface, b->feature);
    }
    for (; e; e = e->enext) {
        printf("E[%d] %s\t%s\n", i, e->surface, e->feature);
    }
  }

  mecab_lattice_set_sentence(lattice, input);
  mecab_lattice_set_request_type(lattice, MECAB_NBEST);
  mecab_parse_lattice(mecab, lattice);
  for (i = 0; i < 10; ++i) {
    fprintf(stdout, "%s", mecab_lattice_tostr(lattice));
    if (!mecab_lattice_next(lattice)) {
      break;
    }
  }

  mecab_lattice_set_sentence(lattice, input);
  mecab_lattice_set_request_type(lattice, MECAB_MARGINAL_PROB);
  mecab_lattice_set_theta(lattice, 0.001);
  mecab_parse_lattice(mecab, lattice);
  node = mecab_lattice_get_bos_node(lattice);
  for (;  node; node = node->next) {
    fwrite(node->surface, sizeof(char), node->length, stdout);
    fprintf(stdout, "\t%s\t%f\n", node->feature, node->prob);
  }

  mecab_set_lattice_level(mecab, 0);
  mecab_set_all_morphs(mecab, 1);
  node = mecab_sparse_tonode(mecab, input);
  CHECK(node);
  for (; node; node = node->next) {
    fwrite (node->surface, sizeof(char), node->length, stdout);
    printf("\t%s\n", node->feature);
  }

  const mecab_dictionary_info_t *d = mecab_dictionary_info(mecab);
  for (; d; d = d->next) {
    printf("filename: %s\n", d->filename);
    printf("charset: %s\n", d->charset);
    printf("size: %d\n", d->size);
    printf("type: %d\n", d->type);
    printf("lsize: %d\n", d->lsize);
    printf("rsize: %d\n", d->rsize);
    printf("version: %d\n", d->version);
  }


  mecab_destroy(mecab);
  mecab_lattice_destroy(lattice);
  mecab_model_destroy(model);

  return 0;
}

コンパイル方法

UNIX の場合

% cc -O2 `mecab-config --cflags` example.c -o example \
         `mecab-config --libs`

Windows の場合

まず, コンパイル作業を行うディレクトリに include\mecab.h, bin\libmecab.dll, lib\libmecab.lib をコピーします. この後の作業は, 使用するコンパイラによって微妙に変わります.

cygwin/mingw 環境の場合

% gcc -DDLL_IMPORT -I. example.c -o example.exe libmecab.dll

VC++ 環境の場合

% cl -DDLL_IMPORT -I. example.c libmecab.lib