Port Japanese mecab tokenizer from v1 (#2036)

* Port Japanese mecab tokenizer from v1. This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings`, which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests
2025-05-28 09:43:17 +03:00 · 2018-05-04 01:38:26 +09:00 · 2018-05-04 01:38:26 +09:00 · bd72fbf09c
commit bd72fbf09c
parent cc8e804648
4 changed files with 192 additions and 16 deletions
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function

 from ...language import Language
 from ...attrs import LANG
-from ...tokens import Doc
+from ...tokens import Doc, Token
 from ...tokenizer import Tokenizer
+from .tag_map import TAG_MAP

+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
+
+# XXX Is this the right place for this?
+Token.set_extension('mecab_tag', default=None)
+
+def try_mecab_import():
+    """Mecab is required for Japanese support, so check for it.
+
+    If it's not available blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to 
+    resolve ambiguous mappings.
+    """
+
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # PoS mappings.
+
+    if token.pos == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.pos + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.pos + ',PRON'
+        else:
+            return token.pos + ',ADJ'
+    return token.pos
+
+def detailed_tokens(tokenizer, text):
+    """Format Mecab output into a nice data structure, based on Janome."""
+
+    node = tokenizer.parseToNode(text)
+    node = node.next # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface # a default value. Updated if available later.
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
+        words.append( ShortUnitWord(surface, base, pos) )
+        node = node.next
+    return words

 class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome "
-                              "library: https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()

    def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        for token, dtoken in zip(doc, dtokens):
+            token._.mecab_tag = dtoken.pos
+            token.tag_ = resolve_pos(dtoken)
+        return doc

    # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
    # allow serialization (see #1557)
@@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
    def from_disk(self, path, **exclude):
        return self

-
 class JapaneseDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'
+    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)

-
 class Japanese(Language):
    lang = 'ja'
    Defaults = JapaneseDefaults
+    Tokenizer = JapaneseTokenizer

    def make_doc(self, text):
        return self.tokenizer(text)

-
 __all__ = ['Japanese']
--- a/spacy/lang/ja/tag_map.py
+++ b/spacy/lang/ja/tag_map.py
@@ -0,0 +1,88 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import *
+
+TAG_MAP = {
+    # Explanation of Unidic tags:
+    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
+
+    # Universal Dependencies Mapping:
+    # http://universaldependencies.org/ja/overview/morphology.html
+    # http://universaldependencies.org/ja/pos/all.html
+
+    "記号,一般,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドレミ
+    "記号,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as symbols, as in math
+
+    "感動詞,フィラー,*,*": {POS: INTJ},
+    "感動詞,一般,*,*": {POS: INTJ},
+
+    # this is specifically for unicode full-width space
+    "空白,*,*,*": {POS: X}, 
+
+    "形状詞,一般,*,*":{POS: ADJ},
+    "形状詞,タリ,*,*":{POS: ADJ}, 
+    "形状詞,助動詞語幹,*,*":{POS: ADJ}, 
+    "形容詞,一般,*,*":{POS: ADJ},
+    "形容詞,非自立可能,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise
+
+    "助詞,格助詞,*,*":{POS: ADP}, 
+    "助詞,係助詞,*,*":{POS: ADP}, 
+    "助詞,終助詞,*,*":{POS: PART}, 
+    "助詞,準体助詞,*,*":{POS: SCONJ}, # の as in 走るのが速い
+    "助詞,接続助詞,*,*":{POS: SCONJ}, # verb ending て
+    "助詞,副助詞,*,*":{POS: PART},  # ばかり, つつ after a verb
+    "助動詞,*,*,*":{POS: AUX},
+    "接続詞,*,*,*":{POS: SCONJ}, # XXX: might need refinement
+
+    "接頭辞,*,*,*":{POS: NOUN}, 
+    "接尾辞,形状詞的,*,*":{POS: ADJ}, # がち, チック 
+    "接尾辞,形容詞的,*,*":{POS: ADJ}, # -らしい
+    "接尾辞,動詞的,*,*":{POS: NOUN},  # -じみ
+    "接尾辞,名詞的,サ変可能,*":{POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,*
+    "接尾辞,名詞的,一般,*":{POS: NOUN},
+    "接尾辞,名詞的,助数詞,*":{POS: NOUN}, 
+    "接尾辞,名詞的,副詞可能,*":{POS: NOUN}, # -後, -過ぎ
+
+    "代名詞,*,*,*":{POS: PRON},
+    "動詞,一般,*,*":{POS: VERB},
+    "動詞,非自立可能,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise
+    "動詞,非自立可能,*,*,AUX":{POS: AUX},
+    "動詞,非自立可能,*,*,VERB":{POS: VERB},
+    "副詞,*,*,*":{POS: ADV},
+
+    "補助記号,ＡＡ,一般,*":{POS: SYM}, # text art
+    "補助記号,ＡＡ,顔文字,*":{POS: SYM}, # kaomoji
+    "補助記号,一般,*,*":{POS: SYM}, 
+    "補助記号,括弧開,*,*":{POS: PUNCT}, # open bracket
+    "補助記号,括弧閉,*,*":{POS: PUNCT}, # close bracket
+    "補助記号,句点,*,*":{POS: PUNCT}, # period or other EOS marker
+    "補助記号,読点,*,*":{POS: PUNCT}, # comma
+
+    "名詞,固有名詞,一般,*":{POS: PROPN}, # general proper noun
+    "名詞,固有名詞,人名,一般":{POS: PROPN}, # person's name
+    "名詞,固有名詞,人名,姓":{POS: PROPN}, # surname
+    "名詞,固有名詞,人名,名":{POS: PROPN}, # first name
+    "名詞,固有名詞,地名,一般":{POS: PROPN}, # place name
+    "名詞,固有名詞,地名,国":{POS: PROPN}, # country name
+
+    "名詞,助動詞語幹,*,*":{POS: AUX}, 
+    "名詞,数詞,*,*":{POS: NUM}, # includes Chinese numerals
+
+    "名詞,普通名詞,サ変可能,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞,普通名詞,サ変可能,*,NOUN":{POS: NOUN}, 
+    "名詞,普通名詞,サ変可能,*,VERB":{POS: VERB}, 
+
+    "名詞,普通名詞,サ変形状詞可能,*":{POS: NOUN}, # ex: 下手
+    "名詞,普通名詞,一般,*":{POS: NOUN}, 
+    "名詞,普通名詞,形状詞可能,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2
+    "名詞,普通名詞,形状詞可能,*,NOUN":{POS: NOUN}, 
+    "名詞,普通名詞,形状詞可能,*,ADJ":{POS: ADJ}, 
+    "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit
+    "名詞,普通名詞,副詞可能,*":{POS: NOUN},
+
+    "連体詞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token
+    "連体詞,*,*,*,ADJ":{POS: ADJ}, 
+    "連体詞,*,*,*,PRON":{POS: PRON}, 
+    "連体詞,*,*,*,DET":{POS: DET}, 
+}
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -135,10 +135,9 @@ def da_tokenizer():

@pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    janome = pytest.importorskip("MeCab")
    return util.get_lang_class('ja').Defaults.create_tokenizer()

-
@pytest.fixture
 def th_tokenizer():
    pythainlp = pytest.importorskip("pythainlp")
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -5,15 +5,41 @@ import pytest


 TOKENIZER_TESTS = [
-        ("日本語だよ", ['日本語', 'だ', 'よ']),
+        ("日本語だよ", ['日本', '語', 'だ', 'よ']),
        ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
        ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-        ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+        ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
        ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]

+TAG_TESTS = [
+        ("日本語だよ", ['日本語だよ', '名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
+        ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
+        ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
+        ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点 ']),
+        ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
+]
+
+POS_TESTS = [
+        ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+        ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
+        ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
+        ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
+        ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
+]
+

@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
 def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens
+
+@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
+def test_japanese_tokenizer(ja_tokenizer, text, expected_tags):
+    tags = [token.tag_ for token in ja_tokenizer(text)]
+    assert tags == expected_tags
+
+@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
+def test_japanese_tokenizer(ja_tokenizer, text, expected_pos):
+    pos = [token.pos_ for token in ja_tokenizer(text)]
+    assert pos == expected_pos