Port Japanese mecab tokenizer from v1 (#2036)

* Port Japanese mecab tokenizer from v1

This brings the MeCab-based Japanese tokenization introduced in #1246 to
spaCy v2. There isn't a JapaneseTagger implementation yet, but the POS tag
information from MeCab is stored in a token extension. A tag map is also
included.
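
For reference, here's roughly how this is meant to be used once MeCab and a
UniDic-style dictionary are installed (a minimal usage sketch, not part of the
diff; the example sentence is just for illustration):

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("吾輩は猫である。")
    for token in doc:
        # token._.mecab_tag holds the raw Unidic POS string from MeCab,
        # token.tag_ the (possibly disambiguated) tag, and token.pos_ the
        # Universal Dependencies POS mapped through the included tag map.
        print(token.text, token._.mecab_tag, token.tag_, token.pos_)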

As a reminder, MeCab is required because the Universal Dependencies Japanese
data is based on Unidic tags, and Janome doesn't support Unidic.

Things to check:

1. Is this the right way to use a token extension?

2. What's the right way to implement a JapaneseTagger? The approach in
#1246 relied on `tag_from_strings`, which has since been removed. I guess the
best thing is to just try training spaCy's default Tagger? (Rough sketch below.)
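
To make that concrete, here's a rough sketch of what training the stock v2
Tagger on top of this tokenizer might look like. It isn't part of this PR; the
single training sentence and its Unidic tags are invented for illustration, and
a real run would obviously need real data:

    import random
    from spacy.lang.ja import Japanese

    # One toy example; the tags are keys of the tag map added in this PR and
    # line up with the tokenizer's output for this sentence (6 tokens).
    TRAIN_DATA = [
        ("吾輩は猫である。",
         {"tags": ["代名詞,*,*,*", "助詞,係助詞,*,*", "名詞,普通名詞,一般,*",
                   "助動詞,*,*,*", "動詞,非自立可能,*,*", "補助記号,句点,*,*"]}),
    ]

    nlp = Japanese()
    tagger = nlp.create_pipe('tagger')  # label set comes from the tag map
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)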

-POLM

* Add tagging/make_doc and tests
Paul O'Leary McCann 2018-05-04 01:38:26 +09:00 committed by Matthew Honnibal
parent cc8e804648
commit bd72fbf09c
4 changed files with 192 additions and 16 deletions

spacy/lang/ja/__init__.py

@@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function
 from ...language import Language
 from ...attrs import LANG
-from ...tokens import Doc
+from ...tokens import Doc, Token
 from ...tokenizer import Tokenizer
+from .tag_map import TAG_MAP
+
+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
+
+# XXX Is this the right place for this?
+Token.set_extension('mecab_tag', default=None)
+
+
+def try_mecab_import():
+    """Mecab is required for Japanese support, so check for it.
+    If it's not available, blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to
+    resolve ambiguous mappings.
+    """
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # PoS mappings.
+    if token.pos == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.pos + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.pos + ',PRON'
+        else:
+            return token.pos + ',ADJ'
+    return token.pos
+
+
+def detailed_tokens(tokenizer, text):
+    """Format Mecab output into a nice data structure, based on Janome."""
+    node = tokenizer.parseToNode(text)
+    node = node.next  # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface  # a default value. Updated if available later.
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+        words.append(ShortUnitWord(surface, base, pos))
+        node = node.next
+    return words
+
+
 class JapaneseTokenizer(object):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome "
-                              "library: https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()

     def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        for token, dtoken in zip(doc, dtokens):
+            token._.mecab_tag = dtoken.pos
+            token.tag_ = resolve_pos(dtoken)
+        return doc

     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
@@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
     def from_disk(self, path, **exclude):
         return self


 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'ja'
+    tag_map = TAG_MAP

     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)


 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults
-    Tokenizer = JapaneseTokenizer

     def make_doc(self, text):
         return self.tokenizer(text)


 __all__ = ['Japanese']

spacy/lang/ja/tag_map.py (new file, 88 lines)

@@ -0,0 +1,88 @@
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import *

TAG_MAP = {
    # Explanation of Unidic tags:
    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
    # Universal Dependencies Mapping:
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html
    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math
    "感動詞,フィラー,*,*": {POS: INTJ},
    "感動詞,一般,*,*": {POS: INTJ},
    # this is specifically for unicode full-width space
    "空白,*,*,*": {POS: X},
    "形状詞,一般,*,*": {POS: ADJ},
    "形状詞,タリ,*,*": {POS: ADJ},
    "形状詞,助動詞語幹,*,*": {POS: ADJ},
    "形容詞,一般,*,*": {POS: ADJ},
    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise
    "助詞,格助詞,*,*": {POS: ADP},
    "助詞,係助詞,*,*": {POS: ADP},
    "助詞,終助詞,*,*": {POS: PART},
    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
    "助動詞,*,*,*": {POS: AUX},
    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement
    "接頭辞,*,*,*": {POS: NOUN},
    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
    "接尾辞,名詞的,一般,*": {POS: NOUN},
    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ
    "代名詞,*,*,*": {POS: PRON},
    "動詞,一般,*,*": {POS: VERB},
    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
    "動詞,非自立可能,*,*,AUX": {POS: AUX},
    "動詞,非自立可能,*,*,VERB": {POS: VERB},
    "副詞,*,*,*": {POS: ADV},
    "補助記号,ＡＡ,一般,*": {POS: SYM},  # text art
    "補助記号,ＡＡ,顔文字,*": {POS: SYM},  # kaomoji
    "補助記号,一般,*,*": {POS: SYM},
    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
    "補助記号,読点,*,*": {POS: PUNCT},  # comma
    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name
    "名詞,助動詞語幹,*,*": {POS: AUX},
    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals
    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},
    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
    "名詞,普通名詞,一般,*": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
    "名詞,普通名詞,副詞可能,*": {POS: NOUN},
    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
    "連体詞,*,*,*,ADJ": {POS: ADJ},
    "連体詞,*,*,*,PRON": {POS: PRON},
    "連体詞,*,*,*,DET": {POS: DET},
}

spacy/tests/conftest.py

@@ -135,10 +135,9 @@ def da_tokenizer():
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    janome = pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()


 @pytest.fixture
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")

spacy/tests/lang/ja/test_tokenizer.py

@@ -5,15 +5,41 @@ import pytest
 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]

+TAG_TESTS = [
+    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
+    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
+    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
+    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
+    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
+]
+
+POS_TESTS = [
+    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
+    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
+    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
+    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
+]
+

 @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
 def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
     tokens = [token.text for token in ja_tokenizer(text)]
     assert tokens == expected_tokens
+
+
+@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
+def test_japanese_tags(ja_tokenizer, text, expected_tags):
+    tags = [token.tag_ for token in ja_tokenizer(text)]
+    assert tags == expected_tags
+
+
+@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
+def test_japanese_pos(ja_tokenizer, text, expected_pos):
+    pos = [token.pos_ for token in ja_tokenizer(text)]
+    assert pos == expected_pos