From bd72fbf09c575a7693a7f00c00a6e023e8bf96b4 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 4 May 2018 01:38:26 +0900
Subject: [PATCH] Port Japanese mecab tokenizer from v1 (#2036)

* Port Japanese mecab tokenizer from v1

This brings the Mecab-based Japanese tokenization introduced in #1246 to
spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag
information from Mecab is stored in a token extension. A tag map is also
included.

As a reminder, Mecab is required because Universal Dependencies are based
on Unidic tags, and Janome doesn't support Unidic.

Things to check:

1. Is this the right way to use a token extension?
2. What's the right way to implement a JapaneseTagger? The approach in
   #1246 relied on `tag_from_strings` which is just gone now. I guess the
   best thing is to just try training spaCy's default Tagger?

-POLM

* Add tagging/make_doc and tests
---
 spacy/lang/ja/__init__.py             | 87 ++++++++++++++++++++++----
 spacy/lang/ja/tag_map.py              | 88 +++++++++++++++++++++++++++
 spacy/tests/conftest.py               |  3 +-
 spacy/tests/lang/ja/test_tokenizer.py | 30 ++++++++-
 4 files changed, 192 insertions(+), 16 deletions(-)
 create mode 100644 spacy/lang/ja/tag_map.py

diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py
index 3b67c5489..1286469de 100644
--- a/spacy/lang/ja/__init__.py
+++ b/spacy/lang/ja/__init__.py
@@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function
 
 from ...language import Language
 from ...attrs import LANG
-from ...tokens import Doc
+from ...tokens import Doc, Token
 from ...tokenizer import Tokenizer
+from .tag_map import TAG_MAP
+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
+
+# XXX Is this the right place for this?
+Token.set_extension('mecab_tag', default=None)
+
+def try_mecab_import():
+    """Mecab is required for Japanese support, so check for it.
+
+    If it's not available, blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to
+    resolve ambiguous mappings.
+    """
+
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # POS mappings.
+
+    if token.pos == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.pos + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.pos + ',PRON'
+        else:
+            return token.pos + ',ADJ'
+    return token.pos
+
+def detailed_tokens(tokenizer, text):
+    """Format Mecab output into a nice data structure, based on Janome."""
+
+    node = tokenizer.parseToNode(text)
+    node = node.next # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface # a default value. Updated if available later.
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
+        words.append(ShortUnitWord(surface, base, pos))
+        node = node.next
+    return words
 
 
 class JapaneseTokenizer(object):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome "
-                              "library: https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()
 
     def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        for token, dtoken in zip(doc, dtokens):
+            token._.mecab_tag = dtoken.pos
+            token.tag_ = resolve_pos(dtoken)
+        return doc
 
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
@@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
     def from_disk(self, path, **exclude):
         return self
 
-
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'ja'
+    tag_map = TAG_MAP
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)
 
-
 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults
+    Tokenizer = JapaneseTokenizer
 
     def make_doc(self, text):
         return self.tokenizer(text)
 
-
 __all__ = ['Japanese']
diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py
new file mode 100644
index 000000000..0191df88f
--- /dev/null
+++ b/spacy/lang/ja/tag_map.py
@@ -0,0 +1,88 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import *
+
+TAG_MAP = {
+    # Explanation of Unidic tags:
+    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
+
+    # Universal Dependencies Mapping:
+    # http://universaldependencies.org/ja/overview/morphology.html
+    # http://universaldependencies.org/ja/pos/all.html
+
+    "記号,一般,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドレミ
+    "記号,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as symbols, as in math
+
+    "感動詞,フィラー,*,*": {POS: INTJ},
+    "感動詞,一般,*,*": {POS: INTJ},
+
+    # this is specifically for unicode full-width space
+    "空白,*,*,*": {POS: X},
+
+    "形状詞,一般,*,*":{POS: ADJ},
+    "形状詞,タリ,*,*":{POS: ADJ},
+    "形状詞,助動詞語幹,*,*":{POS: ADJ},
+    "形容詞,一般,*,*":{POS: ADJ},
+    "形容詞,非自立可能,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise
+
+    "助詞,格助詞,*,*":{POS: ADP},
+    "助詞,係助詞,*,*":{POS: ADP},
+    "助詞,終助詞,*,*":{POS: PART},
+    "助詞,準体助詞,*,*":{POS: SCONJ}, # の as in 走るのが速い
+    "助詞,接続助詞,*,*":{POS: SCONJ}, # verb ending て
+    "助詞,副助詞,*,*":{POS: PART}, # ばかり, つつ after a verb
+    "助動詞,*,*,*":{POS: AUX},
+    "接続詞,*,*,*":{POS: SCONJ}, # XXX: might need refinement
+
+    "接頭辞,*,*,*":{POS: NOUN},
+    "接尾辞,形状詞的,*,*":{POS: ADJ}, # がち, チック
+    "接尾辞,形容詞的,*,*":{POS: ADJ}, # -らしい
+    "接尾辞,動詞的,*,*":{POS: NOUN}, # -じみ
+    "接尾辞,名詞的,サ変可能,*":{POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,*
+    "接尾辞,名詞的,一般,*":{POS: NOUN},
+    "接尾辞,名詞的,助数詞,*":{POS: NOUN},
+    "接尾辞,名詞的,副詞可能,*":{POS: NOUN}, # -後, -過ぎ
+
+    "代名詞,*,*,*":{POS: PRON},
+    "動詞,一般,*,*":{POS: VERB},
+    "動詞,非自立可能,*,*":{POS: VERB}, # XXX VERB if alone, AUX otherwise
+    "動詞,非自立可能,*,*,AUX":{POS: AUX},
+    "動詞,非自立可能,*,*,VERB":{POS: VERB},
+    "副詞,*,*,*":{POS: ADV},
+
+    "補助記号,AA,一般,*":{POS: SYM}, # text art
+    "補助記号,AA,顔文字,*":{POS: SYM}, # kaomoji
+    "補助記号,一般,*,*":{POS: SYM},
+    "補助記号,括弧開,*,*":{POS: PUNCT}, # open bracket
+    "補助記号,括弧閉,*,*":{POS: PUNCT}, # close bracket
+    "補助記号,句点,*,*":{POS: PUNCT}, # period or other EOS marker
+    "補助記号,読点,*,*":{POS: PUNCT}, # comma
+
+    "名詞,固有名詞,一般,*":{POS: PROPN}, # general proper noun
+    "名詞,固有名詞,人名,一般":{POS: PROPN}, # person's name
+    "名詞,固有名詞,人名,姓":{POS: PROPN}, # surname
+    "名詞,固有名詞,人名,名":{POS: PROPN}, # first name
+    "名詞,固有名詞,地名,一般":{POS: PROPN}, # place name
+    "名詞,固有名詞,地名,国":{POS: PROPN}, # country name
+
+    "名詞,助動詞語幹,*,*":{POS: AUX},
+    "名詞,数詞,*,*":{POS: NUM}, # includes Chinese numerals
+
+    "名詞,普通名詞,サ変可能,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞,普通名詞,サ変可能,*,NOUN":{POS: NOUN},
+    "名詞,普通名詞,サ変可能,*,VERB":{POS: VERB},
+
+    "名詞,普通名詞,サ変形状詞可能,*":{POS: NOUN}, # ex: 下手
+    "名詞,普通名詞,一般,*":{POS: NOUN},
+    "名詞,普通名詞,形状詞可能,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2
+    "名詞,普通名詞,形状詞可能,*,NOUN":{POS: NOUN},
+    "名詞,普通名詞,形状詞可能,*,ADJ":{POS: ADJ},
+    "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit
+    "名詞,普通名詞,副詞可能,*":{POS: NOUN},
+
+    "連体詞,*,*,*":{POS: ADJ}, # XXX this has exceptions based on literal token
+    "連体詞,*,*,*,ADJ":{POS: ADJ},
+    "連体詞,*,*,*,PRON":{POS: PRON},
+    "連体詞,*,*,*,DET":{POS: DET},
+}
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 3530ca6e2..1d8e14dbb 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -135,10 +135,9 @@ def da_tokenizer():
 
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()
 
-
 @pytest.fixture
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")
diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py
index 1e30973a3..e79c3a5ab 100644
--- a/spacy/tests/lang/ja/test_tokenizer.py
+++ b/spacy/tests/lang/ja/test_tokenizer.py
@@ -5,15 +5,41 @@ import pytest
 
 
 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]
 
+TAG_TESTS = [
+    ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']),
+    ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']),
+    ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']),
+    ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']),
+    ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*'])
+]
+
+POS_TESTS = [
+    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
+    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
+    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
+    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
+]
+
 
 @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
 def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
     tokens = [token.text for token in ja_tokenizer(text)]
     assert tokens == expected_tokens
+
+@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
+def test_japanese_tags(ja_tokenizer, text, expected_tags):
+    tags = [token.tag_ for token in ja_tokenizer(text)]
+    assert tags == expected_tags
+
+@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
+def test_japanese_pos(ja_tokenizer, text, expected_pos):
+    pos = [token.pos_ for token in ja_tokenizer(text)]
+    assert pos == expected_pos
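
A quick usage sketch of what this patch enables, for anyone who wants to try it locally. It assumes mecab-python3 and a MeCab/Unidic dictionary are installed; the snippet is illustrative only and is not part of the diff above.

    # Exercise the MeCab-backed tokenizer, the tag map, and the token
    # extension added in this patch.
    # Requires: pip install mecab-python3 (plus a MeCab dictionary).
    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp('日本語だよ')
    for token in doc:
        # tag_ holds the Unidic POS string from resolve_pos(), pos_ the
        # Universal Dependencies POS looked up via TAG_MAP, and _.mecab_tag
        # the raw MeCab tag stored in the token extension.
        print(token.text, token.tag_, token.pos_, token._.mecab_tag)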