Port Japanese mecab tokenizer from v1 (#2036)

* Port Japanese mecab tokenizer from v1

This brings the MeCab-based Japanese tokenization introduced in #1246 to
spaCy v2. There isn't a JapaneseTagger implementation yet, but the POS tag
information from MeCab is stored in a token extension. A tag map is also
included.
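
For reference, here's roughly how this is meant to be used once MeCab and a
UniDic-style dictionary are installed (a minimal usage sketch, not part of the
diff; the example sentence is just for illustration):

    from spacy.lang.ja import Japanese

    nlp = Japanese()
    doc = nlp("吾輩は猫である。")
    for token in doc:
        # token._.mecab_tag holds the raw Unidic POS string from MeCab,
        # token.tag_ the (possibly disambiguated) tag, and token.pos_ the
        # Universal Dependencies POS mapped through the included tag map.
        print(token.text, token._.mecab_tag, token.tag_, token.pos_)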

As a reminder, MeCab is required because the Universal Dependencies Japanese
data is based on Unidic tags, and Janome doesn't support Unidic.

Things to check:

1. Is this the right way to use a token extension?

2. What's the right way to implement a JapaneseTagger? The approach in
#1246 relied on `tag_from_strings`, which has since been removed. I guess the
best thing is to just try training spaCy's default Tagger? (Rough sketch below.)
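
To make that concrete, here's a rough sketch of what training the stock v2
Tagger on top of this tokenizer might look like. It isn't part of this PR; the
single training sentence and its Unidic tags are invented for illustration, and
a real run would obviously need real data:

    import random
    from spacy.lang.ja import Japanese

    # One toy example; the tags are keys of the tag map added in this PR and
    # line up with the tokenizer's output for this sentence (6 tokens).
    TRAIN_DATA = [
        ("吾輩は猫である。",
         {"tags": ["代名詞,*,*,*", "助詞,係助詞,*,*", "名詞,普通名詞,一般,*",
                   "助動詞,*,*,*", "動詞,非自立可能,*,*", "補助記号,句点,*,*"]}),
    ]

    nlp = Japanese()
    tagger = nlp.create_pipe('tagger')  # label set comes from the tag map
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)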

-POLM

* Add tagging/make_doc and tests
Paul O'Leary McCann 2018-05-04 01:38:26 +09:00 committed by Matthew Honnibal
parent cc8e804648
commit bd72fbf09c
4 changed files with 192 additions and 16 deletions

spacy/lang/ja/__init__.py

@@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function
 from ...language import Language
 from ...attrs import LANG
-from ...tokens import Doc
+from ...tokens import Doc, Token
 from ...tokenizer import Tokenizer
+from .tag_map import TAG_MAP
+
+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
+
+# XXX Is this the right place for this?
+Token.set_extension('mecab_tag', default=None)
+
+
+def try_mecab_import():
+    """Mecab is required for Japanese support, so check for it.
+    If it's not available, blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to
+    resolve ambiguous mappings.
+    """
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # PoS mappings.
+    if token.pos == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.pos + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.pos + ',PRON'
+        else:
+            return token.pos + ',ADJ'
+    return token.pos
+
+
+def detailed_tokens(tokenizer, text):
+    """Format Mecab output into a nice data structure, based on Janome."""
+    node = tokenizer.parseToNode(text)
+    node = node.next  # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface  # a default value. Updated if available later.
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+        words.append(ShortUnitWord(surface, base, pos))
+        node = node.next
+    return words
+
+
 class JapaneseTokenizer(object):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome "
-                              "library: https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()

     def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        for token, dtoken in zip(doc, dtokens):
+            token._.mecab_tag = dtoken.pos
+            token.tag_ = resolve_pos(dtoken)
+        return doc

     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
@@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
     def from_disk(self, path, **exclude):
         return self


 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'ja'
+    tag_map = TAG_MAP

     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)


 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults
-    Tokenizer = JapaneseTokenizer

     def make_doc(self, text):
         return self.tokenizer(text)


 __all__ = ['Japanese']

spacy/lang/ja/tag_map.py (new file, 88 lines)

@@ -0,0 +1,88 @@
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import *

TAG_MAP = {
    # Explanation of Unidic tags:
    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
    # Universal Dependencies Mapping:
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html
    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math
    "感動詞,フィラー,*,*": {POS: INTJ},
    "感動詞,一般,*,*": {POS: INTJ},
    # this is specifically for unicode full-width space
    "空白,*,*,*": {POS: X},
    "形状詞,一般,*,*": {POS: ADJ},
    "形状詞,タリ,*,*": {POS: ADJ},
    "形状詞,助動詞語幹,*,*": {POS: ADJ},
    "形容詞,一般,*,*": {POS: ADJ},
    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise
    "助詞,格助詞,*,*": {POS: ADP},
    "助詞,係助詞,*,*": {POS: ADP},
    "助詞,終助詞,*,*": {POS: PART},
    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
    "助動詞,*,*,*": {POS: AUX},
    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement
    "接頭辞,*,*,*": {POS: NOUN},
    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
    "接尾辞,名詞的,一般,*": {POS: NOUN},
    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ
    "代名詞,*,*,*": {POS: PRON},
    "動詞,一般,*,*": {POS: VERB},
    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
    "動詞,非自立可能,*,*,AUX": {POS: AUX},
    "動詞,非自立可能,*,*,VERB": {POS: VERB},
    "副詞,*,*,*": {POS: ADV},
    "補助記号,ＡＡ,一般,*": {POS: SYM},  # text art
    "補助記号,ＡＡ,顔文字,*": {POS: SYM},  # kaomoji
    "補助記号,一般,*,*": {POS: SYM},
    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
    "補助記号,読点,*,*": {POS: PUNCT},  # comma
    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name
    "名詞,助動詞語幹,*,*": {POS: AUX},
    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals
    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},
    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
    "名詞,普通名詞,一般,*": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
    "名詞,普通名詞,副詞可能,*": {POS: NOUN},
    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
    "連体詞,*,*,*,ADJ": {POS: ADJ},
    "連体詞,*,*,*,PRON": {POS: PRON},
    "連体詞,*,*,*,DET": {POS: DET},
}

spacy/tests/conftest.py

@@ -135,10 +135,9 @@ def da_tokenizer():
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    janome = pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()


 @pytest.fixture
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")

spacy/tests/lang/ja/test_tokenizer.py

@@ -5,15 +5,41 @@ import pytest
 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]

+TAG_TESTS = [
+    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
+    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
+    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
+    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
+    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
+]
+
+POS_TESTS = [
+    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
+    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
+    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
+    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
+]
+

 @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
 def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
     tokens = [token.text for token in ja_tokenizer(text)]
     assert tokens == expected_tokens
+
+
+@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
+def test_japanese_tags(ja_tokenizer, text, expected_tags):
+    tags = [token.tag_ for token in ja_tokenizer(text)]
+    assert tags == expected_tags
+
+
+@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
+def test_japanese_pos(ja_tokenizer, text, expected_pos):
+    pos = [token.pos_ for token in ja_tokenizer(text)]
+    assert pos == expected_pos