mirror of
https://github.com/explosion/spaCy.git
Port Japanese mecab tokenizer from v1 (#2036)
* Port Japanese mecab tokenizer from v1

  This brings the MeCab-based Japanese tokenization introduced in #1246 to spaCy v2.
  There isn't a JapaneseTagger implementation yet, but POS tag information from MeCab
  is stored in a token extension. A tag map is also included.

  As a reminder, MeCab is required because Universal Dependencies are based on Unidic
  tags, and Janome doesn't support Unidic.

  Things to check:

  1. Is this the right way to use a token extension?
  2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied
     on `tag_from_strings`, which is gone now. I guess the best thing is to just try
     training spaCy's default Tagger?

  -POLM

* Add tagging/make_doc and tests
This commit is contained in:
parent cc8e804648
commit bd72fbf09c
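As a usage sketch (not part of this commit's diff): with this branch installed, plus mecab-python3 and a Unidic dictionary, the ported tokenizer is driven through the Japanese language class, with the raw MeCab POS fields available in the token extension and the Unidic tag (after disambiguation, see resolve_pos() below) in tag_. The sample sentence is taken from the tests; the exact output depends on the installed dictionary.

    # Minimal usage sketch; assumes this branch plus MeCab + Unidic are installed.
    from spacy.lang.ja import Japanese

    nlp = Japanese()              # make_doc() delegates to JapaneseTokenizer
    doc = nlp('日本語だよ')
    for token in doc:
        # _.mecab_tag: raw MeCab POS fields; tag_: Unidic tag with extra
        # disambiguation fields appended where needed
        print(token.text, token._.mecab_tag, token.tag_)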
spacy/lang/ja/__init__.py
@@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function
 
 from ...language import Language
 from ...attrs import LANG
-from ...tokens import Doc
+from ...tokens import Doc, Token
 from ...tokenizer import Tokenizer
+from .tag_map import TAG_MAP
+
+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'lemma', 'pos'])
+
+# XXX Is this the right place for this?
+Token.set_extension('mecab_tag', default=None)
+
+
+def try_mecab_import():
+    """MeCab is required for Japanese support, so check for it.
+
+    If it's not available, blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to
+    resolve ambiguous mappings.
+    """
+
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # PoS mappings.
+
+    if token.pos == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.pos + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.pos + ',PRON'
+        else:
+            return token.pos + ',ADJ'
+    return token.pos
+
+
+def detailed_tokens(tokenizer, text):
+    """Format MeCab output into a nice data structure, based on Janome."""
+    node = tokenizer.parseToNode(text)
+    node = node.next  # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface  # a default value; updated below if available
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
+        words.append(ShortUnitWord(surface, base, pos))
+        node = node.next
+    return words
+
 
 class JapaneseTokenizer(object):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome "
-                              "library: https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()
 
     def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        for token, dtoken in zip(doc, dtokens):
+            token._.mecab_tag = dtoken.pos
+            token.tag_ = resolve_pos(dtoken)
+        return doc
 
     # add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
     # allow serialization (see #1557)
@@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
     def from_disk(self, path, **exclude):
         return self
 
 
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda text: 'ja'
+    tag_map = TAG_MAP
 
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)
 
 
 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults
+    Tokenizer = JapaneseTokenizer
 
     def make_doc(self, text):
         return self.tokenizer(text)
 
 
 __all__ = ['Japanese']
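detailed_tokens() above relies on the Unidic field layout of node.feature: the first four comma-separated fields form the POS, field 6 is the reading and field 7 the lemma; out-of-dictionary tokens emit fewer fields, which is why the lemma falls back to the surface form. A small sketch of just that parsing, using a hypothetical Unidic-style feature string (an illustration only; the real value depends on the installed dictionary):

    # Hypothetical Unidic-style feature string, used only to illustrate the
    # slicing in detailed_tokens(); real values depend on the dictionary.
    feature = '動詞,一般,*,*,五段-ラ行,連用形-促音便,カワル,代わる'

    parts = feature.split(',')
    pos = ','.join(parts[0:4])              # -> '動詞,一般,*,*'
    if len(parts) > 6:                      # only in-dictionary words have these
        reading, base = parts[6], parts[7]  # -> 'カワル', '代わる'
    print(pos, reading, base)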
spacy/lang/ja/tag_map.py (new file, 88 lines)
@@ -0,0 +1,88 @@
# encoding: utf8
from __future__ import unicode_literals

from ...symbols import *

TAG_MAP = {
    # Explanation of Unidic tags:
    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf

    # Universal Dependencies Mapping:
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html

    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math

    "感動詞,フィラー,*,*": {POS: INTJ},
    "感動詞,一般,*,*": {POS: INTJ},

    # this is specifically for unicode full-width space
    "空白,*,*,*": {POS: X},

    "形状詞,一般,*,*": {POS: ADJ},
    "形状詞,タリ,*,*": {POS: ADJ},
    "形状詞,助動詞語幹,*,*": {POS: ADJ},
    "形容詞,一般,*,*": {POS: ADJ},
    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise

    "助詞,格助詞,*,*": {POS: ADP},
    "助詞,係助詞,*,*": {POS: ADP},
    "助詞,終助詞,*,*": {POS: PART},
    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
    "助動詞,*,*,*": {POS: AUX},
    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement

    "接頭辞,*,*,*": {POS: NOUN},
    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
    "接尾辞,名詞的,一般,*": {POS: NOUN},
    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ

    "代名詞,*,*,*": {POS: PRON},
    "動詞,一般,*,*": {POS: VERB},
    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
    "動詞,非自立可能,*,*,AUX": {POS: AUX},
    "動詞,非自立可能,*,*,VERB": {POS: VERB},
    "副詞,*,*,*": {POS: ADV},

    "補助記号,AA,一般,*": {POS: SYM},  # text art
    "補助記号,AA,顔文字,*": {POS: SYM},  # kaomoji
    "補助記号,一般,*,*": {POS: SYM},
    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
    "補助記号,読点,*,*": {POS: PUNCT},  # comma

    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name

    "名詞,助動詞語幹,*,*": {POS: AUX},
    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals

    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},

    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
    "名詞,普通名詞,一般,*": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
    "名詞,普通名詞,副詞可能,*": {POS: NOUN},

    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
    "連体詞,*,*,*,ADJ": {POS: ADJ},
    "連体詞,*,*,*,PRON": {POS: PRON},
    "連体詞,*,*,*,DET": {POS: DET},
}
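The extended 連体詞 keys at the end of this map exist only to receive the extra field that resolve_pos() appends in __init__.py; everything else is looked up with the plain four-field Unidic POS. A minimal sketch of that correspondence, assuming this branch is importable:

    # Sketch: the keys produced by resolve_pos() line up with the extended
    # TAG_MAP entries; plain Unidic POS strings are used as keys directly.
    from spacy.symbols import POS, ADJ, DET, PRON, AUX
    from spacy.lang.ja.tag_map import TAG_MAP

    assert TAG_MAP['連体詞,*,*,*,DET'][POS] == DET    # e.g. この, その, あの
    assert TAG_MAP['連体詞,*,*,*,PRON'][POS] == PRON  # other こそあど-type 連体詞
    assert TAG_MAP['連体詞,*,*,*,ADJ'][POS] == ADJ    # everything else
    assert TAG_MAP['助動詞,*,*,*'][POS] == AUX        # plain keys are used as-is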
spacy/tests/conftest.py
@@ -135,10 +135,9 @@ def da_tokenizer():
 
 
 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    janome = pytest.importorskip("MeCab")
     return util.get_lang_class('ja').Defaults.create_tokenizer()
 
 
 @pytest.fixture
 def th_tokenizer():
     pythainlp = pytest.importorskip("pythainlp")
spacy/tests/lang/ja/test_tokenizer.py
@@ -5,15 +5,41 @@ import pytest
 
 
 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]
+
+TAG_TESTS = [
+    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
+    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
+    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
+    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
+    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
+]
+
+POS_TESTS = [
+    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
+    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
+    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
+    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
+]
 
 
 @pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
 def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
     tokens = [token.text for token in ja_tokenizer(text)]
     assert tokens == expected_tokens
+
+
+@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
+def test_japanese_tags(ja_tokenizer, text, expected_tags):
+    tags = [token.tag_ for token in ja_tokenizer(text)]
+    assert tags == expected_tags
+
+
+@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
+def test_japanese_pos(ja_tokenizer, text, expected_pos):
+    pos = [token.pos_ for token in ja_tokenizer(text)]
+    assert pos == expected_pos
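Both the conftest fixture and these tests skip unless MeCab can be imported. A quick standalone sanity check of the environment, independent of spaCy (assuming mecab-python3 and a dictionary such as Unidic are installed; the analysis columns and exact splits depend on the dictionary):

    # Standalone check that MeCab is usable; prints one analysis line per token
    # plus an EOS marker. With a Unidic-style dictionary the segmentation should
    # line up with TOKENIZER_TESTS above.
    import MeCab

    tagger = MeCab.Tagger()
    print(tagger.parse('すもももももももものうち'))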