mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Port Japanese mecab tokenizer from v1 (#2036)
* Port Japanese mecab tokenizer from v1 This brings the Mecab-based Japanese tokenization introduced in #1246 to spaCy v2. There isn't a JapaneseTagger implementation yet, but POS tag information from Mecab is stored in a token extension. A tag map is also included. As a reminder, Mecab is required because Universal Dependencies are based on Unidic tags, and Janome doesn't support Unidic. Things to check: 1. Is this the right way to use a token extension? 2. What's the right way to implement a JapaneseTagger? The approach in #1246 relied on `tag_from_strings` which is just gone now. I guess the best thing is to just try training spaCy's default Tagger? -POLM * Add tagging/make_doc and tests
This commit is contained in:
parent
cc8e804648
commit
bd72fbf09c
|
@ -3,23 +3,87 @@ from __future__ import unicode_literals, print_function
|
|||
|
||||
from ...language import Language
|
||||
from ...attrs import LANG
|
||||
from ...tokens import Doc
|
||||
from ...tokens import Doc, Token
|
||||
from ...tokenizer import Tokenizer
|
||||
from .tag_map import TAG_MAP
|
||||
|
||||
import re
|
||||
from collections import namedtuple
|
||||
|
||||
# Record for a single analysed word: surface form, lemma and POS string
ShortUnitWord = namedtuple('ShortUnitWord', 'surface lemma pos')

# XXX Is this the right place for this?
# Makes `token._.mecab_tag` available; it is None until a tokenizer fills it
Token.set_extension('mecab_tag', default=None)
|
||||
|
||||
def try_mecab_import():
    """Import the MeCab module and return it.

    MeCab is required for Japanese support. If it's not available, raise an
    ImportError that explains where to get it.
    """
    try:
        import MeCab
    except ImportError:
        raise ImportError("Japanese support requires MeCab: "
                          "https://github.com/SamuraiT/mecab-python3")
    else:
        return MeCab
|
||||
|
||||
def resolve_pos(token):
    """Return the POS tag of `token`, extended with a UD hint when ambiguous.

    Some Unidic POS tags map to different Universal Dependencies tags
    depending on the literal token or on its context in the sentence. For
    those tags an extra comma-separated field is appended so the tag map can
    tell the cases apart. All other tags are returned unchanged.

    token: object with `pos` and `surface` string attributes.
    """
    # NOTE: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.
    tag = token.pos
    if tag != '連体詞,*,*,*':
        # Only adnominals are disambiguated for now
        return tag

    surface = token.surface
    if re.match('^[こそあど此其彼]の', surface):
        hint = 'DET'
    elif re.match('^[こそあど此其彼]', surface):
        hint = 'PRON'
    else:
        hint = 'ADJ'
    return tag + ',' + hint
|
||||
|
||||
def detailed_tokens(tokenizer, text):
    """Format Mecab output into a nice data structure, based on Janome.

    tokenizer: object providing `parseToNode(text)`, which returns the first
        node of a linked list (each node has `next`, `posid`, `surface` and
        a comma-separated `feature` string).
    text: the string to analyse.

    Returns a list of ShortUnitWord(surface, lemma, pos), one per node, in
    order. `pos` is the first four feature fields joined by commas. `lemma`
    falls back to the surface form when the features carry no lemma field.
    """
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
    # Stop at the first node with posid 0 (presumably the end-of-sentence
    # node), or when the linked list simply runs out.
    while node is not None and node.posid != 0:
        surface = node.surface
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])

        # The lemma sits in the eighth field, which is only available for
        # words in the tokenizer dictionary. Check for eight fields before
        # indexing: a seven-field feature string would otherwise raise
        # IndexError.
        base = parts[7] if len(parts) > 7 else surface

        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words
|
||||
|
||||
class JapaneseTokenizer(object):
|
||||
def __init__(self, cls, nlp=None):
    """Set up the shared vocab and the MeCab tagger.

    cls: the language defaults class; used to create a vocab when `nlp` is
        not given.
    nlp: optional language object whose vocab is reused.

    Raises ImportError (via try_mecab_import) if MeCab is not installed.
    """
    self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)

    # MeCab is the only backend: the Janome tokenizer that used to be built
    # here was immediately overwritten, yet still forced Janome to be
    # installed.
    MeCab = try_mecab_import()
    self.tokenizer = MeCab.Tagger()
|
||||
|
||||
def __call__(self, text):
    """Tokenize `text` and return a Doc with tags taken from MeCab.

    Every token gets the raw MeCab POS string in `token._.mecab_tag`, and
    `token.tag_` is set to the result of resolve_pos for that word. All
    tokens are created without trailing whitespace.
    """
    # The earlier Janome-style path (`self.tokenizer.tokenize`) returned
    # before any of this ran and does not match the MeCab tagger API.
    dtokens = detailed_tokens(self.tokenizer, text)
    words = [x.surface for x in dtokens]
    doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
    for token, dtoken in zip(doc, dtokens):
        token._.mecab_tag = dtoken.pos
        token.tag_ = resolve_pos(dtoken)
    return doc
|
||||
|
||||
# add dummy methods for to_bytes, from_bytes, to_disk and from_disk to
|
||||
# allow serialization (see #1557)
|
||||
|
@ -35,22 +99,21 @@ class JapaneseTokenizer(object):
|
|||
def from_disk(self, path, **exclude):
    """Dummy loader: ignore `path` and `exclude`, return self unchanged."""
    return self
|
||||
|
||||
|
||||
class JapaneseDefaults(Language.Defaults):
    """Defaults for Japanese: tag map, LANG getter and tokenizer factory."""
    tag_map = TAG_MAP

    # Work on a copy so the LANG override does not touch the base getters
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'ja'

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Return a JapaneseTokenizer built from this class and `nlp`."""
        return JapaneseTokenizer(cls, nlp)
|
||||
|
||||
|
||||
class Japanese(Language):
    """Language subclass for Japanese ('ja')."""
    lang = 'ja'
    Defaults = JapaneseDefaults
    Tokenizer = JapaneseTokenizer

    def make_doc(self, text):
        """Create a Doc from `text` by calling this object's tokenizer."""
        doc = self.tokenizer(text)
        return doc
|
||||
|
||||
|
||||
__all__ = ['Japanese']
|
||||
|
|
88
spacy/lang/ja/tag_map.py
Normal file
88
spacy/lang/ja/tag_map.py
Normal file
|
@ -0,0 +1,88 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ...symbols import *
|
||||
|
||||
TAG_MAP = {
    # Keys are Unidic tags (four comma-separated fields), optionally followed
    # by a fifth field that disambiguates the Universal Dependencies mapping.
    #
    # Explanation of Unidic tags:
    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
    #
    # Universal Dependencies Mapping:
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html

    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math

    "感動詞,フィラー,*,*": {POS: INTJ},
    "感動詞,一般,*,*": {POS: INTJ},

    # this is specifically for unicode full-width space
    "空白,*,*,*": {POS: X},

    "形状詞,一般,*,*": {POS: ADJ},
    "形状詞,タリ,*,*": {POS: ADJ},
    "形状詞,助動詞語幹,*,*": {POS: ADJ},
    "形容詞,一般,*,*": {POS: ADJ},
    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise

    "助詞,格助詞,*,*": {POS: ADP},
    "助詞,係助詞,*,*": {POS: ADP},
    "助詞,終助詞,*,*": {POS: PART},
    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
    "助動詞,*,*,*": {POS: AUX},
    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement

    "接頭辞,*,*,*": {POS: NOUN},
    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
    "接尾辞,名詞的,一般,*": {POS: NOUN},
    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ

    "代名詞,*,*,*": {POS: PRON},
    "動詞,一般,*,*": {POS: VERB},
    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
    "動詞,非自立可能,*,*,AUX": {POS: AUX},
    "動詞,非自立可能,*,*,VERB": {POS: VERB},
    "副詞,*,*,*": {POS: ADV},

    "補助記号,AA,一般,*": {POS: SYM},  # text art
    "補助記号,AA,顔文字,*": {POS: SYM},  # kaomoji
    "補助記号,一般,*,*": {POS: SYM},
    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
    "補助記号,読点,*,*": {POS: PUNCT},  # comma

    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name

    "名詞,助動詞語幹,*,*": {POS: AUX},
    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals

    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},

    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
    "名詞,普通名詞,一般,*": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
    "名詞,普通名詞,副詞可能,*": {POS: NOUN},

    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
    "連体詞,*,*,*,ADJ": {POS: ADJ},
    "連体詞,*,*,*,PRON": {POS: PRON},
    "連体詞,*,*,*,DET": {POS: DET},
}
|
|
@ -135,10 +135,9 @@ def da_tokenizer():
|
|||
|
||||
@pytest.fixture
def ja_tokenizer():
    """Return the default Japanese tokenizer; skip the test without MeCab."""
    # Only MeCab gates these tests. The result of importorskip is not
    # needed, so it is no longer bound to a (misleadingly named) local.
    pytest.importorskip("MeCab")
    return util.get_lang_class('ja').Defaults.create_tokenizer()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def th_tokenizer():
|
||||
pythainlp = pytest.importorskip("pythainlp")
|
||||
|
|
|
@ -5,15 +5,41 @@ import pytest
|
|||
|
||||
|
||||
# (text, expected tokens) pairs for the MeCab-based tokenizer. Each sentence
# appears exactly once; the token counts line up with the per-token
# expectations used by the tag and POS tests.
TOKENIZER_TESTS = [
    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']),
]
|
||||
|
||||
# (text, expected token.tag_ values). The tokenizer sets tag_ to the first
# four MeCab feature fields joined by commas, with '*' for empty fields, so
# the expectations use that exact form.
TAG_TESTS = [
    ("日本語だよ", [
        '名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*',
        '助詞,終助詞,*,*']),
    ("東京タワーの近くに住んでいます。", [
        '名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*',
        '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*',
        '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*',
        '補助記号,句点,*,*']),
    ("吾輩は猫である。", [
        '代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*',
        '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']),
    ("月に代わって、お仕置きよ!", [
        '名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*',
        '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*',
        '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']),
    ("すもももももももものうち", [
        '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*',
        '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*',
        '名詞,普通名詞,副詞可能,*']),
]

# (text, expected coarse-grained token.pos_ values)
POS_TESTS = [
    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
    ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
    ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
    ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
    ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']),
]


# Each test needs its own name: with a shared name, later definitions
# replace earlier ones and only the last test is ever collected.
@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS)
def test_japanese_tokenizer(ja_tokenizer, text, expected_tokens):
    """The tokenizer splits each sentence into the expected surface forms."""
    tokens = [token.text for token in ja_tokenizer(text)]
    assert tokens == expected_tokens


@pytest.mark.parametrize('text,expected_tags', TAG_TESTS)
def test_japanese_tokenizer_tags(ja_tokenizer, text, expected_tags):
    """Each token carries the expected fine-grained tag."""
    tags = [token.tag_ for token in ja_tokenizer(text)]
    assert tags == expected_tags


@pytest.mark.parametrize('text,expected_pos', POS_TESTS)
def test_japanese_tokenizer_pos(ja_tokenizer, text, expected_pos):
    """Each token carries the expected coarse-grained part of speech."""
    pos = [token.pos_ for token in ja_tokenizer(text)]
    assert pos == expected_pos
|
||||
|
|
Loading…
Reference in New Issue
Block a user