Merge pull request #1246 from polm/ja-pos-tagger
[wip] Sample implementation of Japanese Tagger (ref #1214)
Commit 331d338b8b
@@ -5,37 +5,115 @@ from os import path
 from ..language import Language, BaseDefaults
 from ..tokenizer import Tokenizer
+from ..tagger import Tagger
 from ..attrs import LANG
 from ..tokens import Doc

 from .language_data import *

+import re
+from collections import namedtuple
+
+ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
+
+
+def try_mecab_import():
+    """MeCab is required for Japanese support, so check for it.
+
+    If it's not available, blow up and explain how to fix it."""
+    try:
+        import MeCab
+        return MeCab
+    except ImportError:
+        raise ImportError("Japanese support requires MeCab: "
+                          "https://github.com/SamuraiT/mecab-python3")
+
+
 class JapaneseTokenizer(object):
     def __init__(self, cls, nlp=None):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        try:
-            from janome.tokenizer import Tokenizer
-        except ImportError:
-            raise ImportError("The Japanese tokenizer requires the Janome library: "
-                              "https://github.com/mocobeta/janome")
-        self.tokenizer = Tokenizer()
+        MeCab = try_mecab_import()
+        self.tokenizer = MeCab.Tagger()

     def __call__(self, text):
-        words = [x.surface for x in self.tokenizer.tokenize(text)]
+        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
         return Doc(self.vocab, words=words, spaces=[False]*len(words))

+
+def resolve_pos(token):
+    """If necessary, add a field to the POS tag for UD mapping.
+
+    Under Universal Dependencies, sometimes the same Unidic POS tag can
+    be mapped differently depending on the literal token or its context
+    in the sentence. This function adds information to the POS tag to
+    resolve ambiguous mappings.
+    """
+    # NOTE: This is a first take. The rules here are crude approximations.
+    # For many of these, full dependencies are needed to properly resolve
+    # PoS mappings.
+    if token.part_of_speech == '連体詞,*,*,*':
+        if re.match('^[こそあど此其彼]の', token.surface):
+            return token.part_of_speech + ',DET'
+        if re.match('^[こそあど此其彼]', token.surface):
+            return token.part_of_speech + ',PRON'
+        else:
+            return token.part_of_speech + ',ADJ'
+    return token.part_of_speech
+
+
+def detailed_tokens(tokenizer, text):
+    """Format MeCab output into a nice data structure, based on Janome."""
+    node = tokenizer.parseToNode(text)
+    node = node.next  # first node is beginning of sentence and empty, skip it
+    words = []
+    while node.posid != 0:
+        surface = node.surface
+        base = surface
+        parts = node.feature.split(',')
+        pos = ','.join(parts[0:4])
+
+        if len(parts) > 6:
+            # this information is only available for words in the tokenizer dictionary
+            reading = parts[6]
+            base = parts[7]
+
+        words.append(ShortUnitWord(surface, base, pos))
+        node = node.next
+    return words
+
+
+class JapaneseTagger(object):
+    def __init__(self, vocab):
+        MeCab = try_mecab_import()
+        self.tagger = Tagger(vocab)
+        self.tokenizer = MeCab.Tagger()
+
+    def __call__(self, tokens):
+        # two parts to this:
+        # 1. get raw JP tags
+        # 2. add features to tags as necessary for UD
+        dtokens = detailed_tokens(self.tokenizer, tokens.text)
+        rawtags = list(map(resolve_pos, dtokens))
+        self.tagger.tag_from_strings(tokens, rawtags)
+
+
 class JapaneseDefaults(BaseDefaults):
+    tag_map = TAG_MAP
+
     @classmethod
     def create_tokenizer(cls, nlp=None):
         return JapaneseTokenizer(cls, nlp)

+    @classmethod
+    def create_tagger(cls, tokenizer):
+        return JapaneseTagger(tokenizer.vocab)
+
 class Japanese(Language):
     lang = 'ja'
     Defaults = JapaneseDefaults

     def make_doc(self, text):
-        words = self.tokenizer(text)
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        words = [str(t) for t in self.tokenizer(text)]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        tagger = JapaneseDefaults.create_tagger(self.tokenizer)
+        tagger(doc)
+        return doc
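Taken together: make_doc tokenizes with MeCab, then runs the new JapaneseTagger over the fresh Doc. A minimal usage sketch, assuming MeCab and a Unidic dictionary are installed, and assuming Japanese is importable as spacy.ja (the expected values come from the tagger tests added below):

    from spacy.ja import Japanese

    nlp = Japanese()
    doc = nlp.make_doc('あれならそこにあるよ')
    for token in doc:
        # tag_ holds the raw (possibly context-augmented) Unidic tag,
        # pos_ the Universal Dependencies mapping,
        # e.g. あれ -> 代名詞,*,*,* / PRON
        print(token.text, token.tag_, token.pos_)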
@@ -3,22 +3,86 @@ from __future__ import unicode_literals

 from ..symbols import *


 TAG_MAP = {
-    "ADV": {POS: ADV},
-    "NOUN": {POS: NOUN},
-    "ADP": {POS: ADP},
-    "PRON": {POS: PRON},
-    "SCONJ": {POS: SCONJ},
-    "PROPN": {POS: PROPN},
-    "DET": {POS: DET},
-    "SYM": {POS: SYM},
-    "INTJ": {POS: INTJ},
-    "PUNCT": {POS: PUNCT},
-    "NUM": {POS: NUM},
-    "AUX": {POS: AUX},
-    "X": {POS: X},
-    "CONJ": {POS: CONJ},
-    "ADJ": {POS: ADJ},
-    "VERB": {POS: VERB}
+    # Explanation of Unidic tags:
+    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
+
+    # Universal Dependencies Mapping:
+    # http://universaldependencies.org/ja/overview/morphology.html
+    # http://universaldependencies.org/ja/pos/all.html
+
+    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
+    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math
+
+    "感動詞,フィラー,*,*": {POS: INTJ},
+    "感動詞,一般,*,*": {POS: INTJ},
+
+    # this is specifically for unicode full-width space
+    "空白,*,*,*": {POS: X},
+
+    "形状詞,一般,*,*": {POS: ADJ},
+    "形状詞,タリ,*,*": {POS: ADJ},
+    "形状詞,助動詞語幹,*,*": {POS: ADJ},
+    "形容詞,一般,*,*": {POS: ADJ},
+    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise
+
+    "助詞,格助詞,*,*": {POS: ADP},
+    "助詞,係助詞,*,*": {POS: ADP},
+    "助詞,終助詞,*,*": {POS: PART},
+    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
+    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
+    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
+    "助動詞,*,*,*": {POS: AUX},
+    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement
+
+    "接頭辞,*,*,*": {POS: NOUN},
+    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
+    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
+    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
+    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
+    "接尾辞,名詞的,一般,*": {POS: NOUN},
+    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
+    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ
+
+    "代名詞,*,*,*": {POS: PRON},
+    "動詞,一般,*,*": {POS: VERB},
+    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
+    "動詞,非自立可能,*,*,AUX": {POS: AUX},
+    "動詞,非自立可能,*,*,VERB": {POS: VERB},
+    "副詞,*,*,*": {POS: ADV},
+
+    "補助記号,AA,一般,*": {POS: SYM},  # text art
+    "補助記号,AA,顔文字,*": {POS: SYM},  # kaomoji
+    "補助記号,一般,*,*": {POS: SYM},
+    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
+    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
+    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
+    "補助記号,読点,*,*": {POS: PUNCT},  # comma
+
+    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
+    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
+    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
+    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
+    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
+    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name
+
+    "名詞,助動詞語幹,*,*": {POS: AUX},
+    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals
+
+    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
+    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
+    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},
+
+    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
+    "名詞,普通名詞,一般,*": {POS: NOUN},
+    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
+    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
+    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
+    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
+    "名詞,普通名詞,副詞可能,*": {POS: NOUN},
+
+    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
+    "連体詞,*,*,*,ADJ": {POS: ADJ},
+    "連体詞,*,*,*,PRON": {POS: PRON},
+    "連体詞,*,*,*,DET": {POS: DET},
 }
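Note how the last block pairs the ambiguous bare 連体詞 tag with context-augmented variants: resolve_pos appends ,DET / ,PRON / ,ADJ to the Unidic tag, and the augmented string is what gets looked up in this map. A minimal illustration (the example surface forms are ours, not from the PR):

    from spacy.symbols import POS, DET, ADJ

    assert TAG_MAP['連体詞,*,*,*,DET'][POS] == DET  # e.g. この, その
    assert TAG_MAP['連体詞,*,*,*,ADJ'][POS] == ADJ  # e.g. 大きな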
@@ -80,9 +80,13 @@ def fi_tokenizer():

 @pytest.fixture
 def ja_tokenizer():
-    janome = pytest.importorskip("janome")
+    pytest.importorskip("MeCab")
     return Japanese.Defaults.create_tokenizer()


+@pytest.fixture
+def japanese():
+    pytest.importorskip("MeCab")
+    return Japanese()
+
+
 @pytest.fixture
 def sv_tokenizer():
spacy/tests/ja/test_tagger.py (new file, 38 lines)
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+TAGGER_TESTS = [
+    ('あれならそこにあるよ',
+     (('代名詞,*,*,*', 'PRON'),
+      ('助動詞,*,*,*', 'AUX'),
+      ('代名詞,*,*,*', 'PRON'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助詞,終助詞,*,*', 'PART'))),
+    ('このファイルには小さなテストが入っているよ',
+     (('連体詞,*,*,*,DET', 'DET'),
+      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('助詞,係助詞,*,*', 'ADP'),
+      ('連体詞,*,*,*,ADJ', 'ADJ'),
+      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,一般,*,*', 'VERB'),
+      ('助詞,接続助詞,*,*', 'SCONJ'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助詞,終助詞,*,*', 'PART'))),
+    ('プププランドに行きたい',
+     (('名詞,普通名詞,一般,*', 'NOUN'),
+      ('助詞,格助詞,*,*', 'ADP'),
+      ('動詞,非自立可能,*,*', 'VERB'),
+      ('助動詞,*,*,*', 'AUX')))
+]
+
+
+@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
+def test_japanese_tagger(japanese, text, expected_tags):
+    tokens = japanese.make_doc(text)
+    assert len(tokens) == len(expected_tags)
+    for token, res in zip(tokens, expected_tags):
+        assert token.tag_ == res[0] and token.pos_ == res[1]
@@ -4,10 +4,10 @@ from __future__ import unicode_literals

 import pytest

 TOKENIZER_TESTS = [
-    ("日本語だよ", ['日本語', 'だ', 'よ']),
+    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
     ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
     ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
-    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
+    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
     ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
 ]
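The two changed expectations reflect the switch from Janome to MeCab with Unidic-style short unit words, which segments compounds like 日本語 more finely. A quick check, assuming MeCab and its dictionary are installed:

    tokenizer = Japanese.Defaults.create_tokenizer()
    doc = tokenizer('日本語だよ')
    assert [t.text for t in doc] == ['日本', '語', 'だ', 'よ']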