Merge pull request #1246 from polm/ja-pos-tagger
[wip] Sample implementation of Japanese Tagger (ref #1214)
This commit is contained in: commit 331d338b8b
@@ -5,37 +5,115 @@ from os import path

from ..language import Language, BaseDefaults
from ..tokenizer import Tokenizer
from ..tagger import Tagger
from ..attrs import LANG
from ..tokens import Doc

from .language_data import *

import re
from collections import namedtuple


ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])

def try_mecab_import():
    """MeCab is required for Japanese support, so check for it.

    If it's not available, blow up and explain how to fix it."""
    try:
        import MeCab
        return MeCab
    except ImportError:
        raise ImportError("Japanese support requires MeCab: "
                          "https://github.com/SamuraiT/mecab-python3")


class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            from janome.tokenizer import Tokenizer
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the Janome library: "
                              "https://github.com/mocobeta/janome")
        self.tokenizer = Tokenizer()
        MeCab = try_mecab_import()
        self.tokenizer = MeCab.Tagger()

    def __call__(self, text):
        words = [x.surface for x in self.tokenizer.tokenize(text)]
        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
        return Doc(self.vocab, words=words, spaces=[False]*len(words))
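For orientation, a minimal usage sketch (not part of the diff): it assumes mecab-python3 and a Unidic-style dictionary are installed, and creates the tokenizer through the language defaults the same way the test fixtures below do.

# Hypothetical usage sketch, assuming MeCab and its dictionary are available.
from spacy.ja import Japanese

tokenizer = Japanese.Defaults.create_tokenizer()
doc = tokenizer('日本語だよ')            # returns a Doc with whitespace-free tokens
print([token.text for token in doc])    # e.g. ['日本', '語', 'だ', 'よ'] with a Unidic dictionary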

def resolve_pos(token):
    """If necessary, add a field to the POS tag for UD mapping.

    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """

    # NOTE: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.

    if token.part_of_speech == '連体詞,*,*,*':
        if re.match('^[こそあど此其彼]の', token.surface):
            return token.part_of_speech + ',DET'
        if re.match('^[こそあど此其彼]', token.surface):
            return token.part_of_speech + ',PRON'
        else:
            return token.part_of_speech + ',ADJ'
    return token.part_of_speech
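To make the 連体詞 disambiguation concrete, a small worked example using the ShortUnitWord tuple defined above (illustrative values only):

# Worked example of the rules in resolve_pos (illustrative values).
kono = ShortUnitWord('この', 'この', '連体詞,*,*,*')
ookina = ShortUnitWord('大きな', '大きな', '連体詞,*,*,*')
resolve_pos(kono)    # -> '連体詞,*,*,*,DET'  (こそあど prefix followed by の)
resolve_pos(ookina)  # -> '連体詞,*,*,*,ADJ'  (no こそあど prefix)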

def detailed_tokens(tokenizer, text):
    """Format Mecab output into a nice data structure, based on Janome."""

    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
    while node.posid != 0:
        surface = node.surface
        base = surface
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])

        if len(parts) > 6:
            # this information is only available for words in the tokenizer dictionary
            reading = parts[6]
            base = parts[7]

        words.append( ShortUnitWord(surface, base, pos) )
        node = node.next
    return words
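A rough sketch of how this helper is driven (assumes mecab-python3 is installed; the exact feature-string layout depends on the installed dictionary):

# Hypothetical sketch: feed raw text through MeCab and inspect the parsed tuples.
MeCab = try_mecab_import()
mecab_tagger = MeCab.Tagger()
for word in detailed_tokens(mecab_tagger, 'あれならそこにあるよ'):
    # each entry is a ShortUnitWord(surface, base_form, part_of_speech)
    print(word.surface, word.base_form, resolve_pos(word))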

class JapaneseTagger(object):
    def __init__(self, vocab):
        MeCab = try_mecab_import()
        self.tagger = Tagger(vocab)
        self.tokenizer = MeCab.Tagger()

    def __call__(self, tokens):
        # two parts to this:
        # 1. get raw JP tags
        # 2. add features to tags as necessary for UD

        dtokens = detailed_tokens(self.tokenizer, tokens.text)
        rawtags = list(map(resolve_pos, dtokens))
        self.tagger.tag_from_strings(tokens, rawtags)


class JapaneseDefaults(BaseDefaults):
    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)

    @classmethod
    def create_tagger(cls, tokenizer):
        return JapaneseTagger(tokenizer.vocab)


class Japanese(Language):
    lang = 'ja'

    Defaults = JapaneseDefaults

    def make_doc(self, text):
        words = self.tokenizer(text)
        return Doc(self.vocab, words=words, spaces=[False]*len(words))

        words = [str(t) for t in self.tokenizer(text)]
        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
        tagger = JapaneseDefaults.create_tagger(self.tokenizer)
        tagger(doc)
        return doc
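Putting the pieces together, a hedged end-to-end sketch of how the new tagger is exercised; this mirrors test_tagger.py below and assumes MeCab is installed:

# Hypothetical end-to-end sketch; mirrors the tagger test below.
from spacy.ja import Japanese

nlp = Japanese()
doc = nlp.make_doc('このファイルには小さなテストが入っているよ')
for token in doc:
    # token.tag_ holds the (possibly extended) Unidic tag, token.pos_ the UD tag from TAG_MAP
    print(token.text, token.tag_, token.pos_)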
@@ -3,22 +3,86 @@ from __future__ import unicode_literals

from ..symbols import *


TAG_MAP = {
    "ADV": {POS: ADV},
    "NOUN": {POS: NOUN},
    "ADP": {POS: ADP},
    "PRON": {POS: PRON},
    "SCONJ": {POS: SCONJ},
    "PROPN": {POS: PROPN},
    "DET": {POS: DET},
    "SYM": {POS: SYM},
    "INTJ": {POS: INTJ},
    "PUNCT": {POS: PUNCT},
    "NUM": {POS: NUM},
    "AUX": {POS: AUX},
    "X": {POS: X},
    "CONJ": {POS: CONJ},
    "ADJ": {POS: ADJ},
    "VERB": {POS: VERB}
    # Explanation of Unidic tags:
    # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf

    # Universal Dependencies Mapping:
    # http://universaldependencies.org/ja/overview/morphology.html
    # http://universaldependencies.org/ja/pos/all.html

    "記号,一般,*,*": {POS: PUNCT},  # this includes characters used to represent sounds like ドレミ
    "記号,文字,*,*": {POS: PUNCT},  # this is for Greek and Latin characters used as symbols, as in math

    "感動詞,フィラー,*,*": {POS: INTJ},
    "感動詞,一般,*,*": {POS: INTJ},

    # this is specifically for unicode full-width space
    "空白,*,*,*": {POS: X},

    "形状詞,一般,*,*": {POS: ADJ},
    "形状詞,タリ,*,*": {POS: ADJ},
    "形状詞,助動詞語幹,*,*": {POS: ADJ},
    "形容詞,一般,*,*": {POS: ADJ},
    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise

    "助詞,格助詞,*,*": {POS: ADP},
    "助詞,係助詞,*,*": {POS: ADP},
    "助詞,終助詞,*,*": {POS: PART},
    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
    "助動詞,*,*,*": {POS: AUX},
    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement

    "接頭辞,*,*,*": {POS: NOUN},
    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
    "接尾辞,名詞的,一般,*": {POS: NOUN},
    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ

    "代名詞,*,*,*": {POS: PRON},
    "動詞,一般,*,*": {POS: VERB},
    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
    "動詞,非自立可能,*,*,AUX": {POS: AUX},
    "動詞,非自立可能,*,*,VERB": {POS: VERB},
    "副詞,*,*,*": {POS: ADV},

    "補助記号,AA,一般,*": {POS: SYM},  # text art
    "補助記号,AA,顔文字,*": {POS: SYM},  # kaomoji
    "補助記号,一般,*,*": {POS: SYM},
    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
    "補助記号,読点,*,*": {POS: PUNCT},  # comma

    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name

    "名詞,助動詞語幹,*,*": {POS: AUX},
    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals

    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},

    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
    "名詞,普通名詞,一般,*": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
    "名詞,普通名詞,副詞可能,*": {POS: NOUN},

    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
    "連体詞,*,*,*,ADJ": {POS: ADJ},
    "連体詞,*,*,*,PRON": {POS: PRON},
    "連体詞,*,*,*,DET": {POS: DET},
}
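As a quick sanity check of how the tagger consumes this map, a small illustrative lookup (the keys are Unidic tags, optionally extended by resolve_pos; POS and the UD symbols come from ..symbols as imported above):

# Illustrative lookups into TAG_MAP.
TAG_MAP["助動詞,*,*,*"]       # -> {POS: AUX}
TAG_MAP["連体詞,*,*,*,DET"]   # -> {POS: DET}, only reachable via resolve_pos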
@@ -80,9 +80,13 @@ def fi_tokenizer():

@pytest.fixture
def ja_tokenizer():
    janome = pytest.importorskip("janome")
    pytest.importorskip("MeCab")
    return Japanese.Defaults.create_tokenizer()


@pytest.fixture
def japanese():
    pytest.importorskip("MeCab")
    return Japanese()


@pytest.fixture
def sv_tokenizer():
spacy/tests/ja/test_tagger.py (new file, 38 lines)

@@ -0,0 +1,38 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

TAGGER_TESTS = [
    ('あれならそこにあるよ',
     (('代名詞,*,*,*', 'PRON'),
      ('助動詞,*,*,*', 'AUX'),
      ('代名詞,*,*,*', 'PRON'),
      ('助詞,格助詞,*,*', 'ADP'),
      ('動詞,非自立可能,*,*', 'VERB'),
      ('助詞,終助詞,*,*', 'PART'))),
    ('このファイルには小さなテストが入っているよ',
     (('連体詞,*,*,*,DET', 'DET'),
      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
      ('助詞,格助詞,*,*', 'ADP'),
      ('助詞,係助詞,*,*', 'ADP'),
      ('連体詞,*,*,*,ADJ', 'ADJ'),
      ('名詞,普通名詞,サ変可能,*', 'NOUN'),
      ('助詞,格助詞,*,*', 'ADP'),
      ('動詞,一般,*,*', 'VERB'),
      ('助詞,接続助詞,*,*', 'SCONJ'),
      ('動詞,非自立可能,*,*', 'VERB'),
      ('助詞,終助詞,*,*', 'PART'))),
    ('プププランドに行きたい',
     (('名詞,普通名詞,一般,*', 'NOUN'),
      ('助詞,格助詞,*,*', 'ADP'),
      ('動詞,非自立可能,*,*', 'VERB'),
      ('助動詞,*,*,*', 'AUX')))
]

@pytest.mark.parametrize('text,expected_tags', TAGGER_TESTS)
def test_japanese_tagger(japanese, text, expected_tags):
    tokens = japanese.make_doc(text)
    assert len(tokens) == len(expected_tags)
    for token, res in zip(tokens, expected_tags):
        assert token.tag_ == res[0] and token.pos_ == res[1]
@@ -4,10 +4,10 @@ from __future__ import unicode_literals

import pytest

TOKENIZER_TESTS = [
    ("日本語だよ", ['日本語', 'だ', 'よ']),
    ("日本語だよ", ['日本', '語', 'だ', 'よ']),
    ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']),
    ("吾輩は猫である。", ['吾輩', 'は', '猫', 'で', 'ある', '。']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お仕置き', 'よ', '!']),
    ("月に代わって、お仕置きよ!", ['月', 'に', '代わっ', 'て', '、', 'お', '仕置き', 'よ', '!']),
    ("すもももももももものうち", ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'])
]