From 6e9e686568ab1f70d0b517e0d5f2bcbb894eb17a Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 8 Aug 2017 01:27:15 +0900 Subject: [PATCH] Sample implementation of Japanese Tagger (ref #1214) This is far from complete but it should be enough to check some things. 1. Mecab transition. Janome doesn't support Unidic, only IPAdic, but UD tag mappings are based on Unidic. This replaces Janome with Mecab to get around that. 2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.) 3. General code structure. Seems nobody else has implemented a custom Tagger yet, so still not sure this is the correct way to pass the vocabulary around, for example. Any feedback would be greatly appreciated. -POLM --- spacy/ja/__init__.py | 92 +++++++++++++++++++++++++++++---- spacy/ja/tag_map.py | 97 +++++++++++++++++++++++++++++------ spacy/tests/conftest.py | 5 +- spacy/tests/ja/test_tagger.py | 10 ++++ 4 files changed, 177 insertions(+), 27 deletions(-) create mode 100644 spacy/tests/ja/test_tagger.py diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 1c85ded95..5f49f0b1b 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -5,37 +5,111 @@ from os import path from ..language import Language, BaseDefaults from ..tokenizer import Tokenizer +from ..tagger import Tagger from ..attrs import LANG from ..tokens import Doc from .language_data import * +import re +from collections import namedtuple + +ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) + class JapaneseTokenizer(object): def __init__(self, cls, nlp=None): self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) try: - from janome.tokenizer import Tokenizer + import MeCab except 
ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: " - "https://github.com/mocobeta/janome") - self.tokenizer = Tokenizer() + raise ImportError("The Japanese tokenizer requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + self.tokenizer = MeCab.Tagger() def __call__(self, text): - words = [x.surface for x in self.tokenizer.tokenize(text)] + words = [x.surface for x in detailed_tokens(self.tokenizer, text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) +def resolve_pos(token): + """If necessary, add a field to the POS tag for UD mapping. + + Under Universal Dependencies, sometimes the same Unidic POS tag can + be mapped differently depending on the literal token or its context + in the sentence. This function adds information to the POS tag to + resolve ambiguous mappings. + """ + + # NOTE: This is a first take. The rules here are crude approximations. + # For many of these, full dependencies are needed to properly resolve + # PoS mappings. 
+ + if token.part_of_speech == '連体詞,*,*,*': + # determiner-likes get DET, otherwise ADJ + if re.match('^[こそあど此其彼]の', token.surface): + return token.part_of_speech + ',DET' + else: + return token.part_of_speech + ',ADJ' + return token.part_of_speech + +def detailed_tokens(tokenizer, text): + """Format Mecab output into a nice data structure, based on Janome.""" + + node = tokenizer.parseToNode(text) + node = node.next # first node is beginning of sentence and empty, skip it + words = [] + while node.posid != 0: + parts = node.feature.split(',') + pos = ','.join(parts[0:4]) + reading = parts[6] + base = parts[7] + surface = parts[8] + + words.append( ShortUnitWord(surface, base, pos) ) + node = node.next + return words + +class JapaneseTagger(object): + def __init__(self, vocab): + try: + import MeCab + except ImportError: + raise ImportError("The Japanese tagger requires the MeCab library: " + "https://github.com/SamuraiT/mecab-python3") + + self.tagger = Tagger(vocab) + self.tokenizer = MeCab.Tagger() + + def __call__(self, tokens): + # two parts to this: + # 1. get raw JP tags + # 2. add features to tags as necessary for UD + + # TODO: if the text has been tokenized, this info is already available + # How to set the data when tokenizing or save it for the tagger to find? 
+ + dtokens = detailed_tokens(self.tokenizer, tokens.text) + rawtags = list(map(resolve_pos, dtokens)) + self.tagger.tag_from_strings(tokens, rawtags) + class JapaneseDefaults(BaseDefaults): + tag_map = TAG_MAP + @classmethod def create_tokenizer(cls, nlp=None): return JapaneseTokenizer(cls, nlp) + @classmethod + def create_tagger(cls, tokenizer): + return JapaneseTagger(tokenizer.vocab) + class Japanese(Language): lang = 'ja' Defaults = JapaneseDefaults def make_doc(self, text): - words = self.tokenizer(text) - return Doc(self.vocab, words=words, spaces=[False]*len(words)) - - + words = [str(t) for t in self.tokenizer(text)] + doc = Doc(self.vocab, words=words, spaces=[False]*len(words)) + tagger = JapaneseDefaults.create_tagger(self.tokenizer) + tagger(doc) + return doc diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index f5b6b5040..609739c2f 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -3,22 +3,85 @@ from __future__ import unicode_literals from ..symbols import * - TAG_MAP = { - "ADV": {POS: ADV}, - "NOUN": {POS: NOUN}, - "ADP": {POS: ADP}, - "PRON": {POS: PRON}, - "SCONJ": {POS: SCONJ}, - "PROPN": {POS: PROPN}, - "DET": {POS: DET}, - "SYM": {POS: SYM}, - "INTJ": {POS: INTJ}, - "PUNCT": {POS: PUNCT}, - "NUM": {POS: NUM}, - "AUX": {POS: AUX}, - "X": {POS: X}, - "CONJ": {POS: CONJ}, - "ADJ": {POS: ADJ}, - "VERB": {POS: VERB} + # Explanation of Unidic tags: + # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf + + # Universal Dependencies Mapping: + # http://universaldependencies.org/ja/overview/morphology.html + # http://universaldependencies.org/ja/pos/all.html + + "記号,一般,*,*":{POS: PUNCT}, # this includes characters used to represent sounds like ドレミ + "記号,文字,*,*":{POS: PUNCT}, # this is for Greek and Latin characters used as symbols, as in math + + "感動詞,フィラー,*,*": {POS: INTJ}, + "感動詞,一般,*,*": {POS: INTJ}, + + # this is specifically for unicode full-width space + "空白,*,*,*": {POS: X}, + + "形状詞,一般,*,*":{POS: ADJ}, +
"形状詞,タリ,*,*":{POS: ADJ}, + "形状詞,助動詞語幹,*,*":{POS: ADJ}, + "形容詞,一般,*,*":{POS: ADJ}, + "形容詞,非自立可能,*,*":{POS: AUX}, # XXX ADJ if alone, AUX otherwise + + "助詞,格助詞,*,*":{POS: ADP}, + "助詞,係助詞,*,*":{POS: ADP}, + "助詞,終助詞,*,*":{POS: PART}, + "助詞,準体助詞,*,*":{POS: SCONJ}, # の as in 走るのが速い + "助詞,接続助詞,*,*":{POS: SCONJ}, # verb ending て + "助詞,副助詞,*,*":{POS: PART}, # ばかり, つつ after a verb + "助動詞,*,*,*":{POS: AUX}, + "接続詞,*,*,*":{POS: SCONJ}, # XXX: might need refinement + + "接頭辞,*,*,*":{POS: NOUN}, + "接尾辞,形状詞的,*,*":{POS: ADJ}, # がち, チック + "接尾辞,形容詞的,*,*":{POS: ADJ}, # -らしい + "接尾辞,動詞的,*,*":{POS: NOUN}, # -じみ + "接尾辞,名詞的,サ変可能,*":{POS: NOUN}, # XXX see 名詞,普通名詞,サ変可能,* + "接尾辞,名詞的,一般,*":{POS: NOUN}, + "接尾辞,名詞的,助数詞,*":{POS: NOUN}, + "接尾辞,名詞的,副詞可能,*":{POS: NOUN}, # -後, -過ぎ + + "代名詞,*,*,*":{POS: PRON}, + "動詞,一般,*,*":{POS: VERB}, + "動詞,非自立可能,*,*":{POS: AUX}, # XXX VERB if alone, AUX otherwise + "動詞,非自立可能,*,*,AUX":{POS: AUX}, + "動詞,非自立可能,*,*,VERB":{POS: VERB}, + "副詞,*,*,*":{POS: ADV}, + + "補助記号,AA,一般,*":{POS: SYM}, # text art + "補助記号,AA,顔文字,*":{POS: SYM}, # kaomoji + "補助記号,一般,*,*":{POS: SYM}, + "補助記号,括弧開,*,*":{POS: PUNCT}, # open bracket + "補助記号,括弧閉,*,*":{POS: PUNCT}, # close bracket + "補助記号,句点,*,*":{POS: PUNCT}, # period or other EOS marker + "補助記号,読点,*,*":{POS: PUNCT}, # comma + + "名詞,固有名詞,一般,*":{POS: PROPN}, # general proper noun + "名詞,固有名詞,人名,一般":{POS: PROPN}, # person's name + "名詞,固有名詞,人名,姓":{POS: PROPN}, # surname + "名詞,固有名詞,人名,名":{POS: PROPN}, # first name + "名詞,固有名詞,地名,一般":{POS: PROPN}, # place name + "名詞,固有名詞,地名,国":{POS: PROPN}, # country name + + "名詞,助動詞語幹,*,*":{POS: AUX}, + "名詞,数詞,*,*":{POS: NUM}, # includes Chinese numerals + + "名詞,普通名詞,サ変可能,*":{POS: NOUN}, # XXX: sometimes VERB in UDv2; suru-verb noun + "名詞,普通名詞,サ変可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,サ変可能,*,VERB":{POS: VERB}, + + "名詞,普通名詞,サ変形状詞可能,*":{POS: NOUN}, # ex: 下手 + "名詞,普通名詞,一般,*":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*":{POS: NOUN}, # XXX: sometimes ADJ in UDv2 + "名詞,普通名詞,形状詞可能,*,NOUN":{POS: NOUN}, + "名詞,普通名詞,形状詞可能,*,ADJ":{POS: ADJ}, 
+ "名詞,普通名詞,助数詞可能,*":{POS: NOUN}, # counter / unit + "名詞,普通名詞,副詞可能,*":{POS: NOUN}, + + "連体詞,*,*,*":{POS: ADJ}, # XXX note この、その etc. should be DET + "連体詞,*,*,*,ADJ":{POS: ADJ}, + "連体詞,*,*,*,DET":{POS: DET}, } diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6e00b1513..52b9bdd57 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -79,9 +79,12 @@ def fi_tokenizer(): @pytest.fixture def ja_tokenizer(): - janome = pytest.importorskip("janome") + pytest.importorskip("MeCab") return Japanese.Defaults.create_tokenizer() +@pytest.fixture +def japanese(): + return Japanese() @pytest.fixture def sv_tokenizer(): diff --git a/spacy/tests/ja/test_tagger.py b/spacy/tests/ja/test_tagger.py new file mode 100644 index 000000000..43259fb49 --- /dev/null +++ b/spacy/tests/ja/test_tagger.py @@ -0,0 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +def test_japanese_tagger(japanese): + doc = japanese.make_doc("このファイルには小さなテストが入っているよ") + # note these both have the same raw tag, '連体詞,*,*,*' + assert doc[0].pos_ == "DET" + assert doc[4].pos_ == "ADJ"