spaCy/spacy/ja/__init__.py
Paul O'Leary McCann 6e9e686568 Sample implementation of Japanese Tagger (ref #1214)
This is far from complete, but it should be enough to check some things.

1. MeCab transition. Janome doesn't support Unidic, only IPAdic, but the UD
tag mappings are based on Unidic, so this swaps Janome out for MeCab to
get around that.

2. Raw tag extension. A simple tag map can't meet the specifications for
the UD tag mappings, so this adds an extra field to the tag in ambiguous
cases. For this demo it just handles the simplest case, which only needs
to look at the literal token; see the sketch after this list. (In reality
it may be necessary to look at the whole sentence, but that's another issue.)

3. General code structure. It seems nobody else has implemented a custom
Tagger yet, so I'm still not sure this is the correct way to pass the
vocabulary around, for example.
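
To illustrate point 2, here is a minimal sketch of how the extended tags
could line up with flat tag-map entries (this fragment is hypothetical,
not the actual TAG_MAP from language_data):

    # The ambiguous Unidic tag 連体詞 is split by the appended UD POS,
    # so a plain string lookup still works.
    TAG_MAP_SKETCH = {
        '連体詞,*,*,*,DET': {'pos': 'DET'},  # この, その, あの, ...
        '連体詞,*,*,*,ADJ': {'pos': 'ADJ'},  # other adnominals
    }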

Any feedback would be greatly appreciated. -POLM
2017-08-08 01:27:15 +09:00


# encoding: utf8
from __future__ import unicode_literals, print_function
from os import path
from ..language import Language, BaseDefaults
from ..tokenizer import Tokenizer
from ..tagger import Tagger
from ..attrs import LANG
from ..tokens import Doc
from .language_data import *
import re
from collections import namedtuple

ShortUnitWord = namedtuple('ShortUnitWord',
                           ['surface', 'base_form', 'part_of_speech'])


class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            import MeCab
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the MeCab library: "
                              "https://github.com/SamuraiT/mecab-python3")
        self.tokenizer = MeCab.Tagger()

    def __call__(self, text):
        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))


def resolve_pos(token):
    """If necessary, add a field to the POS tag for UD mapping.

    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """
    # NOTE: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.
    if token.part_of_speech == '連体詞,*,*,*':
        # Determiner-like adnominals (この, その, あの, どの, ...) get DET;
        # all other adnominals get ADJ.
        if re.match('^[こそあど此其彼]の', token.surface):
            return token.part_of_speech + ',DET'
        else:
            return token.part_of_speech + ',ADJ'
    return token.part_of_speech


def detailed_tokens(tokenizer, text):
    """Format MeCab output into a nice data structure, based on Janome."""
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
    while node.posid != 0:
        # The feature string is a CSV: the first four fields are the POS
        # hierarchy, and later fields include the reading, lemma, and
        # surface form (see the note below this function).
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])
        reading = parts[6]
        base = parts[7]
        surface = parts[8]
        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words
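
# For reference, with a Unidic dictionary a feature string looks roughly
# like the following (an assumed example, not captured output):
#
#     名詞,普通名詞,一般,*,*,*,ホン,本,本,ホン
#
# i.e. parts[0:4] are the POS hierarchy and parts[6:9] are the reading,
# lemma, and surface form used above.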


class JapaneseTagger(object):
    def __init__(self, vocab):
        try:
            import MeCab
        except ImportError:
            raise ImportError("The Japanese tagger requires the MeCab library: "
                              "https://github.com/SamuraiT/mecab-python3")
        self.tagger = Tagger(vocab)
        self.tokenizer = MeCab.Tagger()

    def __call__(self, tokens):
        # Two parts to this:
        # 1. get raw JP tags
        # 2. add features to tags as necessary for UD
        # TODO: if the text has been tokenized, this info is already available.
        # How to set the data when tokenizing, or save it for the tagger to find?
        dtokens = detailed_tokens(self.tokenizer, tokens.text)
        rawtags = list(map(resolve_pos, dtokens))
        self.tagger.tag_from_strings(tokens, rawtags)


class JapaneseDefaults(BaseDefaults):
    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)

    @classmethod
    def create_tagger(cls, tokenizer):
        return JapaneseTagger(tokenizer.vocab)


class Japanese(Language):
    lang = 'ja'
    Defaults = JapaneseDefaults

    def make_doc(self, text):
        words = [str(t) for t in self.tokenizer(text)]
        doc = Doc(self.vocab, words=words, spaces=[False] * len(words))
        tagger = JapaneseDefaults.create_tagger(self.tokenizer)
        tagger(doc)
        return doc
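
# Usage sketch (assumes MeCab and a Unidic dictionary are installed; this
# snippet is illustrative and not exercised by the module itself):
#
#     nlp = Japanese()
#     doc = nlp.make_doc('この本は面白い')
#     for token in doc:
#         print(token.text, token.tag_)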