Commit 6e9e686568
This is far from complete, but it should be enough to check some things.

1. MeCab transition. Janome doesn't support Unidic, only IPAdic, but the UD tag mappings are based on Unidic. This switches out Janome for MeCab to get around that.
2. Raw tag extension. A simple tag map can't meet the specifications for UD tag mappings, so this adds an extra field to ambiguous cases. For this demo it just deals with the simplest case, which only needs to look at the literal token. (In reality it may be necessary to look at the whole sentence, but that's another issue.)
3. General code structure. It seems nobody else has implemented a custom Tagger yet, so I'm still not sure this is the correct way to pass the vocabulary around, for example.

Any feedback would be greatly appreciated. -POLM
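To make point 2 concrete: the extra field means TAG_MAP can key on the extended tag string rather than the bare Unidic POS. A minimal sketch of what such entries might look like (language_data.py is not part of this file view, so these entries are illustrative only and the real contents may differ):

    # Hypothetical TAG_MAP entries keyed on the extended tags produced by
    # resolve_pos below; the actual language_data.py may differ.
    from ..symbols import POS, NOUN, DET, ADJ

    TAG_MAP = {
        # unambiguous Unidic tag, mapped directly
        '名詞,普通名詞,一般,*': {POS: NOUN},
        # ambiguous adnominal tag, split on the field appended by resolve_pos
        '連体詞,*,*,*,DET': {POS: DET},
        '連体詞,*,*,*,ADJ': {POS: ADJ},
    }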
116 lines · 3.7 KiB · Python
# encoding: utf8
from __future__ import unicode_literals, print_function

from os import path

from ..language import Language, BaseDefaults
from ..tokenizer import Tokenizer
from ..tagger import Tagger
from ..attrs import LANG
from ..tokens import Doc

from .language_data import *

import re
from collections import namedtuple

ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])

class JapaneseTokenizer(object):
    def __init__(self, cls, nlp=None):
        self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        try:
            import MeCab
        except ImportError:
            raise ImportError("The Japanese tokenizer requires the MeCab library: "
                              "https://github.com/SamuraiT/mecab-python3")
        self.tokenizer = MeCab.Tagger()

    def __call__(self, text):
        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
        return Doc(self.vocab, words=words, spaces=[False] * len(words))

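# Example (illustrative): with MeCab-style segmentation, the classic
# sentence 'すもももももももものうち' would come back as the words
# ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち'], with spaces=False for
# every token since Japanese text has no whitespace between words.
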
def resolve_pos(token):
    """If necessary, add a field to the POS tag for UD mapping.

    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function adds information to the POS tag to
    resolve ambiguous mappings.
    """

    # NOTE: This is a first take. The rules here are crude approximations.
    # For many of these, full dependencies are needed to properly resolve
    # PoS mappings.

    if token.part_of_speech == '連体詞,*,*,*':
        # determiner-likes get DET, otherwise ADJ
        if re.match('^[こそあど此其彼]の', token.surface):
            return token.part_of_speech + ',DET'
        else:
            return token.part_of_speech + ',ADJ'
    return token.part_of_speech

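# Example (illustrative): under this rule, この ('this'), tagged 連体詞,*,*,*,
# resolves to '連体詞,*,*,*,DET', while 大きな ('big'), which carries the
# same Unidic POS, resolves to '連体詞,*,*,*,ADJ'. TAG_MAP can then map the
# two extended strings to different UD tags.
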
def detailed_tokens(tokenizer, text):
    """Format MeCab output into a nice data structure, based on Janome."""
    node = tokenizer.parseToNode(text)
    node = node.next  # first node is beginning of sentence and empty, skip it
    words = []
    while node.posid != 0:
        # Unidic feature fields: the first four are the POS, then
        # inflection info, then the reading, base form, and surface.
        parts = node.feature.split(',')
        pos = ','.join(parts[0:4])
        reading = parts[6]
        base = parts[7]
        surface = parts[8]
        words.append(ShortUnitWord(surface, base, pos))
        node = node.next
    return words

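# Example (illustrative; the exact fields depend on the dictionary): with a
# Unidic dictionary, node.feature for 犬 might look like
# '名詞,普通名詞,一般,*,*,*,イヌ,犬,犬,...', so parts[0:4] is the POS,
# parts[6] the reading, parts[7] the base form, and parts[8] the surface.
# IPAdic uses a different field order, which is why Unidic is assumed here.
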
class JapaneseTagger(object):
    def __init__(self, vocab):
        try:
            import MeCab
        except ImportError:
            raise ImportError("The Japanese tagger requires the MeCab library: "
                              "https://github.com/SamuraiT/mecab-python3")
        self.tagger = Tagger(vocab)
        self.tokenizer = MeCab.Tagger()

    def __call__(self, tokens):
        # Two parts to this:
        # 1. get raw JP tags
        # 2. add features to tags as necessary for UD

        # TODO: if the text has been tokenized, this info is already available.
        # How to set the data when tokenizing, or save it for the tagger to find?
        dtokens = detailed_tokens(self.tokenizer, tokens.text)
        rawtags = list(map(resolve_pos, dtokens))
        self.tagger.tag_from_strings(tokens, rawtags)

class JapaneseDefaults(BaseDefaults):
    tag_map = TAG_MAP

    @classmethod
    def create_tokenizer(cls, nlp=None):
        return JapaneseTokenizer(cls, nlp)

    @classmethod
    def create_tagger(cls, tokenizer):
        return JapaneseTagger(tokenizer.vocab)

class Japanese(Language):
    lang = 'ja'

    Defaults = JapaneseDefaults

    def make_doc(self, text):
        words = [str(t) for t in self.tokenizer(text)]
        doc = Doc(self.vocab, words=words, spaces=[False] * len(words))
        # NOTE: this builds a fresh tagger (and MeCab instance) on every
        # call; caching it would avoid reloading MeCab each time.
        tagger = JapaneseDefaults.create_tagger(self.tokenizer)
        tagger(doc)
        return doc
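A quick usage sketch for review (assumes mecab-python3 with a Unidic dictionary is installed, and that TAG_MAP covers the tags in the text; tags outside the map would fail to look up):

    from spacy.ja import Japanese

    nlp = Japanese()
    doc = nlp.make_doc('このペンは青い。')
    for token in doc:
        print(token.text, token.tag_)

Here この should come back with the ',DET' extension from resolve_pos, so the DET/ADJ split in point 2 of the commit message can be checked directly from the output.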