Merge pull request #1420 from polm/master

[ja] Stash tokenizer output for speed
This commit is contained in:
Matthew Honnibal 2017-10-16 10:28:22 +02:00 committed by GitHub
commit 6b0121091c

View File

@ -16,6 +16,8 @@ from collections import namedtuple
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech']) ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
DETAILS_KEY = 'mecab_details'
def try_mecab_import(): def try_mecab_import():
"""Mecab is required for Japanese support, so check for it. """Mecab is required for Japanese support, so check for it.
@ -34,8 +36,12 @@ class JapaneseTokenizer(object):
self.tokenizer = MeCab.Tagger() self.tokenizer = MeCab.Tagger()
def __call__(self, text): def __call__(self, text):
words = [x.surface for x in detailed_tokens(self.tokenizer, text)] dtokens = detailed_tokens(self.tokenizer, text)
return Doc(self.vocab, words=words, spaces=[False]*len(words)) words = [x.surface for x in dtokens]
doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
# stash details tokens for tagger to use
doc.user_data[DETAILS_KEY] = dtokens
return doc
def resolve_pos(token): def resolve_pos(token):
"""If necessary, add a field to the POS tag for UD mapping. """If necessary, add a field to the POS tag for UD mapping.
@ -91,7 +97,7 @@ class JapaneseTagger(object):
# 1. get raw JP tags # 1. get raw JP tags
# 2. add features to tags as necessary for UD # 2. add features to tags as necessary for UD
dtokens = detailed_tokens(self.tokenizer, tokens.text) dtokens = tokens.user_data[DETAILS_KEY]
rawtags = list(map(resolve_pos, dtokens)) rawtags = list(map(resolve_pos, dtokens))
self.tagger.tag_from_strings(tokens, rawtags) self.tagger.tag_from_strings(tokens, rawtags)
@ -112,8 +118,7 @@ class Japanese(Language):
Defaults = JapaneseDefaults Defaults = JapaneseDefaults
def make_doc(self, text): def make_doc(self, text):
words = [str(t) for t in self.tokenizer(text)] jdoc = self.tokenizer(text)
doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
tagger = JapaneseDefaults.create_tagger(self.tokenizer) tagger = JapaneseDefaults.create_tagger(self.tokenizer)
tagger(doc) tagger(jdoc)
return doc return jdoc