mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 04:08:09 +03:00
Merge pull request #1420 from polm/master
[ja] Stash tokenizer output for speed
This commit is contained in:
commit
6b0121091c
|
@ -16,6 +16,8 @@ from collections import namedtuple
|
|||
|
||||
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
|
||||
|
||||
DETAILS_KEY = 'mecab_details'
|
||||
|
||||
def try_mecab_import():
|
||||
"""Mecab is required for Japanese support, so check for it.
|
||||
|
||||
|
@ -34,8 +36,12 @@ class JapaneseTokenizer(object):
|
|||
self.tokenizer = MeCab.Tagger()
|
||||
|
||||
def __call__(self, text):
|
||||
words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
|
||||
return Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
dtokens = detailed_tokens(self.tokenizer, text)
|
||||
words = [x.surface for x in dtokens]
|
||||
doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
# stash details tokens for tagger to use
|
||||
doc.user_data[DETAILS_KEY] = dtokens
|
||||
return doc
|
||||
|
||||
def resolve_pos(token):
|
||||
"""If necessary, add a field to the POS tag for UD mapping.
|
||||
|
@ -91,7 +97,7 @@ class JapaneseTagger(object):
|
|||
# 1. get raw JP tags
|
||||
# 2. add features to tags as necessary for UD
|
||||
|
||||
dtokens = detailed_tokens(self.tokenizer, tokens.text)
|
||||
dtokens = tokens.user_data[DETAILS_KEY]
|
||||
rawtags = list(map(resolve_pos, dtokens))
|
||||
self.tagger.tag_from_strings(tokens, rawtags)
|
||||
|
||||
|
@ -112,8 +118,7 @@ class Japanese(Language):
|
|||
Defaults = JapaneseDefaults
|
||||
|
||||
def make_doc(self, text):
|
||||
words = [str(t) for t in self.tokenizer(text)]
|
||||
doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
|
||||
jdoc = self.tokenizer(text)
|
||||
tagger = JapaneseDefaults.create_tagger(self.tokenizer)
|
||||
tagger(doc)
|
||||
return doc
|
||||
tagger(jdoc)
|
||||
return jdoc
|
||||
|
|
Loading…
Reference in New Issue
Block a user