Mirror of https://github.com/explosion/spaCy.git (synced 2025-02-24 15:47:33 +03:00)
[ja] Stash tokenizer output for speed
Before this commit, the MeCab tokenizer had to be called twice when creating a Doc: once during tokenization and once during tagging. This commit creates a JapaneseDoc wrapper class for Doc that stashes the parsed tokenizer output to remove the redundant processing. -POLM
This commit is contained in:
parent a31d33be06
commit 43eedf73f2
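The idea in miniature: the tokenizer's detailed output is stashed on the doc it returns, so the tagger can read the cached analysis back instead of invoking MeCab a second time. Below is a minimal, self-contained Python sketch of that pattern; the Doc stand-in, the whitespace-based detailed_tokens, and the tokenize/tag helpers are illustrative substitutes for spaCy's Doc, the MeCab pass, and the JapaneseTokenizer/JapaneseTagger classes in the diff that follows, not the real implementations.

    from collections import namedtuple

    ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])

    class Doc(object):
        # Stand-in for spacy.tokens.Doc; only stores the surface words.
        def __init__(self, words):
            self.words = words

    class JapaneseDoc(Doc):
        def __init__(self, detailed_tokens, words):
            super(JapaneseDoc, self).__init__(words)
            # Stash the tokenizer output so tagging doesn't re-run the tokenizer.
            self.detailed_tokens = detailed_tokens

    def detailed_tokens(text):
        # Stand-in for the MeCab pass: one expensive analysis per text.
        return [ShortUnitWord(w, w.lower(), 'NOUN') for w in text.split()]

    def tokenize(text):
        dtokens = detailed_tokens(text)  # the expensive pass runs exactly once
        return JapaneseDoc(dtokens, [t.surface for t in dtokens])

    def tag(doc):
        # The tagger reads the stashed analysis instead of re-tokenizing.
        return [t.part_of_speech for t in doc.detailed_tokens]

    doc = tokenize('stash once tag later')
    assert tag(doc) == ['NOUN'] * 4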
@@ -16,6 +16,13 @@ from collections import namedtuple
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
 
+class JapaneseDoc(Doc):
+    def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None):
+        super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces)
+        # This saves tokenizer output so mecab doesn't have to be called again
+        # when determining POS tags.
+        self.detailed_tokens = detailed_tokens
+
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -34,8 +41,9 @@ class JapaneseTokenizer(object):
         self.tokenizer = MeCab.Tagger()
 
     def __call__(self, text):
-        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words))
 
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -91,7 +99,7 @@ class JapaneseTagger(object):
         # 1. get raw JP tags
         # 2. add features to tags as necessary for UD
-        dtokens = detailed_tokens(self.tokenizer, tokens.text)
+        dtokens = tokens.detailed_tokens
         rawtags = list(map(resolve_pos, dtokens))
         self.tagger.tag_from_strings(tokens, rawtags)
@@ -112,8 +120,7 @@ class Japanese(Language):
     Defaults = JapaneseDefaults
 
     def make_doc(self, text):
-        words = [str(t) for t in self.tokenizer(text)]
-        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        jdoc = self.tokenizer(text)
         tagger = JapaneseDefaults.create_tagger(self.tokenizer)
-        tagger(doc)
-        return doc
+        tagger(jdoc)
+        return jdoc
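For context, hypothetical usage of the patched pipeline (this assumes mecab-python3 and a MeCab dictionary are installed; the import path for the Japanese class is an assumption here and has varied across spaCy versions):

    from spacy.ja import Japanese  # assumed path; may differ by spaCy version

    nlp = Japanese()
    doc = nlp.make_doc('日本語の文章です。')  # MeCab runs once, inside the tokenizer
    for token in doc:
        print(token.text, token.tag_)  # tags were assigned from the stashed analysis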