diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index 2f85406c0..26e39a593 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -16,6 +16,8 @@ from collections import namedtuple
 
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
 
+DETAILS_KEY = 'mecab_details'
+
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
 
@@ -34,8 +36,12 @@ class JapaneseTokenizer(object):
         self.tokenizer = MeCab.Tagger()
 
     def __call__(self, text):
-        words = [x.surface for x in detailed_tokens(self.tokenizer, text)]
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        dtokens = detailed_tokens(self.tokenizer, text)
+        words = [x.surface for x in dtokens]
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        # stash the detailed tokens for the tagger to use
+        doc.user_data[DETAILS_KEY] = dtokens
+        return doc
 
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -91,7 +97,7 @@ class JapaneseTagger(object):
 
         # 1. get raw JP tags
        # 2. add features to tags as necessary for UD
-        dtokens = detailed_tokens(self.tokenizer, tokens.text)
+        dtokens = tokens.user_data[DETAILS_KEY]
         rawtags = list(map(resolve_pos, dtokens))
         self.tagger.tag_from_strings(tokens, rawtags)
 
@@ -112,8 +118,7 @@ class Japanese(Language):
     Defaults = JapaneseDefaults
 
     def make_doc(self, text):
-        words = [str(t) for t in self.tokenizer(text)]
-        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        jdoc = self.tokenizer(text)
         tagger = JapaneseDefaults.create_tagger(self.tokenizer)
-        tagger(doc)
-        return doc
+        tagger(jdoc)
+        return jdoc
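
The net effect of this patch is a handoff: the tokenizer runs MeCab once, stashes its detailed analysis on `doc.user_data`, and the tagger reads it back rather than calling `detailed_tokens` again on `tokens.text`, which duplicated work and could drift if the joined text re-tokenized differently. Below is a minimal, dependency-free sketch of that handoff; `FakeDoc`, `tokenize`, and `tag` are hypothetical stand-ins for spaCy's `Doc`, `JapaneseTokenizer`, and `JapaneseTagger`, while `ShortUnitWord` and `DETAILS_KEY` mirror the diff.

```python
from collections import namedtuple

# These two names mirror the definitions in the diff.
ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
DETAILS_KEY = 'mecab_details'

class FakeDoc(object):
    """Hypothetical stand-in for spacy.tokens.Doc: words plus a user_data dict."""
    def __init__(self, words):
        self.words = words
        self.user_data = {}

def tokenize(text):
    """Stand-in for JapaneseTokenizer.__call__, faking MeCab output."""
    # Pretend MeCab analysis: one detailed token per whitespace-separated chunk.
    dtokens = [ShortUnitWord(w, w, 'NOUN') for w in text.split()]
    doc = FakeDoc([t.surface for t in dtokens])
    # The tokenizer stashes its detailed analysis on the Doc...
    doc.user_data[DETAILS_KEY] = dtokens
    return doc

def tag(doc):
    """Stand-in for JapaneseTagger.__call__: reads the stash, no re-tokenizing."""
    # ...and the tagger reads it back instead of re-running MeCab.
    dtokens = doc.user_data[DETAILS_KEY]
    return [t.part_of_speech for t in dtokens]

doc = tokenize('spam eggs')
assert tag(doc) == ['NOUN', 'NOUN']
```

The change to `Japanese.make_doc` follows the same logic: it previously rebuilt a fresh `Doc` from `str(t)` word strings, which dropped the stashed MeCab detail entirely; returning the tokenizer's own `Doc` guarantees the tagger sees exactly the analysis that produced the tokens.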