diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py
index b2ec281f7..26e39a593 100644
--- a/spacy/ja/__init__.py
+++ b/spacy/ja/__init__.py
@@ -16,12 +16,7 @@ from collections import namedtuple
 
 ShortUnitWord = namedtuple('ShortUnitWord', ['surface', 'base_form', 'part_of_speech'])
 
-class JapaneseDoc(Doc):
-    def __init__(self, detailed_tokens, vocab, words=None, spaces=None, orths_and_spaces=None):
-        super(JapaneseDoc, self).__init__(vocab, words, spaces, orths_and_spaces)
-        # This saves tokenizer output so mecab doesn't have to be called again
-        # when determining POS tags.
-        self.detailed_tokens = detailed_tokens
+DETAILS_KEY = 'mecab_details'
 
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -43,7 +38,10 @@ class JapaneseTokenizer(object):
     def __call__(self, text):
         dtokens = detailed_tokens(self.tokenizer, text)
         words = [x.surface for x in dtokens]
-        return JapaneseDoc(dtokens, self.vocab, words=words, spaces=[False]*len(words))
+        doc = Doc(self.vocab, words=words, spaces=[False]*len(words))
+        # stash details tokens for tagger to use
+        doc.user_data[DETAILS_KEY] = dtokens
+        return doc
 
 def resolve_pos(token):
     """If necessary, add a field to the POS tag for UD mapping.
@@ -99,7 +97,7 @@ class JapaneseTagger(object):
         # 1. get raw JP tags
         # 2. add features to tags as necessary for UD
 
-        dtokens = tokens.detailed_tokens
+        dtokens = tokens.user_data[DETAILS_KEY]
         rawtags = list(map(resolve_pos, dtokens))
         self.tagger.tag_from_strings(tokens, rawtags)
 