Don't set extension attribute in Japanese (closes #3398)

Ines Montani 2019-03-12 13:30:33 +01:00
parent 72fb324d95
commit 2912ddc9a6


@@ -8,16 +8,13 @@ from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from ...attrs import LANG
 from ...language import Language
-from ...tokens import Doc, Token
+from ...tokens import Doc
 from ...compat import copy_reg
 from ...util import DummyTokenizer
 
 ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
 
-# TODO: Is this the right place for this?
-Token.set_extension("mecab_tag", default=None)
-
 
 def try_mecab_import():
     """Mecab is required for Japanese support, so check for it.
@@ -82,10 +79,12 @@ class JapaneseTokenizer(DummyTokenizer):
         words = [x.surface for x in dtokens]
         spaces = [False] * len(words)
         doc = Doc(self.vocab, words=words, spaces=spaces)
+        mecab_tags = []
         for token, dtoken in zip(doc, dtokens):
-            token._.mecab_tag = dtoken.pos
+            mecab_tags.append(dtoken.pos)
             token.tag_ = resolve_pos(dtoken)
             token.lemma_ = dtoken.lemma
+        doc.user_data["mecab_tags"] = mecab_tags
         return doc