Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-25 01:16:28 +03:00)
Don't set extension attribute in Japanese (closes #3398)
This commit is contained in:
Parent: 72fb324d95
Commit: 2912ddc9a6
|
@@ -8,16 +8,13 @@ from .stop_words import STOP_WORDS
|
|||
from .tag_map import TAG_MAP
|
||||
from ...attrs import LANG
|
||||
from ...language import Language
|
||||
from ...tokens import Doc, Token
|
||||
from ...tokens import Doc
|
||||
from ...compat import copy_reg
|
||||
from ...util import DummyTokenizer
|
||||
|
||||
|
||||
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
|
||||
|
||||
# TODO: Is this the right place for this?
|
||||
Token.set_extension("mecab_tag", default=None)
|
||||
|
||||
|
||||
def try_mecab_import():
|
||||
"""Mecab is required for Japanese support, so check for it.
|
||||
|
@@ -82,10 +79,12 @@ class JapaneseTokenizer(DummyTokenizer):
|
|||
words = [x.surface for x in dtokens]
|
||||
spaces = [False] * len(words)
|
||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||
mecab_tags = []
|
||||
for token, dtoken in zip(doc, dtokens):
|
||||
token._.mecab_tag = dtoken.pos
|
||||
mecab_tags.append(dtoken.pos)
|
||||
token.tag_ = resolve_pos(dtoken)
|
||||
token.lemma_ = dtoken.lemma
|
||||
doc.user_data["mecab_tags"] = mecab_tags
|
||||
return doc
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user