mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Don't set extension attribute in Japanese (closes #3398)
This commit is contained in:
parent
72fb324d95
commit
2912ddc9a6
|
@@ -8,16 +8,13 @@ from .stop_words import STOP_WORDS
|
||||||
from .tag_map import TAG_MAP
|
from .tag_map import TAG_MAP
|
||||||
from ...attrs import LANG
|
from ...attrs import LANG
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...tokens import Doc, Token
|
from ...tokens import Doc
|
||||||
from ...compat import copy_reg
|
from ...compat import copy_reg
|
||||||
from ...util import DummyTokenizer
|
from ...util import DummyTokenizer
|
||||||
|
|
||||||
|
|
||||||
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
|
ShortUnitWord = namedtuple("ShortUnitWord", ["surface", "lemma", "pos"])
|
||||||
|
|
||||||
# TODO: Is this the right place for this?
|
|
||||||
Token.set_extension("mecab_tag", default=None)
|
|
||||||
|
|
||||||
|
|
||||||
def try_mecab_import():
|
def try_mecab_import():
|
||||||
"""Mecab is required for Japanese support, so check for it.
|
"""Mecab is required for Japanese support, so check for it.
|
||||||
|
@@ -82,10 +79,12 @@ class JapaneseTokenizer(DummyTokenizer):
|
||||||
words = [x.surface for x in dtokens]
|
words = [x.surface for x in dtokens]
|
||||||
spaces = [False] * len(words)
|
spaces = [False] * len(words)
|
||||||
doc = Doc(self.vocab, words=words, spaces=spaces)
|
doc = Doc(self.vocab, words=words, spaces=spaces)
|
||||||
|
mecab_tags = []
|
||||||
for token, dtoken in zip(doc, dtokens):
|
for token, dtoken in zip(doc, dtokens):
|
||||||
token._.mecab_tag = dtoken.pos
|
mecab_tags.append(dtoken.pos)
|
||||||
token.tag_ = resolve_pos(dtoken)
|
token.tag_ = resolve_pos(dtoken)
|
||||||
token.lemma_ = dtoken.lemma
|
token.lemma_ = dtoken.lemma
|
||||||
|
doc.user_data["mecab_tags"] = mecab_tags
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user