Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
Handle unknown tags in KoreanTokenizer tag map (#10536)
This commit is contained in:
parent c17980e535
commit e908a67829
@@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab
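
The only import change above pulls in X, the Universal POS tag for "other", which the tokenizer uses below as its fallback. A tiny illustrative snippet (not part of the commit) showing the symbol round-trip that the new test relies on:

    from spacy.symbols import X
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # spacy.symbols.X is an integer symbol id; assigning it to token.pos
    # makes token.pos_ read back as the string "X".
    doc = Doc(Vocab(), words=["리피터"])
    doc[0].pos = X
    assert doc[0].pos_ == "X"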
@@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc
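
For readers skimming the hunk (presumably in spacy/lang/ko/__init__.py): before this change, a MeCab tag missing from TAG_MAP raised a KeyError during tokenization; now the coarse POS falls back to X. A minimal standalone sketch of the pattern, using a hypothetical one-entry tag map (the real Korean TAG_MAP is much larger):

    from spacy.symbols import NOUN, POS, X

    # Hypothetical, simplified tag map: fine-grained tag -> {POS: coarse tag}.
    TAG_MAP = {"NNG": {POS: NOUN}}

    def coarse_pos(tag: str) -> int:
        # Mirrors the patched lookup: unknown tags fall back to X
        # instead of raising KeyError.
        if tag in TAG_MAP:
            return TAG_MAP[tag][POS]
        return X

    assert coarse_pos("NNG") == NOUN
    assert coarse_pos("UNKNOWN-TAG") == X

An equivalent one-liner would be TAG_MAP.get(tag, {POS: X})[POS], but the explicit branch used in the commit keeps the intent obvious in the hot loop.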
@@ -49,6 +49,12 @@ def test_ko_empty_doc(ko_tokenizer):
     assert len(tokens) == 0
 
 
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_unknown_tag(ko_tokenizer):
+    tokens = ko_tokenizer("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
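
The regression test feeds the loanword "미닛 리피터" ("minute repeater"), whose second token receives a tag outside the tag map, and checks that it surfaces as X rather than crashing. An end-to-end check along the same lines (assuming the Korean tokenizer's mecab-ko dependency, natto-py, is installed):

    import spacy

    # spacy.blank("ko") uses KoreanTokenizer, which requires a working
    # mecab-ko installation via natto-py.
    nlp = spacy.blank("ko")
    doc = nlp("미닛 리피터")
    print([(t.text, t.tag_, t.pos_) for t in doc])
    # With this commit applied, the unknown-tagged token reports pos_ == "X"
    # instead of the tokenizer raising a KeyError.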