Handle unknown tags in KoreanTokenizer tag map (#10536)

This commit is contained in:
Adriane Boyd 2022-03-24 11:25:36 +01:00 committed by GitHub
parent c17980e535
commit e908a67829
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 2 deletions

View File

@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
from ...language import Language, BaseDefaults
from ...tokens import Doc
from ...scorer import Scorer
from ...symbols import POS
from ...symbols import POS, X
from ...training import validate_examples
from ...util import DummyTokenizer, registry, load_config_from_str
from ...vocab import Vocab
@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc

View File

@ -49,6 +49,12 @@ def test_ko_empty_doc(ko_tokenizer):
assert len(tokens) == 0
@pytest.mark.issue(10535)
def test_ko_tokenizer_unknown_tag(ko_tokenizer):
    """Regression test for issue #10535.

    Tokenizes a two-word Korean phrase and checks that the second token
    ends up with the coarse-grained part-of-speech "X".
    """
    # NOTE(review): presumably the second token receives a tag that is
    # missing from the Korean tag map, hence the "X" fallback - confirm
    # against the tokenizer backend's output for this phrase.
    doc = ko_tokenizer("미닛 리피터")
    second_token = doc[1]
    assert second_token.pos_ == "X"
# fmt: off
SPACY_TOKENIZER_TESTS = [
("있다.", "있다 ."),