Handle unknown tags in KoreanTokenizer tag map (#10536)

This commit is contained in:
Adriane Boyd 2022-03-24 11:25:36 +01:00 committed by GitHub
parent c17980e535
commit e908a67829
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 2 deletions

View File

@@ -7,7 +7,7 @@ from .lex_attrs import LEX_ATTRS
 from ...language import Language, BaseDefaults
 from ...tokens import Doc
 from ...scorer import Scorer
-from ...symbols import POS
+from ...symbols import POS, X
 from ...training import validate_examples
 from ...util import DummyTokenizer, registry, load_config_from_str
 from ...vocab import Vocab
@@ -57,7 +57,10 @@ class KoreanTokenizer(DummyTokenizer):
         for token, dtoken in zip(doc, dtokens):
             first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
             token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            token.pos = TAG_MAP[token.tag_][POS]
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
             token.lemma_ = dtoken["lemma"]
         doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
         return doc

View File

@@ -49,6 +49,12 @@ def test_ko_empty_doc(ko_tokenizer):
     assert len(tokens) == 0
 
 
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_unknown_tag(ko_tokenizer):
+    tokens = ko_tokenizer("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),