Update Korean defaults for Tokenizer (#10322)

Update the Korean language defaults for the rule-based `Tokenizer` so that its tokenization follows UD Korean Kaist.
Adriane Boyd 2022-02-21 10:26:19 +01:00 committed by GitHub
parent f32ee2e533
commit 30030176ee
4 changed files with 47 additions and 0 deletions
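
For context: spaCy's default Korean tokenizer is the MeCab-based `KoreanTokenizer`, while the rule-based `spacy.Tokenizer.v1` can be selected through the config, as the test fixture below does. The infix defaults added in this commit only affect that rule-based tokenizer. A minimal sketch of the resulting behaviour (assuming spaCy v3.x with this change; the expected split is taken from the tests added below):

```python
import spacy

# Select the rule-based tokenizer instead of the default MeCab-based one.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)

doc = nlp("부(富)는 1982~1983.")
print([t.text for t in doc])
# expected: ['부', '(', '富', ')', '는', '1982', '~', '1983', '.']
```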

File: spacy/lang/ko/__init__.py

@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 class Korean(Language):
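
When `spacy.Tokenizer.v1` is used, the tokenizer factory compiles these class attributes into the tokenizer's matchers, so the new `infixes` list becomes its `infix_finditer`. A quick way to inspect the new default (a sketch, assuming a spaCy install that includes this commit):

```python
from spacy.lang.ko import Korean
from spacy.util import compile_infix_regex

# Compile the list added above, the same way the rule-based tokenizer does.
infix_re = compile_infix_regex(Korean.Defaults.infixes)
print(infix_re.search("사과·배"))  # the new "·" rule matches, so the token is split here
```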

File: spacy/lang/ko/punctuation.py

@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+TOKENIZER_INFIXES = _infixes
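
The lookaround pattern above only treats `~` as an infix when it sits inside a numeric range, so a sentence-final `~` stays attached to the preceding token. A quick standalone check with plain `re` (pattern copied verbatim from the diff):

```python
import re

tilde = re.compile(r"(?<=[0-9])~(?=[0-9-])")
print(bool(tilde.search("1982~1983")))  # True: tokenized as "1982 ~ 1983"
print(bool(tilde.search("그렇구나~")))   # False: the trailing "~" is not split off
```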

File: spacy/tests/conftest.py

@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer

File: spacy/tests/lang/ko/test_tokenizer.py

@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("''", "''"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()
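
The parametrized test can also be reproduced as a standalone snippet; this is a rough sketch that mirrors the `ko_tokenizer_tokenizer` fixture config and one of the cases above:

```python
import spacy

# Same config as the ko_tokenizer_tokenizer fixture.
nlp = spacy.blank("ko", config={"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}})
text, expected = "『9시 반의 당구』,", "『 9시 반의 당구 』 ,"
assert [t.text for t in nlp(text)] == expected.split()
```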