Update Korean defaults for Tokenizer (#10322)
Update the Korean defaults for the rule-based `Tokenizer` so that tokenization follows UD Korean Kaist.
parent f32ee2e533
commit 30030176ee
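
With this change a Korean pipeline can swap in the rule-based tokenizer (which now picks up the new infix rules) via the config, as the test fixture below does. A minimal sketch of that usage, assuming spaCy at this commit is installed; the expected output follows the new test cases:

import spacy

# Use the rule-based spacy.Tokenizer.v1 instead of the default mecab-based Korean tokenizer.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.util.get_lang_class("ko").from_config(config)

doc = nlp.tokenizer("부(富)는")
print([t.text for t in doc])  # per the new tests: ['부', '(', '富', ')', '는']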

spacy/lang/ko/__init__.py
@@ -1,5 +1,6 @@
 from typing import Iterator, Any, Dict
 
+from .punctuation import TOKENIZER_INFIXES
 from .stop_words import STOP_WORDS
 from .tag_map import TAG_MAP
 from .lex_attrs import LEX_ATTRS
@@ -85,6 +86,7 @@ class KoreanDefaults(BaseDefaults):
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
 
 
 class Korean(Language):

spacy/lang/ko/punctuation.py (new file, 12 lines)
@@ -0,0 +1,12 @@
+from ..char_classes import LIST_QUOTES
+from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES
+
+
+_infixes = (
+    ["·", "ㆍ", "\(", "\)"]
+    + [r"(?<=[0-9])~(?=[0-9-])"]
+    + LIST_QUOTES
+    + BASE_TOKENIZER_INFIXES
+)
+
+TOKENIZER_INFIXES = _infixes
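
These entries are joined into a single infix regex that the rule-based Tokenizer uses to find split points inside tokens. A small sketch, assuming spaCy with this commit is importable, using spacy.util.compile_infix_regex (the same helper spaCy uses when building the tokenizer):

from spacy.lang.ko.punctuation import TOKENIZER_INFIXES
from spacy.util import compile_infix_regex

# Join the individual infix patterns into one compiled regex.
infix_re = compile_infix_regex(TOKENIZER_INFIXES)

# The middle dot from the new list is found as an infix match.
print([m.group() for m in infix_re.finditer("사과·배·복숭아")])  # ['·', '·']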

spacy/tests/conftest.py
@@ -227,6 +227,19 @@ def ko_tokenizer():
     return get_lang_class("ko")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_tokenizer():
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.Tokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer

spacy/tests/lang/ko/test_tokenizer.py
@@ -47,3 +47,23 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
 def test_ko_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
+
+
+# fmt: off
+SPACY_TOKENIZER_TESTS = [
+    ("있다.", "있다 ."),
+    ("'예'는", "' 예 ' 는"),
+    ("부 (富) 는", "부 ( 富 ) 는"),
+    ("부(富)는", "부 ( 富 ) 는"),
+    ("1982~1983.", "1982 ~ 1983 ."),
+    ("사과·배·복숭아·수박은 모두 과일이다.", "사과 · 배 · 복숭아 · 수박은 모두 과일이다 ."),
+    ("그렇구나~", "그렇구나~"),
+    ("『9시 반의 당구』,", "『 9시 반의 당구 』 ,"),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", SPACY_TOKENIZER_TESTS)
+def test_ko_spacy_tokenizer(ko_tokenizer_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_tokenizer(text)]
+    assert tokens == expected_tokens.split()
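
The 1982~1983 and 그렇구나~ cases above hinge on the lookaround pattern added in punctuation.py: "~" is only treated as an infix when it sits between a digit and a digit or hyphen. A standalone sketch with the standard re module (no spaCy needed), matching the test expectations:

import re

# The new infix entry: split on "~" only between a digit and a digit/hyphen.
pattern = re.compile(r"(?<=[0-9])~(?=[0-9-])")

assert pattern.search("1982~1983") is not None  # "1982~1983." -> "1982 ~ 1983 ."
assert pattern.search("그렇구나~") is None       # trailing "~" stays attached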