Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.
This commit is contained in:
Adriane Boyd 2022-08-11 10:56:03 +02:00
parent b64243ed55
commit a2a0e1abf1
7 changed files with 204 additions and 33 deletions

View File

@ -114,7 +114,7 @@ ja =
sudachipy>=0.5.2,!=0.6.1 sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220 sudachidict_core>=20211220
ko = ko =
natto-py>=0.9.0 mecab-ko>=1.0.0
th = th =
pythainlp>=2.0 pythainlp>=2.0

View File

@ -32,7 +32,85 @@ def create_tokenizer():
class KoreanTokenizer(DummyTokenizer): class KoreanTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab): def __init__(self, vocab: Vocab):
self.vocab = vocab self.vocab = vocab
self._mecab = try_mecab_import() # type: ignore[func-returns-value] mecab = try_mecab_import()
self.mecab_tokenizer = mecab.Tagger()
def __reduce__(self):
    """Describe how to rebuild this tokenizer when it is pickled.

    RETURNS (tuple): The tokenizer class and its constructor arguments,
        which consist of the vocab only.
    """
    init_args = (self.vocab,)
    return KoreanTokenizer, init_args
def __call__(self, text: str) -> Doc:
    """Tokenize a text and return a Doc annotated with tag, POS and lemma.

    text (str): The text to tokenize.
    RETURNS (Doc): The tokenized document. The unsplit tag of every token
        is stored in doc.user_data["full_tags"].
    """
    analyses = list(self.detailed_tokens(text))
    words = [analysis["surface"] for analysis in analyses]
    spaces = list(check_spaces(text, words))
    doc = Doc(self.vocab, words=words, spaces=spaces)
    for token, analysis in zip(doc, analyses):
        # Only the part of the tag before the first "+" becomes the token
        # tag: the stem or pre-final ending.
        token.tag_ = analysis["tag"].partition("+")[0]
        # Tags missing from TAG_MAP fall back to the X part-of-speech.
        token.pos = TAG_MAP[token.tag_][POS] if token.tag_ in TAG_MAP else X
        token.lemma_ = analysis["lemma"]
    doc.user_data["full_tags"] = [analysis["tag"] for analysis in analyses]
    return doc
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
    """Yield surface, lemma and tag for every token the tagger produces.

    The tagger output is read line by line until the "EOS" marker. Each
    line is split at the first tab into the surface form and a
    comma-separated feature string.

    text (str): The text to analyze.
    YIELDS (Dict[str, Any]): A dict with the keys "surface", "lemma" and
        "tag". The lemma falls back to the surface form when the
        expression field is absent or "*".
    """
    # Feature fields: POS tag [0], semantic class [1], final consonant
    # (jongseong) presence [2], reading [3], type [4], start POS [5],
    # end POS [6], expression [7], *
    for line in self.mecab_tokenizer.parse(text).split("\n"):
        if line == "EOS":
            break
        surface, _, expr = line.partition("\t")
        # Only the part before the first "/" is split into fields, so the
        # expression field is reduced to its leading component.
        features = expr.split("/")[0].split(",")
        tag = features[0]
        # The expression field lives at index 7, so at least 8 fields are
        # required; checking for 7 raised IndexError on 7-field lines.
        lemma = features[7] if len(features) >= 8 else "*"
        if lemma == "*":
            lemma = surface
        yield {"surface": surface, "lemma": lemma, "tag": tag}
def score(self, examples):
    """Validate the examples and score their tokenization.

    examples: The examples to score.
    RETURNS: The result of Scorer.score_tokenization for the examples.
    """
    validate_examples(examples, "KoreanTokenizer.score")
    tokenization_scores = Scorer.score_tokenization(examples)
    return tokenization_scores
class KoreanDefaults(BaseDefaults):
    """Default settings for the Korean language class."""

    # Config parsed from the module-level DEFAULT_CONFIG string.
    config = load_config_from_str(DEFAULT_CONFIG)
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS
    infixes = TOKENIZER_INFIXES
    writing_system = {
        "direction": "ltr",
        "has_case": False,
        "has_letters": False,
    }
class Korean(Language):
    """Language subclass for Korean, registered under the code "ko"."""

    Defaults = KoreanDefaults
    lang = "ko"
def try_mecab_import() -> Any:
    """Import the `mecab_ko` module and return it.

    The return annotation used to be `None` although the module object is
    returned; it is `Any` so that callers can use the result without a
    type-ignore.

    RETURNS (Any): The imported `mecab_ko` module.
    RAISES (ImportError): If `mecab_ko` cannot be imported. The message
        contains the install command and the original traceback is
        suppressed.
    """
    # Keep the try body limited to the import itself.
    try:
        import mecab_ko as MeCab
    except ImportError:
        raise ImportError(
            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
            "the python package `mecab-ko`: pip install mecab-ko"
        ) from None
    return MeCab
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
    """Return the factory registered as "spacy.KoreanNattoTokenizer.v1".

    RETURNS (Callable): A function that takes the nlp object and returns a
        KoreanNattoTokenizer built on its vocab.
    """

    def korean_natto_tokenizer_factory(nlp):
        # The tokenizer uses the vocab of the pipeline it is created for.
        vocab = nlp.vocab
        return KoreanNattoTokenizer(vocab)

    return korean_natto_tokenizer_factory
class KoreanNattoTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab):
self.vocab = vocab
self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None self._mecab_tokenizer = None
@property @property
@ -66,8 +144,8 @@ class KoreanTokenizer(DummyTokenizer):
return doc return doc
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]: def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
# 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True): for node in self.mecab_tokenizer.parse(text, as_nodes=True):
if node.is_eos(): if node.is_eos():
break break
@ -75,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
feature = node.feature feature = node.feature
tag, _, expr = feature.partition(",") tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/") lemma, _, remainder = expr.partition("/")
if lemma == "*": if lemma == "*" or lemma == "":
lemma = surface lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag} yield {"surface": surface, "lemma": lemma, "tag": tag}
@ -83,32 +161,18 @@ class KoreanTokenizer(DummyTokenizer):
validate_examples(examples, "KoreanTokenizer.score") validate_examples(examples, "KoreanTokenizer.score")
return Scorer.score_tokenization(examples) return Scorer.score_tokenization(examples)
def _try_mecab_import(self):
try:
from natto import MeCab
class KoreanDefaults(BaseDefaults): return MeCab
config = load_config_from_str(DEFAULT_CONFIG) except ImportError:
lex_attr_getters = LEX_ATTRS raise ImportError(
stop_words = STOP_WORDS 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
infixes = TOKENIZER_INFIXES "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
) from None
class Korean(Language):
lang = "ko"
Defaults = KoreanDefaults
def try_mecab_import() -> None:
try:
from natto import MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
) from None
def check_spaces(text, tokens): def check_spaces(text, tokens):

View File

@ -239,7 +239,7 @@ def hsb_tokenizer():
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def ko_tokenizer(): def ko_tokenizer():
pytest.importorskip("natto") pytest.importorskip("mecab_ko")
return get_lang_class("ko")().tokenizer return get_lang_class("ko")().tokenizer
@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
return nlp.tokenizer return nlp.tokenizer
@pytest.fixture(scope="session")
def ko_tokenizer_natto():
    """Session fixture: tokenizer of a Korean pipeline using the natto tokenizer.

    Tests that use it are skipped when the `natto` module is not importable.
    """
    pytest.importorskip("natto")
    tokenizer_settings = {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}
    config = {"nlp": {"tokenizer": tokenizer_settings}}
    korean_cls = get_lang_class("ko")
    return korean_cls.from_config(config).tokenizer
@pytest.fixture(scope="session") @pytest.fixture(scope="session")
def lb_tokenizer(): def lb_tokenizer():
return get_lang_class("lb")().tokenizer return get_lang_class("lb")().tokenizer

View File

@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_ test_lemma = ko_tokenizer(word)[0].lemma_
assert test_lemma == lemma assert test_lemma == lemma
@pytest.mark.parametrize(
    "word,lemma",
    [
        ("새로운", "새롭"),
        ("빨간", "빨갛"),
        ("클수록", ""),
        ("뭡니까", ""),
        ("됐다", ""),
    ],
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
    """The first token produced by the natto tokenizer has the expected lemma."""
    first_token = ko_tokenizer_natto(word)[0]
    assert first_token.lemma_ == lemma

View File

@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer) b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b) ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes() assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
    """The natto tokenizer's serialized form survives a bytes and a disk round trip."""
    # NOTE(review): Korean() presumably builds the default tokenizer rather
    # than the natto one, so the data is loaded into a different tokenizer
    # class; confirm this is intended.
    tokenizer_bytes = ko_tokenizer_natto.to_bytes()

    # Round trip through bytes.
    bytes_nlp = Korean()
    bytes_nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == bytes_nlp.tokenizer.to_bytes()

    # Round trip through a temporary directory.
    with make_tempdir() as tmp_dir:
        tokenizer_path = tmp_dir / "tokenizer"
        ko_tokenizer_natto.to_disk(tokenizer_path)
        disk_nlp = Korean()
        disk_nlp.tokenizer.from_disk(tokenizer_path)
        assert tokenizer_bytes == disk_nlp.tokenizer.to_bytes()
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
    """Pickling and unpickling the natto tokenizer keeps its serialized form."""
    restored = pickle.loads(pickle.dumps(ko_tokenizer_natto))
    assert ko_tokenizer_natto.to_bytes() == restored.to_bytes()

View File

@ -1,5 +1,6 @@
import pytest import pytest
# fmt: off # fmt: off
TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 ."), ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 ."),
@ -19,6 +20,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on # fmt: on
# tests for ko_tokenizer (default KoreanTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens): def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@ -55,6 +58,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
assert tokens[1].pos_ == "X" assert tokens[1].pos_ == "X"
# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
    """Token texts from the natto tokenizer match the space-separated expectation."""
    doc = ko_tokenizer_natto(text)
    assert [token.text for token in doc] == expected_tokens.split()
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
    """Token tags from the natto tokenizer match the space-separated expectation."""
    doc = ko_tokenizer_natto(text)
    assert [token.tag_ for token in doc] == expected_tags.split()
@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
    """The "full_tags" user data of the Doc matches the space-separated expectation."""
    doc = ko_tokenizer_natto(text)
    full_tags = doc.user_data["full_tags"]
    assert full_tags == expected_tags.split()
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
    """Coarse POS labels from the natto tokenizer match the space-separated expectation."""
    doc = ko_tokenizer_natto(text)
    assert [token.pos_ for token in doc] == expected_pos.split()
def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
    """The natto tokenizer returns an empty Doc for an empty string."""
    # Named like the other natto tests in this module. The previous name,
    # test_ko_empty_doc, would replace any same-named test defined earlier
    # in the module, so that pytest would collect only one of the two.
    tokens = ko_tokenizer_natto("")
    assert len(tokens) == 0
@pytest.mark.issue(10535)
def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
    """The second token of the sample text gets the POS label "X"."""
    doc = ko_tokenizer_natto("미닛 리피터")
    second_token = doc[1]
    assert second_token.pos_ == "X"
# fmt: off # fmt: off
SPACY_TOKENIZER_TESTS = [ SPACY_TOKENIZER_TESTS = [
("있다.", "있다 ."), ("있다.", "있다 ."),

View File

@ -268,18 +268,42 @@ used for training the current [Japanese pipelines](/models/ja).
### Korean language support {#korean} ### Korean language support {#korean}
> #### mecab-ko tokenizer There are currently three built-in options for Korean tokenization, two based on
[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
using the rule-based tokenizer.
> #### Default mecab-ko tokenizer
> >
> ```python > ```python
> nlp = spacy.blank("ko") > nlp = spacy.blank("ko")
> ``` > ```
The default MeCab-based Korean tokenizer requires: The default MeCab-based Korean tokenizer requires the python package
[`mecab-ko`](https://pypi.org/project/mecab-ko/) and has no further requirements.
The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
- [natto-py](https://github.com/buruzaemon/natto-py) - [natto-py](https://github.com/buruzaemon/natto-py)
> #### natto-py MeCab-ko tokenizer
>
> ```python
> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
> nlp = spacy.blank("ko", config=config)
> ```
To use this tokenizer, edit `[nlp.tokenizer]` in your config:
```ini
### config.cfg
[nlp]
lang = "ko"
tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
```
For some Korean datasets and tasks, the For some Korean datasets and tasks, the
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
than MeCab. To configure a Korean pipeline with the rule-based tokenizer: than MeCab. To configure a Korean pipeline with the rule-based tokenizer: