Switch to mecab-ko as default Korean tokenizer (#11294)

* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly named) mecab-ko Python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.
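
For reference, a minimal sketch of how each tokenizer is selected after this change, mirroring the config snippets added to the docs below (the dictionary paths in `mecab_args` are placeholders):

```python
import spacy

# Default tokenizer ("spacy.ko.KoreanTokenizer"), backed by mecab-ko / mecab-ko-dic
nlp = spacy.blank("ko")

# Optional: pass extra MeCab arguments via the new mecab_args setting
config = {"nlp": {"tokenizer": {"mecab_args": "-d /path/to/dicdir -u /path/to/userdic"}}}
nlp = spacy.blank("ko", config=config)

# Previous natto-py-based tokenizer, kept as a registered alternative
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)
```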

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Adriane Boyd 2022-08-26 10:11:18 +02:00 committed by GitHub
parent 1eb7ce5ef7
commit 2a558a7cdc
8 changed files with 212 additions and 34 deletions


@@ -114,7 +114,7 @@ ja =
sudachipy>=0.5.2,!=0.6.1
sudachidict_core>=20211220
ko =
natto-py>=0.9.0
mecab-ko>=1.0.0
th =
pythainlp>=2.0


@@ -18,34 +18,23 @@ DEFAULT_CONFIG = """
[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = ""
"""
@registry.tokenizers("spacy.ko.KoreanTokenizer")
def create_tokenizer():
def create_tokenizer(mecab_args: str):
def korean_tokenizer_factory(nlp):
return KoreanTokenizer(nlp.vocab)
return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
return korean_tokenizer_factory
class KoreanTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab):
def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
self.vocab = vocab
self._mecab = try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None
@property
def mecab_tokenizer(self):
# This is a property so that initializing a pipeline with blank:ko is
# possible without actually requiring mecab-ko, e.g. to run
# `spacy init vectors ko` for a pipeline that will have a different
# tokenizer in the end. The languages need to match for the vectors
# to be imported and there's no way to pass a custom config to
# `init vectors`.
if self._mecab_tokenizer is None:
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
return self._mecab_tokenizer
mecab = try_mecab_import()
self.mecab_tokenizer = mecab.Tagger(mecab_args)
def __reduce__(self):
return KoreanTokenizer, (self.vocab,)
@@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# Fields: POS tag(품사 태그)[0], semantic class(의미 부류)[1], jongseong presence(종성 유무)[2], reading(읽기)[3],
# type(타입)[4], first POS(첫번째 품사)[5], last POS(마지막 품사)[6], expression(표현)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
if node.is_eos():
for line in self.mecab_tokenizer.parse(text).split("\n"):
if line == "EOS":
break
surface = node.surface
feature = node.feature
tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/")
surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")
tag = features[0]
lemma = "*"
if len(features) >= 8:
lemma = features[7]
if lemma == "*":
lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -97,20 +88,94 @@ class Korean(Language):
Defaults = KoreanDefaults
def try_mecab_import() -> None:
def try_mecab_import():
try:
from natto import MeCab
import mecab_ko as MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
"the python package `mecab-ko`: pip install mecab-ko"
) from None
@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
def korean_natto_tokenizer_factory(nlp):
return KoreanNattoTokenizer(nlp.vocab)
return korean_natto_tokenizer_factory
class KoreanNattoTokenizer(DummyTokenizer):
def __init__(self, vocab: Vocab):
self.vocab = vocab
self._mecab = self._try_mecab_import() # type: ignore[func-returns-value]
self._mecab_tokenizer = None
@property
def mecab_tokenizer(self):
# This is a property so that initializing a pipeline with blank:ko is
# possible without actually requiring mecab-ko, e.g. to run
# `spacy init vectors ko` for a pipeline that will have a different
# tokenizer in the end. The languages need to match for the vectors
# to be imported and there's no way to pass a custom config to
# `init vectors`.
if self._mecab_tokenizer is None:
self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
return self._mecab_tokenizer
def __reduce__(self):
return KoreanNattoTokenizer, (self.vocab,)
def __call__(self, text: str) -> Doc:
dtokens = list(self.detailed_tokens(text))
surfaces = [dt["surface"] for dt in dtokens]
doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
for token, dtoken in zip(doc, dtokens):
first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미)
if token.tag_ in TAG_MAP:
token.pos = TAG_MAP[token.tag_][POS]
else:
token.pos = X
token.lemma_ = dtoken["lemma"]
doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
return doc
def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
# Fields: POS tag(품사 태그)[0], semantic class(의미 부류)[1], jongseong presence(종성 유무)[2], reading(읽기)[3],
# type(타입)[4], first POS(첫번째 품사)[5], last POS(마지막 품사)[6], expression(표현)[7], *
for node in self.mecab_tokenizer.parse(text, as_nodes=True):
if node.is_eos():
break
surface = node.surface
feature = node.feature
tag, _, expr = feature.partition(",")
lemma, _, remainder = expr.partition("/")
if lemma == "*" or lemma == "":
lemma = surface
yield {"surface": surface, "lemma": lemma, "tag": tag}
def score(self, examples):
validate_examples(examples, "KoreanTokenizer.score")
return Scorer.score_tokenization(examples)
def _try_mecab_import(self):
try:
from natto import MeCab
return MeCab
except ImportError:
raise ImportError(
'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
"[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
"[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
"and [natto-py](https://github.com/buruzaemon/natto-py)"
) from None
def check_spaces(text, tokens):
prev_end = -1
start = 0


@@ -239,7 +239,7 @@ def hsb_tokenizer():
@pytest.fixture(scope="session")
def ko_tokenizer():
pytest.importorskip("natto")
pytest.importorskip("mecab_ko")
return get_lang_class("ko")().tokenizer
@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
return nlp.tokenizer
@pytest.fixture(scope="session")
def ko_tokenizer_natto():
pytest.importorskip("natto")
config = {
"nlp": {
"tokenizer": {
"@tokenizers": "spacy.KoreanNattoTokenizer.v1",
}
}
}
nlp = get_lang_class("ko").from_config(config)
return nlp.tokenizer
@pytest.fixture(scope="session")
def lb_tokenizer():
return get_lang_class("lb")().tokenizer


@@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
test_lemma = ko_tokenizer(word)[0].lemma_
assert test_lemma == lemma
@pytest.mark.parametrize(
"word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", ""), ("뭡니까", ""), ("됐다", "")]
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
test_lemma = ko_tokenizer_natto(word)[0].lemma_
assert test_lemma == lemma


@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
b = pickle.dumps(ko_tokenizer)
ko_tokenizer_re = pickle.loads(b)
assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
tokenizer_bytes = ko_tokenizer_natto.to_bytes()
nlp = Korean()
nlp.tokenizer.from_bytes(tokenizer_bytes)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
with make_tempdir() as d:
file_path = d / "tokenizer"
ko_tokenizer_natto.to_disk(file_path)
nlp = Korean()
nlp.tokenizer.from_disk(file_path)
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
b = pickle.dumps(ko_tokenizer_natto)
ko_tokenizer_natto_re = pickle.loads(b)
assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()


@@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
"PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on
# tests for ko_tokenizer (default KoreanTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
assert pos == expected_pos.split()
def test_ko_empty_doc(ko_tokenizer):
def test_ko_tokenizer_empty_doc(ko_tokenizer):
tokens = ko_tokenizer("")
assert len(tokens) == 0
@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
assert tokens[1].pos_ == "X"
# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
tokens = [token.text for token in ko_tokenizer_natto(text)]
assert tokens == expected_tokens.split()
@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
tags = [token.tag_ for token in ko_tokenizer_natto(text)]
assert tags == expected_tags.split()
@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
tags = ko_tokenizer_natto(text).user_data["full_tags"]
assert tags == expected_tags.split()
@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
pos = [token.pos_ for token in ko_tokenizer_natto(text)]
assert pos == expected_pos.split()
def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
tokens = ko_tokenizer_natto("")
assert len(tokens) == 0
@pytest.mark.issue(10535)
def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
tokens = ko_tokenizer_natto("미닛 리피터")
assert tokens[1].pos_ == "X"
# fmt: off
SPACY_TOKENIZER_TESTS = [
("있다.", "있다 ."),


@@ -21,7 +21,7 @@ def test_build_dependencies():
# ignore language-specific packages that shouldn't be installed by all
libs_ignore_setup = [
"fugashi",
"natto-py",
"mecab-ko",
"pythainlp",
"sudachipy",
"sudachidict_core",


@@ -268,18 +268,49 @@ used for training the current [Japanese pipelines](/models/ja).
### Korean language support {#korean}
> #### mecab-ko tokenizer
There are currently three built-in options for Korean tokenization, two based on
[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
using the rule-based tokenizer.
> #### Default mecab-ko tokenizer
>
> ```python
> # uses mecab-ko-dic
> nlp = spacy.blank("ko")
>
> # with custom mecab args
> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
> nlp = spacy.blank("ko", config=config)
> ```
The default MeCab-based Korean tokenizer requires:
The default MeCab-based Korean tokenizer requires the python package
[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
requirements.
The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
- [natto-py](https://github.com/buruzaemon/natto-py)
To use this tokenizer, edit `[nlp.tokenizer]` in your config:
> #### natto-py MeCab-ko tokenizer
>
> ```python
> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
> nlp = spacy.blank("ko", config=config)
> ```
```ini
### config.cfg
[nlp]
lang = "ko"
tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
```
For some Korean datasets and tasks, the
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better suited
than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
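
To round this off, a sketch of that configuration, assuming spaCy's standard rule-based tokenizer registered as `spacy.Tokenizer.v1`:

```python
import spacy

# Use the rule-based tokenizer instead of the MeCab-based default
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)
```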