Switch to mecab-ko as default Korean tokenizer (#11294)
* Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.
Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.
* Temporarily run tests with mecab-ko tokenizer
* Fix types
* Fix duplicate test names
* Update requirements test
* Revert "Temporarily run tests with mecab-ko tokenizer"
  This reverts commit d2083e7044.
* Add mecab_args setting, fix pickle for KoreanNattoTokenizer
* Fix length check
* Update docs
* Formatting
* Update natto-py error message
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
Commit: 2a558a7cdc (parent: 1eb7ce5ef7)
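
In practice the change looks roughly like this (a sketch assembled from the documentation updates in this commit, not an official snippet):

```python
import spacy

# New default: the mecab-ko based tokenizer, which only needs `pip install mecab-ko`
nlp = spacy.blank("ko")

# Previous natto-py based tokenizer, kept available as spacy.KoreanNattoTokenizer.v1
# (still requires mecab-ko, mecab-ko-dic and natto-py to be installed separately)
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp_natto = spacy.blank("ko", config=config)
```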
@@ -114,7 +114,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
@@ -18,34 +18,23 @@ DEFAULT_CONFIG = """
 
 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """
 
 
 @registry.tokenizers("spacy.ko.KoreanTokenizer")
-def create_tokenizer():
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)
 
     return korean_tokenizer_factory
 
 
 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)
 
     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
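
The `mecab_args` setting introduced in `DEFAULT_CONFIG` above can also be set in a trained pipeline's `config.cfg`. A minimal sketch, with placeholder paths standing in for a real dictionary and user dictionary; the string is handed directly to `mecab.Tagger()`, so MeCab command-line options apply:

```ini
[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
```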
@@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
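
To make the new parsing logic concrete, here is a standalone sketch of the same line-splitting applied to two sample output lines; the sample lines only approximate mecab-ko-dic's `surface\t<features>` format and are not captured from a real run:

```python
def parse_line(line: str) -> dict:
    # Mirrors the parsing above: mecab-ko emits one token per line as
    # "surface\t<comma-separated features>", terminated by a bare "EOS" line.
    surface, _, expr = line.partition("\t")
    features = expr.split("/")[0].split(",")
    tag = features[0]
    lemma = "*"
    if len(features) >= 8:
        lemma = features[7]
    if lemma == "*":
        lemma = surface
    return {"surface": surface, "lemma": lemma, "tag": tag}


# Illustrative lines (format approximated from mecab-ko-dic's feature layout):
print(parse_line("서울\tNNP,지명,T,서울,*,*,*,*"))
# -> tag "NNP"; the expression field is "*", so the lemma falls back to the surface form
print(parse_line("있습니다\tVX+EF,*,F,있습니다,Inflect,VX,EF,있/VX/*+습니다/EF/*"))
# -> tag "VX+EF"; the eighth comma-separated field yields the lemma "있"
```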
@@ -97,20 +88,94 @@ class Korean(Language):
     Defaults = KoreanDefaults
 
 
-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab
 
         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None
 
 
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
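
Both the default `KoreanTokenizer` and the `KoreanNattoTokenizer` shown above produce `Doc` objects with the same annotations: `token.tag_` (the first MeCab tag before any `+`), `token.pos_` mapped through `TAG_MAP`, `token.lemma_`, and the unsplit MeCab tags in `doc.user_data["full_tags"]`. A minimal usage sketch with the default tokenizer, reusing a sentence from the test data (the exact output depends on the installed dictionary):

```python
import spacy

nlp = spacy.blank("ko")  # default mecab-ko based tokenizer
doc = nlp("서울 타워 근처에 살고 있습니다.")

for token in doc:
    print(token.text, token.tag_, token.pos_, token.lemma_)

# Full MeCab tags, which may join several parts of speech with "+"
print(doc.user_data["full_tags"])
```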
@@ -239,7 +239,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
     return nlp.tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
@@ -7,3 +7,11 @@ import pytest
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
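
The pickle round-trips above work because each tokenizer's `__reduce__` returns only the class and its vocab, so unpickling reconstructs the tokenizer (and re-imports MeCab) rather than trying to serialize the underlying tagger. A minimal sketch for the natto variant, using the same registered name as the conftest fixture:

```python
import pickle
import spacy

config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)

data = pickle.dumps(nlp.tokenizer)  # serializes (class, (vocab,)) via __reduce__
tokenizer = pickle.loads(data)      # re-creates the tokenizer; MeCab is loaded lazily on first use
assert nlp.tokenizer.to_bytes() == tokenizer.to_bytes()
```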
@@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
              "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     assert pos == expected_pos.split()
 
 
-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0
 
@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
@@ -21,7 +21,7 @@ def test_build_dependencies():
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
-        "natto-py",
+        "mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
@@ -268,18 +268,49 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {#korean}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
+> # uses mecab-ko-dic
 > nlp = spacy.blank("ko")
+>
+> # with custom mecab args
+> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
+> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
+> nlp = spacy.blank("ko", config=config)
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
+requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer: