Mirror of https://github.com/explosion/spaCy.git
Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko python module for default Korean tokenization. Maintain the previous `natto-py` tokenizer as `spacy.KoreanNattoTokenizer.v1`.
Commit a2a0e1abf1 (parent b64243ed55)
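A quick sketch of what the change means in practice, assuming `mecab-ko` is installed (the sample sentence comes from the test suite below); the previous behaviour stays available through the registered `spacy.KoreanNattoTokenizer.v1` tokenizer:

```python
import spacy

# Default Korean pipeline: tokenization is now backed by the mecab-ko
# python package (pulled in by the [ko] extra / `pip install mecab-ko`).
nlp = spacy.blank("ko")
doc = nlp("서울 타워 근처에 살고 있습니다.")
print([t.text for t in doc])

# Opt back into the previous natto-py based tokenizer via the config:
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp_natto = spacy.blank("ko", config=config)
```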
@@ -114,7 +114,7 @@ ja =
    sudachipy>=0.5.2,!=0.6.1
    sudachidict_core>=20211220
ko =
    natto-py>=0.9.0
    mecab-ko>=1.0.0
th =
    pythainlp>=2.0
@@ -32,7 +32,85 @@ def create_tokenizer():

class KoreanTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
        mecab = try_mecab_import()
        self.mecab_tokenizer = mecab.Tagger()

    def __reduce__(self):
        return KoreanTokenizer, (self.vocab,)

    def __call__(self, text: str) -> Doc:
        dtokens = list(self.detailed_tokens(text))
        surfaces = [dt["surface"] for dt in dtokens]
        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
        for token, dtoken in zip(doc, dtokens):
            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
            if token.tag_ in TAG_MAP:
                token.pos = TAG_MAP[token.tag_][POS]
            else:
                token.pos = X
            token.lemma_ = dtoken["lemma"]
        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for line in self.mecab_tokenizer.parse(text).split("\n"):
            if line == "EOS":
                break
            surface, _, expr = line.partition("\t")
            features = expr.split("/")[0].split(",")
            tag = features[0]
            lemma = "*"
            if len(features) >= 8:
                lemma = features[7]
            if lemma == "*":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}

    def score(self, examples):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)


class KoreanDefaults(BaseDefaults):
    config = load_config_from_str(DEFAULT_CONFIG)
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
    infixes = TOKENIZER_INFIXES


class Korean(Language):
    lang = "ko"
    Defaults = KoreanDefaults


def try_mecab_import() -> None:
    try:
        import mecab_ko as MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
            "the python package `mecab-ko`: pip install mecab-ko"
        ) from None


@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
def create_natto_tokenizer():
    def korean_natto_tokenizer_factory(nlp):
        return KoreanNattoTokenizer(nlp.vocab)

    return korean_natto_tokenizer_factory


class KoreanNattoTokenizer(DummyTokenizer):
    def __init__(self, vocab: Vocab):
        self.vocab = vocab
        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
        self._mecab_tokenizer = None

    @property
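For reference, a rough sketch of the raw tagger output that the new `detailed_tokens` consumes; this assumes `mecab-ko` is installed together with a Korean dictionary such as mecab-ko-dic, and the exact feature values depend on that dictionary:

```python
import mecab_ko

# Each output line is "surface\tfeatures": a comma-separated feature list
# with the POS tag at index 0 and the expression field at index 7,
# terminated by a bare "EOS" line.
tagger = mecab_ko.Tagger()
for line in tagger.parse("서울 타워 근처에 살고 있습니다.").split("\n"):
    if line == "EOS":
        break
    surface, _, expr = line.partition("\t")
    features = expr.split("/")[0].split(",")
    print(surface, features[0])  # surface form and its first POS tag
```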
@@ -66,8 +144,8 @@ class KoreanTokenizer(DummyTokenizer):
        return doc

    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
            if node.is_eos():
                break
@@ -75,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
            feature = node.feature
            tag, _, expr = feature.partition(",")
            lemma, _, remainder = expr.partition("/")
            if lemma == "*":
            if lemma == "*" or lemma == "":
                lemma = surface
            yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -83,32 +161,18 @@ class KoreanTokenizer(DummyTokenizer):
        validate_examples(examples, "KoreanTokenizer.score")
        return Scorer.score_tokenization(examples)

    def _try_mecab_import(self):
        try:
            from natto import MeCab

            return MeCab
        except ImportError:
            raise ImportError(
                'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
                "and [natto-py](https://github.com/buruzaemon/natto-py)"
            ) from None


class KoreanDefaults(BaseDefaults):
    config = load_config_from_str(DEFAULT_CONFIG)
    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
    infixes = TOKENIZER_INFIXES


class Korean(Language):
    lang = "ko"
    Defaults = KoreanDefaults


def try_mecab_import() -> None:
    try:
        from natto import MeCab

        return MeCab
    except ImportError:
        raise ImportError(
            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
            "and [natto-py](https://github.com/buruzaemon/natto-py)"
        ) from None


def check_spaces(text, tokens):
@@ -239,7 +239,7 @@ def hsb_tokenizer():

@pytest.fixture(scope="session")
def ko_tokenizer():
    pytest.importorskip("natto")
    pytest.importorskip("mecab_ko")
    return get_lang_class("ko")().tokenizer
@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
    return nlp.tokenizer


@pytest.fixture(scope="session")
def ko_tokenizer_natto():
    pytest.importorskip("natto")
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
            }
        }
    }
    nlp = get_lang_class("ko").from_config(config)
    return nlp.tokenizer


@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb")().tokenizer
@@ -7,3 +7,11 @@ import pytest
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
    test_lemma = ko_tokenizer(word)[0].lemma_
    assert test_lemma == lemma


@pytest.mark.parametrize(
    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
    test_lemma = ko_tokenizer_natto(word)[0].lemma_
    assert test_lemma == lemma
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
    b = pickle.dumps(ko_tokenizer)
    ko_tokenizer_re = pickle.loads(b)
    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()


def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
    nlp = Korean()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        ko_tokenizer_natto.to_disk(file_path)
        nlp = Korean()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()


def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
    b = pickle.dumps(ko_tokenizer_natto)
    ko_tokenizer_natto_re = pickle.loads(b)
    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
@@ -1,5 +1,6 @@
import pytest


# fmt: off
TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
                   ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 ."),
@@ -19,6 +20,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
              "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
# fmt: on

# tests for ko_tokenizer (default KoreanTokenizer)


@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -55,6 +58,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
    assert tokens[1].pos_ == "X"


# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)


@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
    tokens = [token.text for token in ko_tokenizer_natto(text)]
    assert tokens == expected_tokens.split()


@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
    assert tags == expected_tags.split()


@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
    tags = ko_tokenizer_natto(text).user_data["full_tags"]
    assert tags == expected_tags.split()


@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
    assert pos == expected_pos.split()


def test_ko_empty_doc(ko_tokenizer_natto):
    tokens = ko_tokenizer_natto("")
    assert len(tokens) == 0


@pytest.mark.issue(10535)
def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
    tokens = ko_tokenizer_natto("미닛 리피터")
    assert tokens[1].pos_ == "X"


# fmt: off
SPACY_TOKENIZER_TESTS = [
    ("있다.", "있다 ."),
@@ -268,18 +268,42 @@ used for training the current [Japanese pipelines](/models/ja).

### Korean language support {#korean}

> #### mecab-ko tokenizer
There are currently three built-in options for Korean tokenization, two based on
[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
using the rule-based tokenizer.

> #### Default mecab-ko tokenizer
>
> ```python
> nlp = spacy.blank("ko")
> ```

The default MeCab-based Korean tokenizer requires:
The default MeCab-based Korean tokenizer requires the python package
[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further requirements.

The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:

- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
- [natto-py](https://github.com/buruzaemon/natto-py)

> #### natto-py MeCab-ko tokenizer
>
> ```python
> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
> nlp = spacy.blank("ko", config=config)
> ```

To use this tokenizer, edit `[nlp.tokenizer]` in your config:

```ini
### config.cfg
[nlp]
lang = "ko"
tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
```

For some Korean datasets and tasks, the
[rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
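For reference, a hedged sketch of how such a rule-based Korean pipeline could be configured, assuming the standard `spacy.Tokenizer.v1` entry that spaCy registers for its rule-based tokenizer (mirroring the natto-py example above):

```python
import spacy

# Hypothetical sketch: use spaCy's rule-based tokenizer for Korean
# instead of the MeCab-based default.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)
print([t.text for t in nlp("있다.")])
```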