Mirror of https://github.com/explosion/spaCy.git
Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko python module for default Korean tokenization. Maintain the previous `natto-py` tokenizer as `spacy.KoreanNattoTokenizer.v1`.
commit a2a0e1abf1
parent b64243ed55
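In practice, the change means `spacy.blank("ko")` now builds the mecab-ko based tokenizer, and the natto-py pipeline has to be requested explicitly. A minimal sketch (assuming `mecab-ko` is installed for the default case, and `natto-py` plus mecab-ko/mecab-ko-dic for the legacy one):

```python
import spacy

# Default Korean pipeline: tokenization is now backed by the mecab-ko package.
nlp = spacy.blank("ko")
print([t.text for t in nlp("서울 타워 근처에 살고 있습니다.")])

# Legacy behaviour: opt back in to the natto-py based tokenizer via the config.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp_natto = spacy.blank("ko", config=config)
```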
In the Korean package extras, `natto-py` is replaced by `mecab-ko`:

```diff
@@ -114,7 +114,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
```
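The `ko` extra now pulls in `mecab-ko` (imported as `mecab_ko`) rather than `natto-py`. A quick sketch of the API surface the new tokenizer relies on, assuming the package and the dictionary it installs are available:

```python
import mecab_ko

# mecab-ko mirrors the MeCab Tagger API; parse() returns one tab-separated
# line per morpheme ("surface\tfeatures"), terminated by an "EOS" line.
tagger = mecab_ko.Tagger()
print(tagger.parse("서울 타워 근처에 살고 있습니다."))
```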
In the Korean language module, `KoreanTokenizer` is reimplemented on top of mecab-ko, and the previous natto-py implementation is kept as `KoreanNattoTokenizer`, registered under `spacy.KoreanNattoTokenizer.v1`:

```diff
@@ -32,7 +32,85 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger()
+
+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # POS tag (품사 태그)[0], semantic class (의미 부류)[1], jongseong (종성 유무)[2], reading (읽기)[3],
+        # type (타입)[4], start POS (첫번째 품사)[5], end POS (마지막 품사)[6], expression (표현)[7], *
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
+                break
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 7:
+                lemma = features[7]
+            if lemma == "*":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+
+class KoreanDefaults(BaseDefaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
+
+
+class Korean(Language):
+    lang = "ko"
+    Defaults = KoreanDefaults
+
+
+def try_mecab_import() -> None:
+    try:
+        import mecab_ko as MeCab
+
+        return MeCab
+    except ImportError:
+        raise ImportError(
+            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+            "the python package `mecab-ko`: pip install mecab-ko"
+        ) from None
+
+
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
         self._mecab_tokenizer = None
 
     @property
@@ -66,8 +144,8 @@ class KoreanTokenizer(DummyTokenizer):
         return doc
 
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # POS tag (품사 태그)[0], semantic class (의미 부류)[1], jongseong (종성 유무)[2], reading (읽기)[3],
         # type (타입)[4], start POS (첫번째 품사)[5], end POS (마지막 품사)[6], expression (표현)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
             if node.is_eos():
                 break
@@ -75,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
             surface = node.surface
             feature = node.feature
             tag, _, expr = feature.partition(",")
             lemma, _, remainder = expr.partition("/")
-            if lemma == "*":
+            if lemma == "*" or lemma == "":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -83,32 +161,18 @@ class KoreanTokenizer(DummyTokenizer):
         validate_examples(examples, "KoreanTokenizer.score")
         return Scorer.score_tokenization(examples)
 
-
-class KoreanDefaults(BaseDefaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
-    lex_attr_getters = LEX_ATTRS
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-    infixes = TOKENIZER_INFIXES
-
-
-class Korean(Language):
-    lang = "ko"
-    Defaults = KoreanDefaults
-
-
-def try_mecab_import() -> None:
-    try:
-        from natto import MeCab
-
-        return MeCab
-    except ImportError:
-        raise ImportError(
-            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
-        ) from None
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
 
 
 def check_spaces(text, tokens):
```
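As a worked illustration of how the new `detailed_tokens` reduces one mecab-ko output line to `surface`/`lemma`/`tag`; the input line is hand-written in mecab-ko-dic's feature layout, not captured output:

```python
# Hypothetical mecab-ko-dic style line for "살고": a tab separates the surface
# from a comma-separated feature string; for inflected entries the stem appears
# just before the first "/" of the expression field.
line = "살고\tVV+EC,*,F,살고,Inflect,VV,EC,살/VV/*+고/EC/*"

surface, _, expr = line.partition("\t")
features = expr.split("/")[0].split(",")  # ['VV+EC', '*', 'F', '살고', 'Inflect', 'VV', 'EC', '살']
tag = features[0]                         # 'VV+EC'
lemma = features[7] if len(features) >= 7 else "*"
if lemma == "*":                          # no usable expression field: fall back to the surface
    lemma = surface
print({"surface": surface, "lemma": lemma, "tag": tag})
# {'surface': '살고', 'lemma': '살', 'tag': 'VV+EC'}
```

The full tag string (`VV+EC`) is what lands in `doc.user_data["full_tags"]`, while `token.tag_` keeps only the part before the first `+`.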
In the shared test fixtures, `ko_tokenizer` now requires `mecab_ko`, and a new session fixture `ko_tokenizer_natto` builds a pipeline with the legacy tokenizer:

```diff
@@ -239,7 +239,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
     return nlp.tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
```
The lemmatizer tests gain a natto-py variant:

```diff
@@ -7,3 +7,11 @@ import pytest
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
```
The serialization tests gain natto-py variants:

```diff
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
```
The tokenizer, tag, and POS tests are duplicated for the natto-py tokenizer:

```diff
@@ -1,5 +1,6 @@
 import pytest
 
+
 # fmt: off
 TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
                    ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 ."),
@@ -19,6 +20,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -55,6 +58,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
```
The usage docs now describe the default mecab-ko tokenizer, the legacy natto-py tokenizer, and the rule-based option:

```diff
@@ -268,18 +268,42 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {#korean}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
 > nlp = spacy.blank("ko")
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
```
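The hunk ends at the sentence that introduces the rule-based setup; the configuration it leads into looks roughly like this (a sketch, assuming the rule-based tokenizer is registered as `spacy.Tokenizer.v1`):

```python
import spacy

# Swap in spaCy's rule-based tokenizer instead of either MeCab option.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)
```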