diff --git a/setup.cfg b/setup.cfg
index 708300b04..8bce8cff2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -114,7 +114,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 0e02e4a2d..2c14d97d7 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -32,7 +32,85 @@ def create_tokenizer():
 class KoreanTokenizer(DummyTokenizer):
     def __init__(self, vocab: Vocab):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger()
+
+    def __reduce__(self):
+        return KoreanTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
+                break
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
+            if lemma == "*":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+
+class KoreanDefaults(BaseDefaults):
+    config = load_config_from_str(DEFAULT_CONFIG)
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+    infixes = TOKENIZER_INFIXES
+
+
+class Korean(Language):
+    lang = "ko"
+    Defaults = KoreanDefaults
+
+
+def try_mecab_import() -> None:
+    try:
+        import mecab_ko as MeCab
+
+        return MeCab
+    except ImportError:
+        raise ImportError(
+            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+            "the python package `mecab-ko`: pip install mecab-ko"
+        ) from None
+
+
+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
         self._mecab_tokenizer = None
 
     @property
@@ -66,8 +144,8 @@ class KoreanTokenizer(DummyTokenizer):
         return doc
 
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
-        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
-        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
         for node in self.mecab_tokenizer.parse(text, as_nodes=True):
             if node.is_eos():
                 break
@@ -75,7 +153,7 @@ class KoreanTokenizer(DummyTokenizer):
             feature = node.feature
             tag, _, expr = feature.partition(",")
             lemma, _, remainder = expr.partition("/")
-            if lemma == "*":
+            if lemma == "*" or lemma == "":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
 
@@ -83,32 +161,18 @@
         validate_examples(examples, "KoreanTokenizer.score")
         return Scorer.score_tokenization(examples)
 
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
 
-class KoreanDefaults(BaseDefaults):
-    config = load_config_from_str(DEFAULT_CONFIG)
-    lex_attr_getters = LEX_ATTRS
-    stop_words = STOP_WORDS
-    writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
-    infixes = TOKENIZER_INFIXES
-
-
-class Korean(Language):
-    lang = "ko"
-    Defaults = KoreanDefaults
-
-
-def try_mecab_import() -> None:
-    try:
-        from natto import MeCab
-
-        return MeCab
-    except ImportError:
-        raise ImportError(
-            'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
-        ) from None
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
 
 
 def check_spaces(text, tokens):
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index eb643ec2f..c8e72df05 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -239,7 +239,7 @@ def hsb_tokenizer():
 
 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer
 
 
@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
     return nlp.tokenizer
 
 
+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 7782ca4bc..0c389b9ce 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -7,3 +7,11 @@ import pytest
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
index 75288fcc5..35d28d42a 100644
--- a/spacy/tests/lang/ko/test_serialize.py
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 6e06e405e..e2d8dd721 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -1,5 +1,6 @@
 import pytest
 
+
 # fmt: off
 TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."),
                    ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 ."),
@@ -19,6 +20,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
              "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on
 
+# tests for ko_tokenizer (default KoreanTokenizer)
+
 
 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -55,6 +58,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"
 
 
+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 56992e7e3..c604f9cfb 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -268,18 +268,42 @@ used for training the current [Japanese pipelines](/models/ja).
 
 ### Korean language support {#korean}
 
-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
 > nlp = spacy.blank("ko")
 > ```
 
-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires only the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/); there are no further system requirements.
+
+The `natto-py` MeCab-based tokenizer (the default in spaCy v3.4 and earlier)
+is available as `spacy.KoreanNattoTokenizer.v1`. It requires:
 
 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)
 
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer: