diff --git a/setup.cfg b/setup.cfg
index 708300b04..8bce8cff2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -114,7 +114,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    mecab-ko>=1.0.0
 th =
     pythainlp>=2.0
diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py
index 0e02e4a2d..1220aa141 100644
--- a/spacy/lang/ko/__init__.py
+++ b/spacy/lang/ko/__init__.py
@@ -18,34 +18,23 @@ DEFAULT_CONFIG = """

 [nlp.tokenizer]
 @tokenizers = "spacy.ko.KoreanTokenizer"
+mecab_args = ""
 """


 @registry.tokenizers("spacy.ko.KoreanTokenizer")
-def create_tokenizer():
+def create_tokenizer(mecab_args: str):
     def korean_tokenizer_factory(nlp):
-        return KoreanTokenizer(nlp.vocab)
+        return KoreanTokenizer(nlp.vocab, mecab_args=mecab_args)

     return korean_tokenizer_factory


 class KoreanTokenizer(DummyTokenizer):
-    def __init__(self, vocab: Vocab):
+    def __init__(self, vocab: Vocab, *, mecab_args: str = ""):
         self.vocab = vocab
-        self._mecab = try_mecab_import()  # type: ignore[func-returns-value]
-        self._mecab_tokenizer = None
-
-    @property
-    def mecab_tokenizer(self):
-        # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
-        # `spacy init vectors ko` for a pipeline that will have a different
-        # tokenizer in the end. The languages need to match for the vectors
-        # to be imported and there's no way to pass a custom config to
-        # `init vectors`.
-        if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
-        return self._mecab_tokenizer
+        mecab = try_mecab_import()
+        self.mecab_tokenizer = mecab.Tagger(mecab_args)

     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)
@@ -68,13 +57,15 @@ class KoreanTokenizer(DummyTokenizer):
     def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
         # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
         # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
+        for line in self.mecab_tokenizer.parse(text).split("\n"):
+            if line == "EOS":
                 break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
+            surface, _, expr = line.partition("\t")
+            features = expr.split("/")[0].split(",")
+            tag = features[0]
+            lemma = "*"
+            if len(features) >= 8:
+                lemma = features[7]
             if lemma == "*":
                 lemma = surface
             yield {"surface": surface, "lemma": lemma, "tag": tag}
@@ -97,20 +88,94 @@ class Korean(Language):
     Defaults = KoreanDefaults


-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        import mecab_ko as MeCab

         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "the python package `mecab-ko`: pip install mecab-ko"
         ) from None


+@registry.tokenizers("spacy.KoreanNattoTokenizer.v1")
+def create_natto_tokenizer():
+    def korean_natto_tokenizer_factory(nlp):
+        return KoreanNattoTokenizer(nlp.vocab)
+
+    return korean_natto_tokenizer_factory
+
+
+class KoreanNattoTokenizer(DummyTokenizer):
+    def __init__(self, vocab: Vocab):
+        self.vocab = vocab
+        self._mecab = self._try_mecab_import()  # type: ignore[func-returns-value]
+        self._mecab_tokenizer = None
+
+    @property
+    def mecab_tokenizer(self):
+        # This is a property so that initializing a pipeline with blank:ko is
+        # possible without actually requiring mecab-ko, e.g. to run
+        # `spacy init vectors ko` for a pipeline that will have a different
+        # tokenizer in the end. The languages need to match for the vectors
+        # to be imported and there's no way to pass a custom config to
+        # `init vectors`.
+        if self._mecab_tokenizer is None:
+            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+        return self._mecab_tokenizer
+
+    def __reduce__(self):
+        return KoreanNattoTokenizer, (self.vocab,)
+
+    def __call__(self, text: str) -> Doc:
+        dtokens = list(self.detailed_tokens(text))
+        surfaces = [dt["surface"] for dt in dtokens]
+        doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+        for token, dtoken in zip(doc, dtokens):
+            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
+            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
+            if token.tag_ in TAG_MAP:
+                token.pos = TAG_MAP[token.tag_][POS]
+            else:
+                token.pos = X
+            token.lemma_ = dtoken["lemma"]
+        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
+        return doc
+
+    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
+        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
+        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
+        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
+            if node.is_eos():
+                break
+            surface = node.surface
+            feature = node.feature
+            tag, _, expr = feature.partition(",")
+            lemma, _, remainder = expr.partition("/")
+            if lemma == "*" or lemma == "":
+                lemma = surface
+            yield {"surface": surface, "lemma": lemma, "tag": tag}
+
+    def score(self, examples):
+        validate_examples(examples, "KoreanTokenizer.score")
+        return Scorer.score_tokenization(examples)
+
+    def _try_mecab_import(self):
+        try:
+            from natto import MeCab
+
+            return MeCab
+        except ImportError:
+            raise ImportError(
+                'The Korean Natto tokenizer ("spacy.ko.KoreanNattoTokenizer") requires '
+                "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
+                "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
+                "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            ) from None
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index e70fcd6dd..92810118a 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -239,7 +239,7 @@ def hsb_tokenizer():

 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab_ko")
     return get_lang_class("ko")().tokenizer


@@ -256,6 +256,20 @@ def ko_tokenizer_tokenizer():
     return nlp.tokenizer


+@pytest.fixture(scope="session")
+def ko_tokenizer_natto():
+    pytest.importorskip("natto")
+    config = {
+        "nlp": {
+            "tokenizer": {
+                "@tokenizers": "spacy.KoreanNattoTokenizer.v1",
+            }
+        }
+    }
+    nlp = get_lang_class("ko").from_config(config)
+    return nlp.tokenizer
+
+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb")().tokenizer
diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py
index 7782ca4bc..0c389b9ce 100644
--- a/spacy/tests/lang/ko/test_lemmatization.py
+++ b/spacy/tests/lang/ko/test_lemmatization.py
@@ -7,3 +7,11 @@ import pytest
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
     assert test_lemma == lemma
+
+
+@pytest.mark.parametrize(
+    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+)
+def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
+    test_lemma = ko_tokenizer_natto(word)[0].lemma_
+    assert test_lemma == lemma
diff --git a/spacy/tests/lang/ko/test_serialize.py b/spacy/tests/lang/ko/test_serialize.py
index 75288fcc5..35d28d42a 100644
--- a/spacy/tests/lang/ko/test_serialize.py
+++ b/spacy/tests/lang/ko/test_serialize.py
@@ -22,3 +22,23 @@ def test_ko_tokenizer_pickle(ko_tokenizer):
     b = pickle.dumps(ko_tokenizer)
     ko_tokenizer_re = pickle.loads(b)
     assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
+
+
+def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
+    tokenizer_bytes = ko_tokenizer_natto.to_bytes()
+    nlp = Korean()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ko_tokenizer_natto.to_disk(file_path)
+        nlp = Korean()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+
+
+def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
+    b = pickle.dumps(ko_tokenizer_natto)
+    ko_tokenizer_natto_re = pickle.loads(b)
+    assert ko_tokenizer_natto.to_bytes() == ko_tokenizer_natto_re.to_bytes()
diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py
index 6e06e405e..e7f8a5c0d 100644
--- a/spacy/tests/lang/ko/test_tokenizer.py
+++ b/spacy/tests/lang/ko/test_tokenizer.py
@@ -19,6 +19,8 @@ POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")]
 # fmt: on

+# tests for ko_tokenizer (default KoreanTokenizer)
+

 @pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
 def test_ko_tokenizer(ko_tokenizer, text, expected_tokens):
@@ -44,7 +46,7 @@ def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     assert pos == expected_pos.split()


-def test_ko_empty_doc(ko_tokenizer):
+def test_ko_tokenizer_empty_doc(ko_tokenizer):
     tokens = ko_tokenizer("")
     assert len(tokens) == 0

@@ -55,6 +57,44 @@ def test_ko_tokenizer_unknown_tag(ko_tokenizer):
     assert tokens[1].pos_ == "X"


+# same tests for ko_tokenizer_natto (KoreanNattoTokenizer)
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_ko_tokenizer_natto(ko_tokenizer_natto, text, expected_tokens):
+    tokens = [token.text for token in ko_tokenizer_natto(text)]
+    assert tokens == expected_tokens.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", TAG_TESTS)
+def test_ko_tokenizer_natto_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = [token.tag_ for token in ko_tokenizer_natto(text)]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
+def test_ko_tokenizer_natto_full_tags(ko_tokenizer_natto, text, expected_tags):
+    tags = ko_tokenizer_natto(text).user_data["full_tags"]
+    assert tags == expected_tags.split()
+
+
+@pytest.mark.parametrize("text,expected_pos", POS_TESTS)
+def test_ko_tokenizer_natto_pos(ko_tokenizer_natto, text, expected_pos):
+    pos = [token.pos_ for token in ko_tokenizer_natto(text)]
+    assert pos == expected_pos.split()
+
+
+def test_ko_tokenizer_natto_empty_doc(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("")
+    assert len(tokens) == 0
+
+
+@pytest.mark.issue(10535)
+def test_ko_tokenizer_natto_unknown_tag(ko_tokenizer_natto):
+    tokens = ko_tokenizer_natto("미닛 리피터")
+    assert tokens[1].pos_ == "X"
+
+
 # fmt: off
 SPACY_TOKENIZER_TESTS = [
     ("있다.", "있다 ."),
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index e20227455..c0b9d4dc6 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -21,7 +21,7 @@ def test_build_dependencies():
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
-        "natto-py",
+        "mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",
diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md
index 56992e7e3..a2bf72d02 100644
--- a/website/docs/usage/models.md
+++ b/website/docs/usage/models.md
@@ -268,18 +268,49 @@ used for training the current [Japanese pipelines](/models/ja).

 ### Korean language support {#korean}

-> #### mecab-ko tokenizer
+There are currently three built-in options for Korean tokenization, two based on
+[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) and one
+using the rule-based tokenizer.
+
+> #### Default mecab-ko tokenizer
 >
 > ```python
+> # uses mecab-ko-dic
 > nlp = spacy.blank("ko")
+>
+> # with custom mecab args
+> mecab_args = "-d /path/to/dicdir -u /path/to/userdic"
+> config = {"nlp": {"tokenizer": {"mecab_args": mecab_args}}}
+> nlp = spacy.blank("ko", config=config)
 > ```

-The default MeCab-based Korean tokenizer requires:
+The default MeCab-based Korean tokenizer requires the python package
+[`mecab-ko`](https://pypi.org/project/mecab-ko/) and no further system
+requirements.
+
+The `natto-py` MeCab-based tokenizer (the previous default for spaCy v3.4 and
+earlier) is available as `spacy.KoreanNattoTokenizer.v1`. It requires:

 - [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
 - [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
 - [natto-py](https://github.com/buruzaemon/natto-py)

+To use this tokenizer, edit `[nlp.tokenizer]` in your config:
+
+> #### natto-py MeCab-ko tokenizer
+>
+> ```python
+> config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
+> nlp = spacy.blank("ko", config=config)
+> ```
+
+```ini
+### config.cfg
+[nlp]
+lang = "ko"
+tokenizer = {"@tokenizers" = "spacy.KoreanNattoTokenizer.v1"}
+```
+
 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
 than MeCab. To configure a Korean pipeline with the rule-based tokenizer:
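A minimal usage sketch of the two tokenizer options touched by this diff (not part of the patch itself): it assumes the `mecab-ko` package is installed for the default path, and mecab-ko, mecab-ko-dic, and natto-py for the legacy path. The example sentences are taken from the test suite above.

```python
# Sketch: exercising the new default mecab-ko tokenizer and the legacy
# natto-py tokenizer registered as "spacy.KoreanNattoTokenizer.v1".
# Assumes the relevant packages are installed (see the docs changes above).
import spacy

# Default tokenizer: uses the `mecab-ko` python package with mecab-ko-dic.
nlp = spacy.blank("ko")
doc = nlp("서울 타워 근처에 살고 있습니다.")
print([(t.text, t.tag_, t.pos_, t.lemma_) for t in doc])
print(doc.user_data["full_tags"])  # full MeCab tags stored by the tokenizer

# Legacy natto-py tokenizer, selected via the [nlp.tokenizer] config block.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
nlp_natto = spacy.blank("ko", config=config)
print([t.text for t in nlp_natto("미닛 리피터")])
```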