diff --git a/setup.cfg b/setup.cfg index cddc5148c..5df1728a4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -120,7 +120,7 @@ ja = sudachipy>=0.5.2,!=0.6.1 sudachidict_core>=20211220 ko = - natto-py>=0.9.0 + python-mecab-ko>=1.3.3 th = pythainlp>=2.0 diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 0e02e4a2d..a0de71bca 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -38,46 +38,38 @@ class KoreanTokenizer(DummyTokenizer): @property def mecab_tokenizer(self): # This is a property so that initializing a pipeline with blank:ko is - # possible without actually requiring mecab-ko, e.g. to run + # possible without actually requiring python-mecab-ko, e.g. to run # `spacy init vectors ko` for a pipeline that will have a different # tokenizer in the end. The languages need to match for the vectors # to be imported and there's no way to pass a custom config to # `init vectors`. if self._mecab_tokenizer is None: - self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]") + self._mecab_tokenizer = self._mecab() return self._mecab_tokenizer def __reduce__(self): return KoreanTokenizer, (self.vocab,) def __call__(self, text: str) -> Doc: - dtokens = list(self.detailed_tokens(text)) - surfaces = [dt["surface"] for dt in dtokens] + dtokens = self.mecab_tokenizer.parse(text) + surfaces = [dt.surface for dt in dtokens] + doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces))) + for token, dtoken in zip(doc, dtokens): - first_tag, sep, eomi_tags = dtoken["tag"].partition("+") - token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) - if token.tag_ in TAG_MAP: - token.pos = TAG_MAP[token.tag_][POS] + token.tag_ = dtoken.pos + first_tag = ( + dtoken.feature.start_pos or dtoken.pos + ) # stem(어간) or pre-final(선어말 어미) + + if first_tag in TAG_MAP: + token.pos = TAG_MAP[first_tag][POS] else: token.pos = X - token.lemma_ = dtoken["lemma"] - doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens] - return doc - def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]: - # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], - # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * - for node in self.mecab_tokenizer.parse(text, as_nodes=True): - if node.is_eos(): - break - surface = node.surface - feature = node.feature - tag, _, expr = feature.partition(",") - lemma, _, remainder = expr.partition("/") - if lemma == "*": - lemma = surface - yield {"surface": surface, "lemma": lemma, "tag": tag} + token.lemma_ = get_lemma(dtoken) + + return doc def score(self, examples): validate_examples(examples, "KoreanTokenizer.score") @@ -97,20 +89,29 @@ class Korean(Language): Defaults = KoreanDefaults -def try_mecab_import() -> None: +def try_mecab_import(): try: - from natto import MeCab + from mecab import MeCab return MeCab except ImportError: raise ImportError( 'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires ' - "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " - "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " - "and [natto-py](https://github.com/buruzaemon/natto-py)" + "[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko). " + "Install with `pip install python-mecab-ko` or " + "install spaCy with `pip install spacy[ko]`." ) from None +def get_lemma(m): + expr = m.feature.expression + + if expr is None: + return m.surface + else: + return "+".join([e.split("/")[0] for e in expr.split("+")]) + + def check_spaces(text, tokens): prev_end = -1 start = 0 diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 3a5c8e451..506e01233 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -239,7 +239,7 @@ def hsb_tokenizer(): @pytest.fixture(scope="session") def ko_tokenizer(): - pytest.importorskip("natto") + pytest.importorskip("mecab") return get_lang_class("ko")().tokenizer diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 7782ca4bc..de909b480 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -2,7 +2,14 @@ import pytest @pytest.mark.parametrize( - "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")] + "word,lemma", + [ + ("새로운", "새롭+ᆫ"), + ("빨간", "빨갛+ᆫ"), + ("클수록", "크+ᆯ수록"), + ("뭡니까", "뭣+이+ᄇ니까"), + ("됐다", "되+었"), + ], ) def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): test_lemma = ko_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index 6e06e405e..bb02aa93c 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -8,10 +8,7 @@ TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타 TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", "NNP NNG NNG JKB VV EC VX EF SF"), ("영등포구에 있는 맛집 좀 알려주세요.", - "NNP JKB VV ETM NNG MAG VV VX EP SF")] - -FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", - "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] + "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] POS_TESTS = [("서울 타워 근처에 살고 있습니다.", "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), @@ -32,12 +29,6 @@ def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags): assert tags == expected_tags.split() -@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS) -def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags): - tags = ko_tokenizer(text).user_data["full_tags"] - assert tags == expected_tags.split() - - @pytest.mark.parametrize("text,expected_pos", POS_TESTS) def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): pos = [token.pos_ for token in ko_tokenizer(text)] diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py index b403f274f..2bf1b756b 100644 --- a/spacy/tests/package/test_requirements.py +++ b/spacy/tests/package/test_requirements.py @@ -22,7 +22,7 @@ def test_build_dependencies(): # ignore language-specific packages that shouldn't be installed by all libs_ignore_setup = [ "fugashi", - "natto-py", + "python-mecab-ko", "pythainlp", "sudachipy", "sudachidict_core", diff --git a/website/docs/usage/models.mdx b/website/docs/usage/models.mdx index 3b8a5fa3f..320241309 100644 --- a/website/docs/usage/models.mdx +++ b/website/docs/usage/models.mdx @@ -272,9 +272,7 @@ used for training the current [Japanese pipelines](/models/ja). The default MeCab-based Korean tokenizer requires: -- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md) -- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic) -- [natto-py](https://github.com/buruzaemon/natto-py) +- [python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko) For some Korean datasets and tasks, the [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited diff --git a/website/meta/languages.json b/website/meta/languages.json index 46c0d3adb..4ca7bdbbe 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -198,16 +198,8 @@ "name": "Korean", "dependencies": [ { - "name": "mecab-ko", - "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" - }, - { - "name": "mecab-ko-dic", - "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" - }, - { - "name": "natto-py", - "url": "https://github.com/buruzaemon/natto-py" + "name": "python-mecab-ko", + "url": "https://github.com/jonghwanhyeon/python-mecab-ko" } ], "example": "이것은 문장입니다.",