Make Korean tokenizer easier to use

Mycroft Kang 2023-02-25 00:46:42 +09:00
parent 1e8bac99f3
commit c50e1150d5
8 changed files with 45 additions and 56 deletions
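After this change, the `ko` extra pulls in a single pip-installable dependency instead of the system-level mecab-ko and mecab-ko-dic builds that the old docs required. A minimal usage sketch (the sentence is borrowed from the tokenizer tests below; the printed attributes are the ones this commit assigns):

# pip install "spacy[ko]"   (or: pip install python-mecab-ko)
import spacy

nlp = spacy.blank("ko")  # uses KoreanTokenizer; MeCab itself is created lazily on first use
doc = nlp("서울 타워 근처에 살고 있습니다.")
print([(t.text, t.tag_, t.pos_, t.lemma_) for t in doc])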

View File

@@ -120,7 +120,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    python-mecab-ko>=1.3.3
 th =
     pythainlp>=2.0

View File

@@ -38,46 +38,38 @@ class KoreanTokenizer(DummyTokenizer):
     @property
     def mecab_tokenizer(self):
         # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
+        # possible without actually requiring python-mecab-ko, e.g. to run
         # `spacy init vectors ko` for a pipeline that will have a different
         # tokenizer in the end. The languages need to match for the vectors
         # to be imported and there's no way to pass a custom config to
         # `init vectors`.
         if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+            self._mecab_tokenizer = self._mecab()
         return self._mecab_tokenizer

     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)

     def __call__(self, text: str) -> Doc:
-        dtokens = list(self.detailed_tokens(text))
-        surfaces = [dt["surface"] for dt in dtokens]
+        dtokens = self.mecab_tokenizer.parse(text)
+        surfaces = [dt.surface for dt in dtokens]
         doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
         for token, dtoken in zip(doc, dtokens):
-            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
-            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            if token.tag_ in TAG_MAP:
-                token.pos = TAG_MAP[token.tag_][POS]
+            token.tag_ = dtoken.pos
+            first_tag = (
+                dtoken.feature.start_pos or dtoken.pos
+            )  # stem(어간) or pre-final(선어말 어미)
+            if first_tag in TAG_MAP:
+                token.pos = TAG_MAP[first_tag][POS]
             else:
                 token.pos = X
-            token.lemma_ = dtoken["lemma"]
-        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
-        return doc
-
-    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
-        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
-        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
-                break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
-            if lemma == "*":
-                lemma = surface
-            yield {"surface": surface, "lemma": lemma, "tag": tag}
+            token.lemma_ = get_lemma(dtoken)
+        return doc

     def score(self, examples):
         validate_examples(examples, "KoreanTokenizer.score")
@@ -97,20 +89,29 @@ class Korean(Language):
     Defaults = KoreanDefaults


-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        from mecab import MeCab
+
+        return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko). "
+            "Install with `pip install python-mecab-ko` or "
+            "install spaCy with `pip install spacy[ko]`."
         ) from None


+def get_lemma(m):
+    expr = m.feature.expression
+    if expr is None:
+        return m.surface
+    else:
+        return "+".join([e.split("/")[0] for e in expr.split("+")])
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
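The rewritten tokenizer reads just four attributes of python-mecab-ko's morpheme objects: `.surface`, `.pos`, `.feature.start_pos`, and `.feature.expression`. A sketch of how `first_tag` and `get_lemma` behave, using a stand-in object with illustrative values consistent with the lemmatizer test below (real values come from mecab-ko-dic):

from types import SimpleNamespace

# Stand-in for a python-mecab-ko morpheme; only the attributes the new
# tokenizer code touches are modeled, with assumed mecab-ko-dic-style values.
m = SimpleNamespace(
    surface="새로운",
    pos="VA+ETM",  # composite tag for an inflected form
    feature=SimpleNamespace(start_pos="VA", expression="새롭/VA/*+ᆫ/ETM/*"),
)

first_tag = m.feature.start_pos or m.pos  # "VA" -> used for the TAG_MAP lookup

# Same logic as the get_lemma helper added above: keep the first field of
# each "morpheme/tag/semantic" segment and rejoin the segments with "+".
expr = m.feature.expression
lemma = m.surface if expr is None else "+".join(e.split("/")[0] for e in expr.split("+"))
print(first_tag, lemma)  # VA 새롭+ᆫ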

View File

@@ -239,7 +239,7 @@ def hsb_tokenizer():

 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab")
     return get_lang_class("ko")().tokenizer
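Note that the distribution and import names differ: the package installs as python-mecab-ko but is imported as mecab (as the `from mecab import MeCab` line above shows), which is why the fixture skips on the module name:

import pytest

# Skips the Korean tests when the "mecab" module (provided by the
# python-mecab-ko distribution) is not importable, instead of failing them.
mecab = pytest.importorskip("mecab")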

View File

@@ -2,7 +2,14 @@ import pytest

 @pytest.mark.parametrize(
-    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+    "word,lemma",
+    [
+        ("새로운", "새롭+ᆫ"),
+        ("빨간", "빨갛+ᆫ"),
+        ("클수록", "크+ᆯ수록"),
+        ("뭡니까", "뭣+이+ᄇ니까"),
+        ("됐다", "되+었"),
+    ],
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
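The expected lemmas now come straight from mecab-ko-dic's expression field: full morpheme decompositions joined with `+`, written with conjoining Unicode jamo rather than the visually similar standalone compatibility jamo. A quick check of the distinction, assuming the test strings use the conjoining forms as shown:

import unicodedata

# "새롭+ᆫ" ends in a conjoining (jongseong) jamo, not a compatibility jamo.
print(unicodedata.name("\u11ab"))  # HANGUL JONGSEONG NIEUN
print(unicodedata.name("\u3134"))  # HANGUL LETTER NIEUN (compatibility jamo)
print("\u11ab" == "\u3134")        # False: distinct code points, so they compare unequal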

View File

@@ -8,10 +8,7 @@ TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타
 TAG_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "NNP NNG NNG JKB VV EC VX EF SF"),
              ("영등포구에 있는 맛집 좀 알려주세요.",
-              "NNP JKB VV ETM NNG MAG VV VX EP SF")]
-
-FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.",
-                   "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]
+              "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]

 POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"),
@@ -32,12 +29,6 @@ def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags):
     assert tags == expected_tags.split()


-@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
-def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
-    tags = ko_tokenizer(text).user_data["full_tags"]
-    assert tags == expected_tags.split()
-
-
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ko_tokenizer(text)]

View File

@@ -22,7 +22,7 @@ def test_build_dependencies():
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
-        "natto-py",
+        "python-mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",

View File

@@ -272,9 +272,7 @@ used for training the current [Japanese pipelines](/models/ja).

 The default MeCab-based Korean tokenizer requires:

-- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
-- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
-- [natto-py](https://github.com/buruzaemon/natto-py)
+- [python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)

 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited
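For pipelines that opt into the rule-based tokenizer mentioned here, the swap is the standard tokenizer-override pattern from the spaCy docs, which avoids the MeCab dependency entirely:

import spacy

# Replace the MeCab-based default with spaCy's rule-based tokenizer, so
# python-mecab-ko is not required at all for this pipeline.
config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.Tokenizer.v1"}}}
nlp = spacy.blank("ko", config=config)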

View File

@@ -198,16 +198,8 @@
             "name": "Korean",
             "dependencies": [
                 {
-                    "name": "mecab-ko",
-                    "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
-                },
-                {
-                    "name": "mecab-ko-dic",
-                    "url": "https://bitbucket.org/eunjeon/mecab-ko-dic"
-                },
-                {
-                    "name": "natto-py",
-                    "url": "https://github.com/buruzaemon/natto-py"
+                    "name": "python-mecab-ko",
+                    "url": "https://github.com/jonghwanhyeon/python-mecab-ko"
                 }
             ],
             "example": "이것은 문장입니다.",