spaCy (https://github.com/explosion/spaCy.git)

Make Korean tokenizer easier to use

This commit is contained in:
parent 1e8bac99f3
commit c50e1150d5

@@ -120,7 +120,7 @@ ja =
     sudachipy>=0.5.2,!=0.6.1
     sudachidict_core>=20211220
 ko =
-    natto-py>=0.9.0
+    python-mecab-ko>=1.3.3
 th =
     pythainlp>=2.0
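
The dependency change above swaps natto-py for python-mecab-ko under the ko extra (presumably in setup.cfg). A minimal usage sketch, assuming python-mecab-ko and its dictionary are installed, e.g. via `pip install spacy[ko]` as the updated error message further down suggests:

    import spacy

    # Create a blank Korean pipeline; tokenizing text uses the default
    # MeCab-based KoreanTokenizer and therefore needs python-mecab-ko.
    nlp = spacy.blank("ko")
    doc = nlp("서울 타워 근처에 살고 있습니다.")  # sentence from the test suite
    print([token.text for token in doc])
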

@@ -38,46 +38,38 @@ class KoreanTokenizer(DummyTokenizer):
     @property
     def mecab_tokenizer(self):
         # This is a property so that initializing a pipeline with blank:ko is
-        # possible without actually requiring mecab-ko, e.g. to run
+        # possible without actually requiring python-mecab-ko, e.g. to run
         # `spacy init vectors ko` for a pipeline that will have a different
         # tokenizer in the end. The languages need to match for the vectors
         # to be imported and there's no way to pass a custom config to
         # `init vectors`.
         if self._mecab_tokenizer is None:
-            self._mecab_tokenizer = self._mecab("-F%f[0],%f[7]")
+            self._mecab_tokenizer = self._mecab()
         return self._mecab_tokenizer

     def __reduce__(self):
         return KoreanTokenizer, (self.vocab,)

     def __call__(self, text: str) -> Doc:
-        dtokens = list(self.detailed_tokens(text))
-        surfaces = [dt["surface"] for dt in dtokens]
+        dtokens = self.mecab_tokenizer.parse(text)
+        surfaces = [dt.surface for dt in dtokens]
+
         doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces)))
+
         for token, dtoken in zip(doc, dtokens):
-            first_tag, sep, eomi_tags = dtoken["tag"].partition("+")
-            token.tag_ = first_tag  # stem(어간) or pre-final(선어말 어미)
-            if token.tag_ in TAG_MAP:
-                token.pos = TAG_MAP[token.tag_][POS]
+            token.tag_ = dtoken.pos
+            first_tag = (
+                dtoken.feature.start_pos or dtoken.pos
+            )  # stem(어간) or pre-final(선어말 어미)
+
+            if first_tag in TAG_MAP:
+                token.pos = TAG_MAP[first_tag][POS]
             else:
                 token.pos = X
-            token.lemma_ = dtoken["lemma"]
-        doc.user_data["full_tags"] = [dt["tag"] for dt in dtokens]
-        return doc
-
-    def detailed_tokens(self, text: str) -> Iterator[Dict[str, Any]]:
-        # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3],
-        # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], *
-        for node in self.mecab_tokenizer.parse(text, as_nodes=True):
-            if node.is_eos():
-                break
-            surface = node.surface
-            feature = node.feature
-            tag, _, expr = feature.partition(",")
-            lemma, _, remainder = expr.partition("/")
-            if lemma == "*":
-                lemma = surface
-            yield {"surface": surface, "lemma": lemma, "tag": tag}
+            token.lemma_ = get_lemma(dtoken)
+
+        return doc

     def score(self, examples):
         validate_examples(examples, "KoreanTokenizer.score")
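
The rewritten __call__ above consumes self.mecab_tokenizer.parse(text) directly instead of iterating natto-py nodes, so the detailed_tokens helper and the "-F%f[0],%f[7]" format string are no longer needed. A rough standalone sketch of the morpheme objects the new code relies on, inferred from the attributes it accesses (surface, pos, feature.start_pos, feature.expression); treat the exact shapes as assumptions about python-mecab-ko rather than documented API:

    from mecab import MeCab  # provided by python-mecab-ko

    mecab = MeCab()
    for morpheme in mecab.parse("영등포구에 있는 맛집 좀 알려주세요."):
        # Each morpheme carries the surface form, a possibly compound tag such
        # as "VV+EC", and a feature record whose start_pos holds the first
        # sub-tag and whose expression (if present) lists the sub-morphemes.
        print(morpheme.surface, morpheme.pos, morpheme.feature.start_pos)
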

@@ -97,20 +89,29 @@ class Korean(Language):
     Defaults = KoreanDefaults


-def try_mecab_import() -> None:
+def try_mecab_import():
     try:
-        from natto import MeCab
+        from mecab import MeCab

         return MeCab
     except ImportError:
         raise ImportError(
             'The Korean tokenizer ("spacy.ko.KoreanTokenizer") requires '
-            "[mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), "
-            "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), "
-            "and [natto-py](https://github.com/buruzaemon/natto-py)"
+            "[python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko). "
+            "Install with `pip install python-mecab-ko` or "
+            "install spaCy with `pip install spacy[ko]`."
         ) from None


+def get_lemma(m):
+    expr = m.feature.expression
+
+    if expr is None:
+        return m.surface
+    else:
+        return "+".join([e.split("/")[0] for e in expr.split("+")])
+
+
 def check_spaces(text, tokens):
     prev_end = -1
     start = 0
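
The new get_lemma helper joins the first field of each "+"-separated entry in the morpheme's expression feature and falls back to the surface form when there is no expression. A small worked example on plain strings, using an assumed mecab-ko-dic-style expression value; the expected result matches the updated lemmatizer test below:

    def get_lemma_sketch(surface, expression):
        # Mirrors the get_lemma logic above without a morpheme object.
        if expression is None:
            return surface
        return "+".join(part.split("/")[0] for part in expression.split("+"))

    # "새롭/VA/*+ᆫ/ETM/*" is an assumed expression value for "새로운".
    print(get_lemma_sketch("새로운", "새롭/VA/*+ᆫ/ETM/*"))  # -> 새롭+ᆫ
    print(get_lemma_sketch("서울", None))  # -> 서울
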

@@ -239,7 +239,7 @@ def hsb_tokenizer():

 @pytest.fixture(scope="session")
 def ko_tokenizer():
-    pytest.importorskip("natto")
+    pytest.importorskip("mecab")
     return get_lang_class("ko")().tokenizer


@@ -2,7 +2,14 @@ import pytest


 @pytest.mark.parametrize(
-    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
+    "word,lemma",
+    [
+        ("새로운", "새롭+ᆫ"),
+        ("빨간", "빨갛+ᆫ"),
+        ("클수록", "크+ᆯ수록"),
+        ("뭡니까", "뭣+이+ᄇ니까"),
+        ("됐다", "되+었"),
+    ],
 )
 def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
     test_lemma = ko_tokenizer(word)[0].lemma_
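
The expected lemmas now keep every underlying morpheme joined with "+" (새로운 -> 새롭+ᆫ) rather than only the stem. A quick check of the same behaviour through the public API, assuming python-mecab-ko is installed:

    import spacy

    nlp = spacy.blank("ko")
    # Per the updated test cases above, lemma_ is the full morpheme sequence.
    print(nlp("새로운")[0].lemma_)  # expected: 새롭+ᆫ
    print(nlp("됐다")[0].lemma_)  # expected: 되+었
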

@@ -8,10 +8,7 @@ TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타
 TAG_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "NNP NNG NNG JKB VV EC VX EF SF"),
              ("영등포구에 있는 맛집 좀 알려주세요.",
-              "NNP JKB VV ETM NNG MAG VV VX EP SF")]
-
-FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.",
-                   "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]
+              "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")]

 POS_TESTS = [("서울 타워 근처에 살고 있습니다.",
               "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"),

@@ -32,12 +29,6 @@ def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags):
     assert tags == expected_tags.split()


-@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS)
-def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags):
-    tags = ko_tokenizer(text).user_data["full_tags"]
-    assert tags == expected_tags.split()
-
-
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ko_tokenizer(text)]
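
Because token.tag_ now carries the full MeCab tag (e.g. "VV+EC"), the separate FULL_TAG_TESTS and the doc.user_data["full_tags"] entry become redundant and are removed. A sketch of the merged behaviour, assuming python-mecab-ko is installed:

    import spacy

    nlp = spacy.blank("ko")
    doc = nlp("영등포구에 있는 맛집 좀 알려주세요.")
    # Expected per TAG_TESTS above: NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF
    print([token.tag_ for token in doc])
    # pos_ is mapped from the first sub-tag through TAG_MAP, falling back to X.
    print([token.pos_ for token in doc])
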

@@ -22,7 +22,7 @@ def test_build_dependencies():
     # ignore language-specific packages that shouldn't be installed by all
     libs_ignore_setup = [
         "fugashi",
-        "natto-py",
+        "python-mecab-ko",
         "pythainlp",
         "sudachipy",
         "sudachidict_core",

@@ -272,9 +272,7 @@ used for training the current [Japanese pipelines](/models/ja).

 The default MeCab-based Korean tokenizer requires:

-- [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md)
-- [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic)
-- [natto-py](https://github.com/buruzaemon/natto-py)
+- [python-mecab-ko](https://github.com/jonghwanhyeon/python-mecab-ko)

 For some Korean datasets and tasks, the
 [rule-based tokenizer](/usage/linguistic-features#tokenization) is better-suited

@@ -198,16 +198,8 @@
       "name": "Korean",
       "dependencies": [
         {
-          "name": "mecab-ko",
-          "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
-        },
-        {
-          "name": "mecab-ko-dic",
-          "url": "https://bitbucket.org/eunjeon/mecab-ko-dic"
-        },
-        {
-          "name": "natto-py",
-          "url": "https://github.com/buruzaemon/natto-py"
+          "name": "python-mecab-ko",
+          "url": "https://github.com/jonghwanhyeon/python-mecab-ko"
         }
       ],
       "example": "이것은 문장입니다.",