Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-28 19:06:33 +03:00)
Commit 2a558a7cdc
* Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko Python module for default Korean tokenization.
Maintain the previous `natto-py` tokenizer as `spacy.KoreanNattoTokenizer.v1` (see the first sketch after this list).
* Temporarily run tests with mecab-ko tokenizer
* Fix types
* Fix duplicate test names
* Update requirements test
* Revert "Temporarily run tests with mecab-ko tokenizer"
This reverts commit d2083e7044.
* Add mecab_args setting, fix pickle for KoreanNattoTokenizer (see the second sketch after this list)
* Fix length check
* Update docs
* Formatting
* Update natto-py error message
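For reference, the legacy tokenizer can still be selected by its registered name. A minimal sketch, assuming the standard config-override argument of `spacy.blank` (the sample sentence is illustrative):

import spacy

# Opt back in to the legacy natto-py tokenizer via its registered name.
nlp = spacy.blank(
    "ko",
    config={"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}},
)
doc = nlp("안녕하세요.")  # tokenized with the natto-py backend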
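And a hedged sketch of the new mecab_args setting, assuming it forwards extra MeCab command-line options to mecab-ko and that the default tokenizer is registered as `spacy.ko.KoreanTokenizer` (the dictionary path is purely illustrative):

import spacy

config = {
    "nlp": {
        "tokenizer": {
            "@tokenizers": "spacy.ko.KoreanTokenizer",
            # Hypothetical user dictionary path, assumed to be passed
            # through to mecab-ko as extra MeCab options.
            "mecab_args": "-d /path/to/custom/dic",
        }
    }
}
nlp = spacy.blank("ko", config=config)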
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
18 lines · 622 B · Python
import pytest


@pytest.mark.parametrize(
    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
)
def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma):
    test_lemma = ko_tokenizer(word)[0].lemma_
    assert test_lemma == lemma


@pytest.mark.parametrize(
    "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")]
)
def test_ko_lemmatizer_natto_assigns(ko_tokenizer_natto, word, lemma):
    test_lemma = ko_tokenizer_natto(word)[0].lemma_
    assert test_lemma == lemma
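The `ko_tokenizer` and `ko_tokenizer_natto` fixtures are not defined in this file. A plausible conftest.py sketch, assuming the session-scoped fixture pattern spaCy's test suite uses for other language tokenizers (the exact fixture bodies are an assumption, not copied from the repo):

import pytest

from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def ko_tokenizer():
    # Skip these tests if the new default backend is missing
    # (the import name mecab_ko is assumed).
    pytest.importorskip("mecab_ko")
    return get_lang_class("ko")().tokenizer


@pytest.fixture(scope="session")
def ko_tokenizer_natto():
    # Skip if natto-py is missing; select the legacy tokenizer by name.
    pytest.importorskip("natto")
    config = {"nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}}
    nlp = get_lang_class("ko").from_config(config)
    return nlp.tokenizer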