mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
2a558a7cdc
* Switch to mecab-ko as default Korean tokenizer
Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.
Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.
* Temporarily run tests with mecab-ko tokenizer
* Fix types
* Fix duplicate test names
* Update requirements test
* Revert "Temporarily run tests with mecab-ko tokenizer"
This reverts commit d2083e7044.
* Add mecab_args setting, fix pickle for KoreanNattoTokenizer
* Fix length check
* Update docs
* Formatting
* Update natto-py error message
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
45 lines
1.4 KiB
Python
45 lines
1.4 KiB
Python
import pickle
|
|
|
|
from spacy.lang.ko import Korean
|
|
from ...util import make_tempdir
|
|
|
|
|
|
def test_ko_tokenizer_serialize(ko_tokenizer):
    """Round-trip the `ko_tokenizer` fixture through bytes and through disk.

    In both cases the serialized state is loaded into the tokenizer of a
    fresh `Korean()` instance, and that tokenizer must serialize back to
    exactly the same bytes as the original.
    """
    expected = ko_tokenizer.to_bytes()

    # In-memory round trip: to_bytes -> from_bytes -> to_bytes.
    bytes_nlp = Korean()
    bytes_nlp.tokenizer.from_bytes(expected)
    assert expected == bytes_nlp.tokenizer.to_bytes()

    # On-disk round trip: to_disk -> from_disk -> to_bytes.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        ko_tokenizer.to_disk(target)
        disk_nlp = Korean()
        disk_nlp.tokenizer.from_disk(target)
        assert expected == disk_nlp.tokenizer.to_bytes()
|
|
|
|
|
|
def test_ko_tokenizer_pickle(ko_tokenizer):
    """Check that pickling and unpickling `ko_tokenizer` preserves its bytes.

    The unpickled copy must produce the same `to_bytes()` output as the
    original tokenizer.
    """
    restored = pickle.loads(pickle.dumps(ko_tokenizer))
    assert ko_tokenizer.to_bytes() == restored.to_bytes()
|
|
|
|
|
|
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
    """Round-trip the `ko_tokenizer_natto` fixture through bytes and disk.

    In both cases the serialized state is loaded into the tokenizer of a
    fresh `Korean()` instance, and that tokenizer must serialize back to
    exactly the same bytes as the original.
    """
    expected = ko_tokenizer_natto.to_bytes()

    # In-memory round trip: to_bytes -> from_bytes -> to_bytes.
    bytes_nlp = Korean()
    bytes_nlp.tokenizer.from_bytes(expected)
    assert expected == bytes_nlp.tokenizer.to_bytes()

    # On-disk round trip: to_disk -> from_disk -> to_bytes.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        ko_tokenizer_natto.to_disk(target)
        disk_nlp = Korean()
        disk_nlp.tokenizer.from_disk(target)
        assert expected == disk_nlp.tokenizer.to_bytes()
|
|
|
|
|
|
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
    """Check that a pickle round trip of `ko_tokenizer_natto` keeps its bytes.

    The unpickled copy must produce the same `to_bytes()` output as the
    original tokenizer.
    """
    restored = pickle.loads(pickle.dumps(ko_tokenizer_natto))
    assert ko_tokenizer_natto.to_bytes() == restored.to_bytes()
|