spaCy/spacy/tests/lang/ko/test_serialize.py
Adriane Boyd 2a558a7cdc
Switch to mecab-ko as default Korean tokenizer (#11294)
* Switch to mecab-ko as default Korean tokenizer

Switch to the (confusingly-named) mecab-ko python module for default Korean
tokenization.

Maintain the previous `natto-py` tokenizer as
`spacy.KoreanNattoTokenizer.v1`.

* Temporarily run tests with mecab-ko tokenizer

* Fix types

* Fix duplicate test names

* Update requirements test

* Revert "Temporarily run tests with mecab-ko tokenizer"

This reverts commit d2083e7044.

* Add mecab_args setting, fix pickle for KoreanNattoTokenizer

* Fix length check

* Update docs

* Formatting

* Update natto-py error message

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>

Co-authored-by: Paul O'Leary McCann <polm@dampfkraft.com>
2022-08-26 10:11:18 +02:00

45 lines
1.4 KiB
Python

import pickle
from spacy.lang.ko import Korean
from ...util import make_tempdir
def test_ko_tokenizer_serialize(ko_tokenizer):
    """Round-trip the Korean tokenizer through bytes and through disk.

    The serialized form of the fixture tokenizer is the reference; a
    tokenizer taken from a fresh ``Korean()`` pipeline must serialize to the
    same bytes after loading from that form, both in memory and from a
    temporary directory.
    """
    expected = ko_tokenizer.to_bytes()

    # In-memory round trip: bytes -> fresh tokenizer -> bytes.
    pipeline = Korean()
    pipeline.tokenizer.from_bytes(expected)
    assert expected == pipeline.tokenizer.to_bytes()

    # On-disk round trip: disk -> fresh tokenizer -> bytes.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        ko_tokenizer.to_disk(target)
        pipeline = Korean()
        pipeline.tokenizer.from_disk(target)
        assert expected == pipeline.tokenizer.to_bytes()
def test_ko_tokenizer_pickle(ko_tokenizer):
    """Check that the Korean tokenizer survives a pickle round trip.

    The unpickled copy must serialize to the same bytes as the original.
    """
    # The payload is produced right here, so unpickling it is safe.
    clone = pickle.loads(pickle.dumps(ko_tokenizer))
    assert ko_tokenizer.to_bytes() == clone.to_bytes()
def test_ko_tokenizer_natto_serialize(ko_tokenizer_natto):
    """Round-trip the natto-py Korean tokenizer through bytes and disk.

    The serialized form of the fixture tokenizer is the reference; a freshly
    built natto tokenizer must serialize to the same bytes after loading
    from that form, both in memory and from a temporary directory.
    """
    # A bare ``Korean()`` builds the default tokenizer, not the natto one, so
    # the restore target is configured explicitly. Otherwise the natto
    # tokenizer's own from_bytes/from_disk would never be exercised.
    natto_config = {
        "nlp": {"tokenizer": {"@tokenizers": "spacy.KoreanNattoTokenizer.v1"}}
    }

    tokenizer_bytes = ko_tokenizer_natto.to_bytes()

    # In-memory round trip: bytes -> fresh natto tokenizer -> bytes.
    nlp = Korean.from_config(natto_config)
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    # On-disk round trip: disk -> fresh natto tokenizer -> bytes.
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        ko_tokenizer_natto.to_disk(file_path)
        nlp = Korean.from_config(natto_config)
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
def test_ko_tokenizer_natto_pickle(ko_tokenizer_natto):
    """Check that the natto-py Korean tokenizer survives a pickle round trip.

    The unpickled copy must serialize to the same bytes as the original.
    """
    # The payload is produced right here, so unpickling it is safe.
    clone = pickle.loads(pickle.dumps(ko_tokenizer_natto))
    assert ko_tokenizer_natto.to_bytes() == clone.to_bytes()