spaCy/spacy/tests/lang/th/test_serialize.py
Adriane Boyd c5de9b463a
Update custom tokenizer APIs and pickling (#8972)
* Fix incorrect pickling of Japanese and Korean pipelines, which led to
the entire pipeline being reset if pickled

* Enable pickling of Vietnamese tokenizer

* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and
Vietnamese so that only the `Vocab` is required for initialization
2021-08-19 14:37:47 +02:00

25 lines
706 B
Python

import pickle
from spacy.lang.th import Thai
from ...util import make_tempdir
def test_th_tokenizer_serialize(th_tokenizer):
    """Check that the tokenizer's bytes survive a bytes and a disk round trip.

    The serialized form of ``th_tokenizer`` is loaded into the tokenizer of a
    fresh ``Thai()`` pipeline, first via ``from_bytes`` and then via
    ``to_disk``/``from_disk``; each time the reloaded tokenizer must
    serialize back to the same bytes.
    """
    expected = th_tokenizer.to_bytes()

    # In-memory round trip.
    fresh_nlp = Thai()
    fresh_nlp.tokenizer.from_bytes(expected)
    assert expected == fresh_nlp.tokenizer.to_bytes()

    # On-disk round trip through a temporary directory.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        th_tokenizer.to_disk(target)
        disk_nlp = Thai()
        disk_nlp.tokenizer.from_disk(target)
        assert expected == disk_nlp.tokenizer.to_bytes()
def test_th_tokenizer_pickle(th_tokenizer):
    """Check that a pickled-and-restored tokenizer serializes identically."""
    restored = pickle.loads(pickle.dumps(th_tokenizer))
    original_bytes = th_tokenizer.to_bytes()
    restored_bytes = restored.to_bytes()
    assert original_bytes == restored_bytes