import pytest
from spacy.lang.zh import Chinese
from ...util import make_tempdir


def zh_tokenizer_serialize(zh_tokenizer):
    # Round-trip the tokenizer through bytes and check it serializes identically
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    # Round-trip the tokenizer through disk and check it serializes identically
    with make_tempdir() as d:
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()


def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    zh_tokenizer_serialize(zh_tokenizer_char)


def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    zh_tokenizer_serialize(zh_tokenizer_jieba)


@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    nlp = Chinese(
        meta={
            "tokenizer": {
                "config": {
                    "segmenter": "pkuseg",
                    "pkuseg_model": "medicine",
                }
            }
        }
    )
    zh_tokenizer_serialize(nlp.tokenizer)
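

# Note: the fixtures used above (zh_tokenizer_char, zh_tokenizer_jieba,
# zh_tokenizer_pkuseg) are expected to be provided by the shared test conftest.
# A minimal sketch of how the first two might be defined, assuming the same
# meta-based tokenizer config pattern used in the pkuseg test above (the real
# fixtures may differ):
#
#     @pytest.fixture(scope="session")
#     def zh_tokenizer_char():
#         nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "char"}}})
#         return nlp.tokenizer
#
#     @pytest.fixture(scope="session")
#     def zh_tokenizer_jieba():
#         pytest.importorskip("jieba")
#         nlp = Chinese(meta={"tokenizer": {"config": {"segmenter": "jieba"}}})
#         return nlp.tokenizer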