spaCy/spacy/tests/lang/zh/test_serialize.py
Daniël de Kok e2b70df012
Configure isort to use the Black profile, recursively isort the spacy module (#12721)
* Use isort with Black profile

* isort all the things

* Fix import cycles as a result of import sorting

* Add DOCBIN_ALL_ATTRS type definition

* Add isort to requirements

* Remove isort from build dependencies check

* Typo
2023-06-14 17:48:41 +02:00

48 lines
1.2 KiB
Python

import pytest
from spacy.lang.zh import Chinese
from ...util import make_tempdir
def zh_tokenizer_serialize(zh_tokenizer):
    """Round-trip a Chinese tokenizer through bytes and disk serialization.

    A fresh ``Chinese()`` pipeline restored from either representation must
    serialize back to bytes identical to the original tokenizer's.
    """
    # Bytes round trip: serialize, restore into a new pipeline, re-serialize.
    serialized = zh_tokenizer.to_bytes()
    restored = Chinese()
    restored.tokenizer.from_bytes(serialized)
    assert serialized == restored.tokenizer.to_bytes()

    # Disk round trip: write to a temp directory and restore from there.
    with make_tempdir() as tmp_dir:
        target = tmp_dir / "tokenizer"
        zh_tokenizer.to_disk(target)
        restored = Chinese()
        restored.tokenizer.from_disk(target)
        assert serialized == restored.tokenizer.to_bytes()
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    """The character segmenter must survive bytes and disk round trips."""
    zh_tokenizer_serialize(zh_tokenizer_char)
def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    """The jieba segmenter must survive bytes and disk round trips."""
    zh_tokenizer_serialize(zh_tokenizer_jieba)
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    """Round-trip a pkuseg tokenizer built from config with a loaded model.

    Marked slow because initialization downloads/loads the pkuseg
    "medicine" model.
    """
    # Build the pipeline from an explicit config rather than a fixture so the
    # pkuseg model is loaded through the normal initialize path.
    tokenizer_cfg = {
        "@tokenizers": "spacy.zh.ChineseTokenizer",
        "segmenter": "pkuseg",
    }
    init_cfg = {
        "pkuseg_model": "medicine",
    }
    config = {
        "nlp": {"tokenizer": tokenizer_cfg},
        "initialize": {"tokenizer": init_cfg},
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)