# spaCy/spacy/tests/lang/zh/test_serialize.py
# Snapshot: 2020-05-21 14:14:01 +02:00 (49 lines, 1.4 KiB, Python)
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.lang.zh import Chinese
from ...util import make_tempdir
def zh_tokenizer_serialize(zh_tokenizer):
    """Round-trip *zh_tokenizer* through bytes and disk serialization.

    Asserts that restoring the tokenizer into a fresh ``Chinese`` pipeline
    (jieba disabled so no segmenter state interferes) reproduces the exact
    serialized bytes in both cases.
    """
    serialized = zh_tokenizer.to_bytes()
    # Bytes round-trip: restore into a fresh pipeline and compare.
    fresh_nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
    fresh_nlp.tokenizer.from_bytes(serialized)
    assert serialized == fresh_nlp.tokenizer.to_bytes()
    # Disk round-trip: save, reload into another fresh pipeline, compare
    # against the same reference bytes.
    with make_tempdir() as tmp_dir:
        tokenizer_path = tmp_dir / "tokenizer"
        zh_tokenizer.to_disk(tokenizer_path)
        fresh_nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
        fresh_nlp.tokenizer.from_disk(tokenizer_path)
        assert serialized == fresh_nlp.tokenizer.to_bytes()
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    """The character-segmentation tokenizer survives serialization round-trips."""
    zh_tokenizer_serialize(zh_tokenizer_char)
def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    """The jieba-backed tokenizer survives serialization round-trips."""
    zh_tokenizer_serialize(zh_tokenizer_jieba)
def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
    """The pkuseg-backed tokenizer survives serialization round-trips."""
    zh_tokenizer_serialize(zh_tokenizer_pkuseg)
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    """A pkuseg tokenizer configured with a named model serializes correctly.

    Marked slow because constructing the pipeline downloads/loads the
    "medicine" pkuseg model.
    """
    tokenizer_config = {
        "use_jieba": False,
        "use_pkuseg": True,
        "pkuseg_model": "medicine",
    }
    nlp = Chinese(meta={"tokenizer": {"config": tokenizer_config}})
    zh_tokenizer_serialize(nlp.tokenizer)