mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
|
# coding: utf-8
|
||
|
from __future__ import unicode_literals
|
||
|
|
||
|
import pytest
|
||
|
from spacy.lang.zh import Chinese
|
||
|
from ...util import make_tempdir
|
||
|
|
||
|
|
||
|
def zh_tokenizer_serialize(zh_tokenizer):
|
||
|
tokenizer_bytes = zh_tokenizer.to_bytes()
|
||
|
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
|
||
|
nlp.tokenizer.from_bytes(tokenizer_bytes)
|
||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||
|
|
||
|
with make_tempdir() as d:
|
||
|
file_path = d / "tokenizer"
|
||
|
zh_tokenizer.to_disk(file_path)
|
||
|
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
|
||
|
nlp.tokenizer.from_disk(file_path)
|
||
|
assert tokenizer_bytes == nlp.tokenizer.to_bytes()
|
||
|
|
||
|
|
||
|
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
|
||
|
zh_tokenizer_serialize(zh_tokenizer_char)
|
||
|
|
||
|
|
||
|
def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
|
||
|
zh_tokenizer_serialize(zh_tokenizer_jieba)
|
||
|
|
||
|
|
||
|
def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
|
||
|
zh_tokenizer_serialize(zh_tokenizer_pkuseg)
|
||
|
|
||
|
|
||
|
@pytest.mark.slow
|
||
|
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
|
||
|
nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False, "use_pkuseg": True, "pkuseg_model": "medicine"}}})
|
||
|
zh_tokenizer_serialize(nlp.tokenizer)
|