import pytest

from spacy.lang.zh import Chinese

from ...util import make_tempdir


def zh_tokenizer_serialize(zh_tokenizer):
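    # Shared helper: round-trip the tokenizer through to_bytes/from_bytes
    # and to_disk/from_disk, asserting the serialized output is unchanged.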
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
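        # Repeat the round trip through to_disk/from_disk.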
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()


def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    zh_tokenizer_serialize(zh_tokenizer_char)


def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    zh_tokenizer_serialize(zh_tokenizer_jieba)


@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
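    # Build a Chinese pipeline whose tokenizer uses the pkuseg segmenter and
    # loads the "medicine" pkuseg model at initialization, then round-trip it.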
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        "initialize": {
            "tokenizer": {
                "pkuseg_model": "medicine",
            }
        },
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)