import pytest

from spacy.lang.zh import Chinese

from ...util import make_tempdir


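# Round-trip helper: serialize a Chinese tokenizer to bytes and to disk,
# load each copy into a fresh Chinese() pipeline, and assert the reloaded
# tokenizer serializes back to identical bytes.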
def zh_tokenizer_serialize(zh_tokenizer):
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese()
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese()
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()


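# The zh_tokenizer_* arguments are pytest fixtures (expected to be provided
# by the test suite's shared conftest), each configured with a different
# segmenter backend: char, jieba, or pkuseg.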
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    zh_tokenizer_serialize(zh_tokenizer_char)


def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    zh_tokenizer_serialize(zh_tokenizer_jieba)


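# Marked slow: building the pkuseg tokenizer from config initializes the
# external "medicine" pkuseg model before running the serialization round trip.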
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    config = {
        "nlp": {
            "tokenizer": {
                "@tokenizers": "spacy.zh.ChineseTokenizer",
                "segmenter": "pkuseg",
            }
        },
        "initialize": {
            "tokenizer": {
                "pkuseg_model": "medicine",
            }
        },
    }
    nlp = Chinese.from_config(config)
    nlp.initialize()
    zh_tokenizer_serialize(nlp.tokenizer)