import pytest

from spacy.lang.zh import Chinese
from ...util import make_tempdir

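
# Shared round-trip check: a tokenizer serialized with to_bytes() / to_disk()
# must restore to a tokenizer with an identical byte representation.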
def zh_tokenizer_serialize(zh_tokenizer):
    tokenizer_bytes = zh_tokenizer.to_bytes()
    nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
    nlp.tokenizer.from_bytes(tokenizer_bytes)
    assert tokenizer_bytes == nlp.tokenizer.to_bytes()

    with make_tempdir() as d:
        file_path = d / "tokenizer"
        zh_tokenizer.to_disk(file_path)
        nlp = Chinese(meta={"tokenizer": {"config": {"use_jieba": False}}})
        nlp.tokenizer.from_disk(file_path)
        assert tokenizer_bytes == nlp.tokenizer.to_bytes()

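
# The zh_tokenizer_* fixtures come from the test suite's conftest and provide
# tokenizers configured for the character, jieba, and pkuseg segmenters.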
def test_zh_tokenizer_serialize_char(zh_tokenizer_char):
    zh_tokenizer_serialize(zh_tokenizer_char)


def test_zh_tokenizer_serialize_jieba(zh_tokenizer_jieba):
    zh_tokenizer_serialize(zh_tokenizer_jieba)


def test_zh_tokenizer_serialize_pkuseg(zh_tokenizer_pkuseg):
    zh_tokenizer_serialize(zh_tokenizer_pkuseg)

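
# Marked slow: loading the pretrained pkuseg "medicine" model is expensive
# (it may need to be downloaded first), so this only runs with slow tests enabled.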
@pytest.mark.slow
def test_zh_tokenizer_serialize_pkuseg_with_processors(zh_tokenizer_pkuseg):
    nlp = Chinese(
        meta={
            "tokenizer": {
                "config": {
                    "use_jieba": False,
                    "use_pkuseg": True,
                    "pkuseg_model": "medicine",
                }
            }
        }
    )
    zh_tokenizer_serialize(nlp.tokenizer)