mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
* Fix incorrect pickling of Japanese and Korean pipelines, which led to the entire pipeline being reset if pickled
* Enable pickling of the Vietnamese tokenizer
* Update tokenizer APIs for Chinese, Japanese, Korean, Thai, and Vietnamese so that only the `Vocab` is required for initialization
		
			
				
	
	
		
			25 lines
		
	
	
		
			712 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			25 lines
		
	
	
		
			712 B
		
	
	
	
		
			Python
		
	
	
	
	
	
import pickle
 | 
						|
 | 
						|
from spacy.lang.ko import Korean
 | 
						|
from ...util import make_tempdir
 | 
						|
 | 
						|
 | 
						|
def test_ko_tokenizer_serialize(ko_tokenizer):
    """Round-trip the Korean tokenizer through bytes and disk serialization.

    A freshly constructed ``Korean()`` pipeline loaded from the serialized
    state must produce byte-identical output from ``to_bytes()``.
    """
    # Byte round-trip: serialize, restore into a fresh pipeline, compare.
    serialized = ko_tokenizer.to_bytes()
    fresh = Korean()
    fresh.tokenizer.from_bytes(serialized)
    assert serialized == fresh.tokenizer.to_bytes()

    # Disk round-trip: write to a temp dir, restore into a fresh pipeline,
    # and check it matches the original byte serialization as well.
    with make_tempdir() as tmp_dir:
        tok_path = tmp_dir / "tokenizer"
        ko_tokenizer.to_disk(tok_path)
        fresh = Korean()
        fresh.tokenizer.from_disk(tok_path)
        assert serialized == fresh.tokenizer.to_bytes()
 | 
						|
 | 
						|
 | 
						|
def test_ko_tokenizer_pickle(ko_tokenizer):
 | 
						|
    b = pickle.dumps(ko_tokenizer)
 | 
						|
    ko_tokenizer_re = pickle.loads(b)
 | 
						|
    assert ko_tokenizer.to_bytes() == ko_tokenizer_re.to_bytes()
 |