mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	failing unit test for Issue 4190
This commit is contained in:
		
							parent
							
								
									b91425f803
								
							
						
					
					
						commit
						7bec0ebbcb
					
				
							
								
								
									
										57
									
								
								spacy/tests/regression/test_issue4190.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								spacy/tests/regression/test_issue4190.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,57 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from spacy.lang.en import English | ||||||
|  | 
 | ||||||
|  | import spacy | ||||||
|  | from spacy.tokenizer import Tokenizer | ||||||
|  | 
 | ||||||
|  | from spacy.tests.util import make_tempdir | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def test_issue4190():
    """Regression test for issue 4190.

    A customized tokenizer assigned to an ``English`` pipeline must still be
    in effect after the pipeline is written to disk and loaded back: the
    reloaded pipeline has to tokenize the sample text exactly like the
    customized in-memory pipeline did.
    """

    def token_texts(doc):
        # Reduce a Doc to the plain list of its token strings for comparison.
        return [token.text for token in doc]

    sample = "Test c."

    # Run the stock English pipeline once before touching the tokenizer.
    # The result is not asserted on; the call is kept so the pipeline has
    # already processed the text when the tokenizer is swapped.
    pipeline = English()
    default_tokens = token_texts(pipeline(sample))  # noqa: F841

    # Swap in the customized tokenizer and record its output.
    customize_tokenizer(pipeline)
    customized_tokens = token_texts(pipeline(sample))

    # Round-trip the pipeline through a temporary directory.
    with make_tempdir() as model_dir:
        pipeline.to_disk(model_dir)
        reloaded = spacy.load(model_dir)

    # The reloaded pipeline should carry the modified tokenizer.
    reloaded_tokens = token_texts(reloaded(sample))

    assert customized_tokens == reloaded_tokens
|  | 
 | ||||||
|  | 
 | ||||||
def customize_tokenizer(nlp):
    """Install a tokenizer on ``nlp`` built from a reduced exception table.

    The new ``Tokenizer`` reuses ``nlp.vocab``, the prefix/suffix/infix
    patterns compiled from ``nlp.Defaults`` and the current tokenizer's
    ``token_match``. Its exception table is ``nlp.Defaults.tokenizer_exceptions``
    minus every two-character key whose second character is a period
    (e.g. 'h.').

    nlp: the language object whose ``tokenizer`` attribute is replaced in place.
    """
    defaults = nlp.Defaults
    prefix_search = spacy.util.compile_prefix_regex(defaults.prefixes).search
    suffix_search = spacy.util.compile_suffix_regex(defaults.suffixes).search
    infix_finditer = spacy.util.compile_infix_regex(defaults.infixes).finditer

    # Keep an exception unless its key is exactly two characters long and
    # ends in a period (a single character followed by '.').
    kept_exceptions = {}
    for key, value in dict(defaults.tokenizer_exceptions).items():
        if len(key) != 2 or key[1] != ".":
            kept_exceptions[key] = value

    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        kept_exceptions,
        prefix_search=prefix_search,
        suffix_search=suffix_search,
        infix_finditer=infix_finditer,
        token_match=nlp.tokenizer.token_match,
    )
		Loading…
	
		Reference in New Issue
	
	Block a user