mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Add test for Issue #351: Indices off when leading whitespace
This commit is contained in:
		
							parent
							
								
									76021cb853
								
							
						
					
					
						commit
						b4bfc6ae55
					
				|  | @ -33,3 +33,35 @@ def test_newline_double_space(en_tokenizer): | ||||||
| def test_newline_space_wrap(en_tokenizer): | def test_newline_space_wrap(en_tokenizer): | ||||||
|     tokens = en_tokenizer('hello \n possums') |     tokens = en_tokenizer('hello \n possums') | ||||||
|     assert len(tokens) == 3 |     assert len(tokens) == 3 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_leading_space_offsets(en_tokenizer): | ||||||
|  |     '''Issue #351 | ||||||
|  |     # this works | ||||||
|  | 
 | ||||||
|  |     text1 = u"This is a cat." | ||||||
|  |     a = english_spacy(text1) | ||||||
|  | 
 | ||||||
|  |     tok0 = list(a.sents)[0][0] | ||||||
|  |     print tok0, tok0.idx, text1[tok0.idx] | ||||||
|  | 
 | ||||||
|  |     tok1 = list(a.sents)[0][1] | ||||||
|  |     print tok1, tok1.idx, text1[tok1.idx] | ||||||
|  | 
 | ||||||
|  |     print "==" | ||||||
|  | 
 | ||||||
|  |     # this does not work | ||||||
|  | 
 | ||||||
|  |     text2 = u"   This is a cat." | ||||||
|  |     b = english_spacy(text2) | ||||||
|  | 
 | ||||||
|  |     tok0 = list(b.sents)[0][0] | ||||||
|  |     print tok0, tok0.idx, text2[tok0.idx] | ||||||
|  | 
 | ||||||
|  |     tok1 = list(b.sents)[0][1] | ||||||
|  |     print tok1, tok1.idx, text2[tok1.idx] | ||||||
|  |     ''' | ||||||
|  |     doc = en_tokenizer(u"   This is a cat.") | ||||||
|  |     assert doc[0].idx == 0 | ||||||
|  |     assert len(doc[0]) == 3 | ||||||
|  |     assert doc[1].idx == 3 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user