mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Add test for Issue #351: Indices off when leading whitespace
This commit is contained in:
		
							parent
							
								
									76021cb853
								
							
						
					
					
						commit
						b4bfc6ae55
					
				|  | @ -33,3 +33,35 @@ def test_newline_double_space(en_tokenizer): | |||
| def test_newline_space_wrap(en_tokenizer): | ||||
|     # A newline wrapped in single spaces between two words should yield three tokens. | ||||
|     tokens = en_tokenizer('hello \n possums') | ||||
|     assert len(tokens) == 3 | ||||
| 
 | ||||
| 
 | ||||
| def test_leading_space_offsets(en_tokenizer): | ||||
|     '''Issue #351: token indices are off when the text has leading whitespace. | ||||
| 
 | ||||
|     The input starts with three spaces. The first token is expected to | ||||
|     start at offset 0 with length 3 (the span of those spaces), and the | ||||
|     second token is expected to start at offset 3. | ||||
| 
 | ||||
|     Reproduction script from the issue report, kept for reference. It uses | ||||
|     Python 2 print statements, and `english_spacy` is not defined in the | ||||
|     code shown here. | ||||
| 
 | ||||
|     # this works | ||||
| 
 | ||||
|     text1 = u"This is a cat." | ||||
|     a = english_spacy(text1) | ||||
| 
 | ||||
|     tok0 = list(a.sents)[0][0] | ||||
|     print tok0, tok0.idx, text1[tok0.idx] | ||||
| 
 | ||||
|     tok1 = list(a.sents)[0][1] | ||||
|     print tok1, tok1.idx, text1[tok1.idx] | ||||
| 
 | ||||
|     print "==" | ||||
| 
 | ||||
|     # this does not work | ||||
| 
 | ||||
|     text2 = u"   This is a cat." | ||||
|     b = english_spacy(text2) | ||||
| 
 | ||||
|     tok0 = list(b.sents)[0][0] | ||||
|     print tok0, tok0.idx, text2[tok0.idx] | ||||
| 
 | ||||
|     tok1 = list(b.sents)[0][1] | ||||
|     print tok1, tok1.idx, text2[tok1.idx] | ||||
|     ''' | ||||
|     doc = en_tokenizer(u"   This is a cat.") | ||||
|     assert doc[0].idx == 0 | ||||
|     assert len(doc[0]) == 3 | ||||
|     assert doc[1].idx == 3 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user