Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
Fix Issue #360: Tokenizer failed when the infix regex matched the start of the string while trying to tokenize multi-infix tokens.
parent eab2376547
commit cc8bf62208
@@ -24,6 +24,12 @@ def test_ellipsis(en_tokenizer):
     tokens = en_tokenizer('best...known')
     assert len(tokens) == 3
 
+def test_big_ellipsis(en_tokenizer):
+    '''Test regression identified in Issue #360'''
+    tokens = en_tokenizer(u'$45...............Asking')
+    assert len(tokens) > 2
+
+
 
 def test_email(en_tokenizer):
     tokens = en_tokenizer('hello@example.com')
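The regression test feeds the Issue #360 input through the suite's en_tokenizer fixture. The same scenario can be reproduced outside the test suite; the following is a minimal sketch, assuming the modern spacy.blank() API in place of that fixture:

# Minimal reproduction sketch for the Issue #360 input. spacy.blank("en") is
# an assumption standing in for the en_tokenizer fixture used in the test.
import spacy

nlp = spacy.blank("en")                  # bare tokenizer, no pipeline components
doc = nlp("$45...............Asking")    # the regression input from Issue #360
# Before the fix, tokenization failed on this string; with the fix, the
# currency symbol, number, ellipsis run, and trailing word come apart.
print([token.text for token in doc])
assert len(doc) > 2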
@@ -227,6 +227,8 @@ cdef class Tokenizer:
                     for match in matches:
                         infix_start = match.start()
                         infix_end = match.end()
+                        if infix_start == start:
+                            continue
                         span = string[start:infix_start]
                         tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
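The added guard skips any infix match that begins exactly at the current start offset; without it, string[start:infix_start] is the empty span that made the tokenizer fail on inputs like u'$45...............Asking'. Below is a simplified pure-Python sketch of the loop's control flow; the real implementation is Cython and pushes lexemes into a Doc, so split_on_infixes and INFIX_RE are illustrative names, not spaCy internals.

# Simplified pure-Python sketch of the patched infix-splitting loop above.
import re

INFIX_RE = re.compile(r"\.\.+")  # an ellipsis-style infix pattern, for example

def split_on_infixes(string):
    tokens = []
    start = 0
    for match in INFIX_RE.finditer(string):
        infix_start, infix_end = match.start(), match.end()
        if infix_start == start:
            # The fix: the match begins exactly where the unconsumed text
            # does, so string[start:infix_start] would be an empty span.
            # Skip it instead of pushing an empty token.
            continue
        tokens.append(string[start:infix_start])      # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])                 # trailing text
    return tokens

print(split_on_infixes("45...............Asking"))
# ['45', '...............', 'Asking']
print(split_on_infixes("...............Asking"))
# ['...............Asking']  (guard skips the leading match; no empty span)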