	Fixed issue of infix capturing prefixes
commit 724ae7dc55
parent 33b0f86de3
@@ -10,8 +10,14 @@ from ...tokenizer import Tokenizer
 
 def test_issue1494():
     infix_re = re.compile(r'''[^a-z]''')
-    text_to_tokenize = 'token 123test'
-    expected_tokens = ['token', '1', '2', '3', 'test']
+    text_to_tokenize1 = 'token 123test'
+    expected_tokens1 = ['token', '1', '2', '3', 'test']
+
+    text_to_tokenize2 = 'token 1test'
+    expected_tokens2 = ['token', '1test']
+
+    text_to_tokenize3 = 'hello...test'
+    expected_tokens3 = ['hello', '.', '.', '.', 'test']
 
     def my_tokenizer(nlp):
         return Tokenizer(nlp.vocab,
@@ -22,6 +28,12 @@ def test_issue1494():
     nlp = English()
 
     nlp.tokenizer = my_tokenizer(nlp)
-    tokenized_words = [token.text for token in nlp(text_to_tokenize)]
-    print(tokenized_words)
-    assert tokenized_words == expected_tokens
+    tokenized_words1 = [token.text for token in nlp(text_to_tokenize1)]
+    assert tokenized_words1 == expected_tokens1
+
+    tokenized_words2 = [token.text for token in nlp(text_to_tokenize2)]
+    assert tokenized_words2 == expected_tokens2
+
+    tokenized_words3 = [token.text for token in nlp(text_to_tokenize3)]
+    assert tokenized_words3 == expected_tokens3
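The diff above truncates the Tokenizer construction. For reference, a minimal self-contained version of the regression test follows; it assumes the truncated call passes only infix_finditer (an empty rules dict, no prefix or suffix patterns), which is enough to exercise the infix logic:

import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer


def test_issue1494():
    # An infix pattern that matches any character that is not a
    # lowercase letter.
    infix_re = re.compile(r'''[^a-z]''')

    def my_tokenizer(nlp):
        # Assumption: only the infix pattern is customised; the rules
        # dict is empty and no prefix/suffix searches are passed.
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)

    # A leading non-letter is only split off when a later infix match
    # forces it out ('123test'); on its own it stays attached ('1test').
    assert [t.text for t in nlp('token 123test')] == ['token', '1', '2', '3', 'test']
    assert [t.text for t in nlp('token 1test')] == ['token', '1test']
    assert [t.text for t in nlp('hello...test')] == ['hello', '.', '.', '.', 'test']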
@@ -238,10 +238,14 @@ cdef class Tokenizer:
                     # let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
                     start = 0
+                    start_before_infixes = start
                     for match in matches:
                         infix_start = match.start()
                         infix_end = match.end()
 
+                        if infix_start == start_before_infixes:
+                            continue
+
                         if infix_start != start:
                             span = string[start:infix_start]
                             tokens.push_back(self.vocab.get(tokens.mem, span), False)
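In the patched loop (spacy/tokenizer.pyx), start_before_infixes records where the remaining string begins, so any infix match starting exactly there has no text in front of it and is really a prefix; the new guard skips it instead of splitting it off. A pure-Python sketch of the same control flow (split_on_infixes is a hypothetical stand-in; the real code pushes lexemes onto a Cython token stack rather than appending to a list):

import re


def split_on_infixes(string, infix_finditer):
    tokens = []
    start = 0
    start_before_infixes = start
    for match in infix_finditer(string):
        infix_start, infix_end = match.start(), match.end()

        # New guard: a match at the very start of the string has nothing
        # before it, so it is a prefix rather than an infix - skip it.
        if infix_start == start_before_infixes:
            continue

        if infix_start != start:
            tokens.append(string[start:infix_start])  # text before the infix
        tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])  # trailing text after the last infix
    return tokens


infix_re = re.compile(r'[^a-z]')
# '1' stays attached: its match starts at position 0 and is skipped.
assert split_on_infixes('1test', infix_re.finditer) == ['1test']
# The match on '2' still forces the preceding '1' out as its own token.
assert split_on_infixes('123test', infix_re.finditer) == ['1', '2', '3', 'test']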