Mirror of https://github.com/explosion/spaCy.git
	Fixed issue of infix capturing prefixes
parent 33b0f86de3
commit 724ae7dc55
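spaCy's tokenizer splits text on whitespace first and then applies prefix, suffix, and infix rules within each chunk. Before this change, an infix match sitting flush with the start of a chunk was split off as an infix even though no text precedes it: with the test's custom infix pattern [^a-z], '1test' was tokenized as ['1', 'test']. A match at the very start of a chunk is really a prefix position, so the tokenizer now skips it and leaves that character to the prefix rules. The regression test for issue #1494 gains two cases: 'token 1test' must stay as ['token', '1test'], while 'token 123test' and 'hello...test' must still split as before.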
@@ -10,8 +10,14 @@ from ...tokenizer import Tokenizer
 
 def test_issue1494():
     infix_re = re.compile(r'''[^a-z]''')
-    text_to_tokenize = 'token 123test'
-    expected_tokens = ['token', '1', '2', '3', 'test']
+    text_to_tokenize1 = 'token 123test'
+    expected_tokens1 = ['token', '1', '2', '3', 'test']
+
+    text_to_tokenize2 = 'token 1test'
+    expected_tokens2 = ['token', '1test']
+
+    text_to_tokenize3 = 'hello...test'
+    expected_tokens3 = ['hello', '.', '.', '.', 'test']
 
     def my_tokenizer(nlp):
         return Tokenizer(nlp.vocab,
@@ -22,6 +28,12 @@ def test_issue1494():
     nlp = English()
 
     nlp.tokenizer = my_tokenizer(nlp)
-    tokenized_words = [token.text for token in nlp(text_to_tokenize)]
-    print(tokenized_words)
-    assert tokenized_words == expected_tokens
+
+    tokenized_words1 = [token.text for token in nlp(text_to_tokenize1)]
+    assert tokenized_words1 == expected_tokens1
+
+    tokenized_words2 = [token.text for token in nlp(text_to_tokenize2)]
+    assert tokenized_words2 == expected_tokens2
+
+    tokenized_words3 = [token.text for token in nlp(text_to_tokenize3)]
+    assert tokenized_words3 == expected_tokens3
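For reference, a standalone sketch of the behavior the updated test pins down (not part of the commit). The diff truncates my_tokenizer after Tokenizer(nlp.vocab,, so building the Tokenizer with only the infix_finditer keyword is an assumption here, and the expected outputs assume a spaCy build that includes this fix.

import re

from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# Same custom infix pattern as the regression test: every character
# that is not a lowercase letter is a potential infix.
infix_re = re.compile(r'''[^a-z]''')

nlp = English()
nlp.tokenizer = Tokenizer(nlp.vocab, infix_finditer=infix_re.finditer)

# A chunk-initial match is now left to the prefix rules (none are
# defined here), so the leading '1' is no longer split off:
assert [t.text for t in nlp('token 1test')] == ['token', '1test']

# Genuine infixes are still split out match by match:
assert [t.text for t in nlp('hello...test')] == ['hello', '.', '.', '.', 'test']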
@@ -238,10 +238,14 @@ cdef class Tokenizer:
                     # let's say we have dyn-o-mite-dave - the regex finds the
                     # start and end positions of the hyphens
                     start = 0
+                    start_before_infixes = start
                     for match in matches:
                         infix_start = match.start()
                         infix_end = match.end()
 
+                        if infix_start == start_before_infixes:
+                            continue
+
                         if infix_start != start:
                             span = string[start:infix_start]
                             tokens.push_back(self.vocab.get(tokens.mem, span), False)
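To make the fixed control flow easier to follow, here is a pure-Python model of the loop above. split_on_infixes is a hypothetical helper name; the real Cython code pushes vocabulary lexemes onto the token stack rather than collecting strings, but the splitting logic is the same.

import re

def split_on_infixes(string, infix_finditer):
    # Mirrors the hunk above: walk the infix matches left to right,
    # emitting the text before each match and then the match itself.
    tokens = []
    start = 0
    start_before_infixes = start
    for match in infix_finditer(string):
        infix_start = match.start()
        infix_end = match.end()
        # The fix: a match flush with the start of the chunk has no text
        # before it to sit between, so it is a prefix position, not an
        # infix. Skip it and leave it to the prefix rules.
        if infix_start == start_before_infixes:
            continue
        if infix_start != start:
            tokens.append(string[start:infix_start])  # text before the infix
        if infix_start != infix_end:  # guard against zero-width matches
            tokens.append(string[infix_start:infix_end])  # the infix itself
        start = infix_end
    if start < len(string):
        tokens.append(string[start:])  # trailing remainder
    return tokens

infix_re = re.compile(r'''[^a-z]''')
assert split_on_infixes('1test', infix_re.finditer) == ['1test']
assert split_on_infixes('123test', infix_re.finditer) == ['1', '2', '3', 'test']
assert split_on_infixes('hello...test', infix_re.finditer) == ['hello', '.', '.', '.', 'test']

Note that '123test' splits exactly as before the fix: the match skipped at position 0 is emitted anyway as the span preceding the match at position 1, which is why expected_tokens1 in the test did not need to change.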