Mirror of https://github.com/explosion/spaCy.git
Fix infix as prefix in Tokenizer.explain (#10140)

* Fix infix as prefix in Tokenizer.explain

Update `Tokenizer.explain` to align with the `Tokenizer` algorithm:

* skip infix matches that are prefixes in the current substring

* Update tokenizer pseudocode in docs
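For context, `Tokenizer.explain` returns a list of `(pattern_type, substring)` tuples that is meant to reproduce, token for token, what the tokenizer itself produces. A quick way to check that alignment (a minimal sketch; any text and any blank pipeline works, the sample sentence here is arbitrary):

import spacy
from spacy.lang.en import English

nlp = English()  # blank pipeline; only the rule-based tokenizer is used
text = "Let's go to N.Y.!"

tokens = [t.text for t in nlp.tokenizer(text)]  # what the tokenizer produces
explained = nlp.tokenizer.explain(text)         # debugging trace of (pattern, substring)
for pattern, substring in explained:
    print(f"{substring}\t{pattern}")

# With this fix, the trace lines up with the real tokens even when an
# infix pattern matches at the start of a substring.
assert tokens == [substring for _, substring in explained]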
This commit is contained in:
parent 30cf9d6a05
commit 4f441dfa24
@@ -9,6 +9,7 @@ from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.training import Example
 from spacy.util import compile_prefix_regex, compile_suffix_regex, ensure_path
+from spacy.util import compile_infix_regex
 from spacy.vocab import Vocab
 from spacy.symbols import ORTH

@@ -503,3 +504,20 @@ def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
     assert tokens == ["a", "10", "."]
     explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
     assert tokens == explain_tokens
+
+
+def test_tokenizer_infix_prefix(en_vocab):
+    # the infix match is at the start of the substring once the suffix is removed
+    infixes = ["±"]
+    suffixes = ["%"]
+    infix_re = compile_infix_regex(infixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        infix_finditer=infix_re.finditer,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("±10%")]
+    assert tokens == ["±10", "%"]
+    explain_tokens = [t[1] for t in tokenizer.explain("±10%")]
+    assert tokens == explain_tokens
@@ -683,6 +683,8 @@ cdef class Tokenizer:
                     infixes = infix_finditer(substring)
                     offset = 0
                     for match in infixes:
+                        if offset == 0 and match.start() == 0:
+                            continue
                         if substring[offset : match.start()]:
                             tokens.append(("TOKEN", substring[offset : match.start()]))
                         if substring[match.start() : match.end()]:
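Why the two added lines matter: if an infix pattern matches at position 0 of the current substring, the text before the match is empty, so the old loop emitted the match itself as an INFIX token even though the tokenizer keeps it attached to the following text. Below is a minimal, self-contained sketch of just this loop (plain Python with `re`, not the actual Cython implementation; the function name and regex are illustrative):

import re

# "±" plays the role of an infix pattern; "±10" is the substring left
# after the "%" suffix has been stripped (as in the new test above).
infix_finditer = re.compile("±").finditer

def explain_infix_split(substring, skip_leading_match=True):
    tokens = []
    offset = 0
    for match in infix_finditer(substring):
        # the fix: a match at the very start of the substring is really a
        # prefix, so it should not be split off here
        if skip_leading_match and offset == 0 and match.start() == 0:
            continue
        if substring[offset : match.start()]:
            tokens.append(substring[offset : match.start()])
        if substring[match.start() : match.end()]:
            tokens.append(substring[match.start() : match.end()])
        offset = match.end()
    if substring[offset:]:
        tokens.append(substring[offset:])
    return tokens

print(explain_infix_split("±10", skip_leading_match=False))  # ['±', '10'] -- old explain output
print(explain_infix_split("±10"))                            # ['±10']     -- matches the tokenizer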
@@ -831,6 +831,8 @@ def tokenizer_pseudo_code(
                 infixes = infix_finditer(substring)
                 offset = 0
                 for match in infixes:
+                    if offset == 0 and match.start() == 0:
+                        continue
                     tokens.append(substring[offset : match.start()])
                     tokens.append(substring[match.start() : match.end()])
                     offset = match.end()