	Ignore prefix in suffix matches (#9155)
* Ignore prefix in suffix matches
Ignore the currently matched prefix when looking for suffix matches in
the tokenizer. Otherwise, a lookbehind in the suffix pattern may match
incorrectly due to the presence of the prefix in the token string.
* Move °[cfkCFK]. to a tokenizer exception
* Adjust exceptions for same tokenization as v3.1
* Also update test accordingly
* Continue to split . after °CFK if ° is not a prefix
* Exclude new ° exceptions for pl
* Switch back to default tokenization of "° C ."
* Revert "Exclude new ° exceptions for pl"
This reverts commit 952013a5b4.
* Add exceptions for °C for hu
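
To illustrate the failure mode, here is a minimal sketch built around the prefix/suffix
patterns from the regression test added in this commit (Tokenizer, compile_prefix_regex
and compile_suffix_regex are the public spaCy APIs that test uses; the example string is
illustrative):

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer
    from spacy.util import compile_prefix_regex, compile_suffix_regex

    nlp = English()
    # Prefix "a", plus a suffix pattern whose lookbehind expects an "a" right before it.
    prefix_re = compile_prefix_regex(["a(?=.)"])
    suffix_re = compile_suffix_regex([r"(?<=\w)\.", r"(?<=a)\d+\."])
    tokenizer = Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
    )
    # Previously the suffix search still saw the already-matched prefix "a", so the
    # lookbehind (?<=a) fired and "10." was split off as a single suffix ("a", "10.").
    # With the prefix excluded from the suffix search, the expected split is restored:
    print([t.text for t in tokenizer("a10.")])  # ['a', '10', '.']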
			
			
parent 4170110ce7
commit 2ea9b58006

@@ -646,5 +646,10 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(
 )
 
 
+for u in "cfkCFK":
+    _exc[f"°{u}"] = [{ORTH: f"°{u}"}]
+    _exc[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}]
+
+
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match

@@ -250,3 +250,9 @@ o.0
 
 for orth in emoticons:
     BASE_EXCEPTIONS[orth] = [{ORTH: orth}]
+
+
+# Moved from a suffix setting due to #9155 removing prefixes from consideration
+# for lookbehinds
+for u in "cfkCFK":
+    BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: "°"}, {ORTH: f"{u}"}, {ORTH: "."}]

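Because °C/°F/°K are now special cases rather than suffix matches, the end-user
tokenization stays the same as in v3.1. A quick illustrative check, assuming a blank
English pipeline and a made-up input string:

    from spacy.lang.en import English

    nlp = English()
    # "°C." is handled by the tokenizer exception above instead of a suffix
    # lookbehind, but the resulting tokens match the v3.1 behaviour.
    print([t.text for t in nlp("20 °C.")])  # ['20', '°', 'C', '.']
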
@@ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length):
     if sys.maxunicode >= 1114111:
         tokens = tokenizer(text)
         assert len(tokens) == length
+
+
+def test_tokenizer_degree(tokenizer):
+    for u in "cfkCFK":
+        assert [t.text for t in tokenizer(f"°{u}.")] == ["°", f"{u}", "."]
+        assert [t[1] for t in tokenizer.explain(f"°{u}.")] == ["°", f"{u}", "."]

@@ -2,7 +2,7 @@ import pytest
 import re
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
-from spacy.util import ensure_path
+from spacy.util import ensure_path, compile_prefix_regex, compile_suffix_regex
 from spacy.lang.en import English
 
 

@@ -212,3 +212,20 @@ def test_tokenizer_flush_specials(en_vocab):
     assert [t.text for t in tokenizer1("a a.")] == ["a a", "."]
     tokenizer1.rules = {}
     assert [t.text for t in tokenizer1("a a.")] == ["a", "a", "."]
+
+
+def test_tokenizer_prefix_suffix_overlap_lookbehind(en_vocab):
+    # the prefix and suffix matches overlap in the suffix lookbehind
+    prefixes = ['a(?=.)']
+    suffixes = [r'(?<=\w)\.', r'(?<=a)\d+\.']
+    prefix_re = compile_prefix_regex(prefixes)
+    suffix_re = compile_suffix_regex(suffixes)
+    tokenizer = Tokenizer(
+        en_vocab,
+        prefix_search=prefix_re.search,
+        suffix_search=suffix_re.search,
+    )
+    tokens = [t.text for t in tokenizer("a10.")]
+    assert tokens == ["a", "10", "."]
+    explain_tokens = [t[1] for t in tokenizer.explain("a10.")]
+    assert tokens == explain_tokens

@@ -408,7 +408,7 @@ cdef class Tokenizer:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
                     break
-            suf_len = self.find_suffix(string)
+            suf_len = self.find_suffix(string[pre_len:])
             if suf_len != 0:
                 suffix = string[-suf_len:]
                 minus_suf = string[:-suf_len]
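
The one-line change above is the heart of the fix: the suffix is now searched only in
the part of the string left over after the matched prefix, so a suffix lookbehind can
no longer anchor on characters that already belong to the prefix. A standalone sketch
with plain re, reusing the lookbehind pattern from the new test:

    import re

    # Suffix pattern from the new test: the lookbehind expects an "a" before the digits.
    suffix_re = re.compile(r"(?<=a)\d+\.$")

    string = "a10."
    pre_len = 1  # length of the matched prefix "a"

    # Old behaviour: the search runs over the full string, the lookbehind sees the
    # prefix "a", and the whole of "10." is claimed as one suffix.
    print(suffix_re.search(string))            # <re.Match ...; match='10.'>

    # New behaviour: the prefix is excluded from the suffix search, the lookbehind
    # has nothing to anchor on, and the over-greedy match disappears.
    print(suffix_re.search(string[pre_len:]))  # None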