mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-24 20:51:30 +03:00 
			
		
		
		
	Move °[cfkCFK]. to a tokenizer exception
This commit is contained in:
		
							parent
							
								
									96d50a3cb3
								
							
						
					
					
						commit
						97fb19d7d3
					
				|  | @ -21,7 +21,6 @@ TOKENIZER_SUFFIXES = ( | |||
|     + ["'s", "'S", "’s", "’S", "—", "–"] | ||||
|     + [ | ||||
|         r"(?<=[0-9])\+", | ||||
|         r"(?<=°[FfCcKk])\.", | ||||
|         r"(?<=[0-9])(?:{c})".format(c=CURRENCY), | ||||
|         r"(?<=[0-9])(?:{u})".format(u=UNITS), | ||||
|         r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( | ||||
|  |  | |||
|  | @ -250,3 +250,10 @@ o.0 | |||
| 
 | ||||
| for orth in emoticons: | ||||
|     BASE_EXCEPTIONS[orth] = [{ORTH: orth}] | ||||
| 
 | ||||
| 
 | ||||
| # Moved from a suffix setting due to #9155 removing prefixes from consideration | ||||
| # for lookbehinds | ||||
| for u in "cfk": | ||||
|     BASE_EXCEPTIONS[f"°{u}."] = [{ORTH: f"°{u}"}, {ORTH: "."}] | ||||
|     BASE_EXCEPTIONS[f"°{u.upper()}."] = [{ORTH: f"°{u.upper()}"}, {ORTH: "."}] | ||||
|  |  | |||
|  | @ -45,3 +45,9 @@ def test_tokenizer_handles_emoji(tokenizer, text, length): | |||
|     if sys.maxunicode >= 1114111: | ||||
|         tokens = tokenizer(text) | ||||
|         assert len(tokens) == length | ||||
| 
 | ||||
| 
 | ||||
| def test_tokenizer_degree(tokenizer): | ||||
|     for u in "cfkCFK": | ||||
|         assert [t.text for t in tokenizer(f"°{u}.")] == [f"°{u}", "."] | ||||
|         assert [t[1] for t in tokenizer.explain(f"°{u}.")] == [f"°{u}", "."] | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user