mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			51 lines
		
	
	
		
			871 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			871 B
		
	
	
	
		
			Python
		
	
	
	
	
	
| from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | ||
| from ...symbols import ORTH
 | ||
| from ...util import update_exc
 | ||
| 
 | ||
| 
 | ||
# Hyphenated forms registered as tokenizer exceptions. Each form maps to a
# single ORTH entry equal to itself, and is added both as written and in its
# str.capitalize() spelling (e.g. "co-a" and "Co-a").
_exc = {
    form: [{ORTH: form}]
    for contraction in (
        "a-e",
        "a-o",
        "a-i",
        "a-a",
        "co-a",
        "co-e",
        "co-i",
        "co-o",
        "da-a",
        "da-e",
        "da-i",
        "da-o",
        "pe-a",
        "pe-e",
        "pe-i",
        "pe-o",
    )
    for form in (contraction, contraction.capitalize())
}
 | ||
| 
 | ||
# Elided prefix + preposition with à (e.g. "sott'a-o"). The key is the fused
# string; the value lists two ORTH entries, the prefix followed by the
# preposition. Each prefix is listed with both the straight (') and the
# typographic (’) apostrophe, and is added as written and capitalized.
_exc.update(
    {
        elided + contraction: [{ORTH: elided}, {ORTH: contraction}]
        for contraction in ("a-a", "a-e", "a-o", "a-i")
        for stem in (
            "sott'",
            "sott’",
            "contr'",
            "contr’",
            "ch'",
            "ch’",
            "s'",
            "s’",
        )
        for elided in (stem, stem.capitalize())
    }
)

# Combine the shared BASE_EXCEPTIONS with the entries collected in _exc.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |