mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			51 lines
		
	
	
		
			871 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			51 lines
		
	
	
		
			871 B
		
	
	
	
		
			Python
		
	
	
	
	
	
from ..tokenizer_exceptions import BASE_EXCEPTIONS
 | 
						||
from ...symbols import ORTH
 | 
						||
from ...util import update_exc
 | 
						||
 | 
						||
 | 
						||
_exc = {}
 | 
						||
 | 
						||
for raw in [
 | 
						||
    "a-e",
 | 
						||
    "a-o",
 | 
						||
    "a-i",
 | 
						||
    "a-a",
 | 
						||
    "co-a",
 | 
						||
    "co-e",
 | 
						||
    "co-i",
 | 
						||
    "co-o",
 | 
						||
    "da-a",
 | 
						||
    "da-e",
 | 
						||
    "da-i",
 | 
						||
    "da-o",
 | 
						||
    "pe-a",
 | 
						||
    "pe-e",
 | 
						||
    "pe-i",
 | 
						||
    "pe-o",
 | 
						||
]:
 | 
						||
    for orth in [raw, raw.capitalize()]:
 | 
						||
        _exc[orth] = [{ORTH: orth}]
 | 
						||
 | 
						||
# Prefix + prepositions with à (e.g. "sott'a-o")
 | 
						||
 | 
						||
for prep in [
 | 
						||
    "a-a",
 | 
						||
    "a-e",
 | 
						||
    "a-o",
 | 
						||
    "a-i",
 | 
						||
]:
 | 
						||
    for prefix in [
 | 
						||
        "sott'",
 | 
						||
        "sott’",
 | 
						||
        "contr'",
 | 
						||
        "contr’",
 | 
						||
        "ch'",
 | 
						||
        "ch’",
 | 
						||
        "s'",
 | 
						||
        "s’",
 | 
						||
    ]:
 | 
						||
        for prefix_orth in [prefix, prefix.capitalize()]:
 | 
						||
            _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
 | 
						||
 | 
						||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
 |