mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			60 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			60 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.

# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.


# Language-independent table mapping a token's exact text (key) to the
# normalised form it should receive (value). Entries are grouped by the
# target they collapse to; insertion order matches the original listing.
BASE_NORMS = {
    # "'s" suffix: upper/lower case, ASCII or typographic apostrophe -> "'s"
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    # Single-quote look-alikes -> ASCII apostrophe
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    # Double-quote look-alikes (including doubled single quotes) -> '"'
    "”": '"',
    "“": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "‘‘": '"',
    "’’": '"',
    # NOTE(review): the next five entries map a character to itself as written
    # here. Given that they sit right before "。", the keys may originally have
    # been the full-width forms (？ ！ ， ； ：) and been flattened somewhere
    # along the way - confirm against the upstream file before relying on them.
    "?": "?",
    "!": "!",
    ",": ",",
    ";": ";",
    ":": ":",
    # Non-Latin sentence terminators -> "."
    "。": ".",
    "।": ".",
    # Single-character ellipsis -> three ASCII dots
    "…": "...",
    # Dashes and runs of dashes/hyphens -> single "-"
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "——": "-",
    # Currency symbols and prefixed dollar notations -> "$"
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$",
    "₺": "$",
    "₹": "$",
    "৳": "$",
    "₩": "$",
    "Mex$": "$",
    "₣": "$",
    "E£": "$",
}
 |