mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			57 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			57 lines
		
	
	
		
			1.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
# coding: utf8
from __future__ import unicode_literals


# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.

# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.

# The table is assembled from themed sub-tables below so each group of
# equivalences can be read (and extended) in isolation.

# Apostrophe-like characters, and possessive suffixes, normalised to ASCII "'".
_APOSTROPHES = {
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
}

# Quote-like characters and digraphs normalised to ASCII '"'.
_QUOTES = {
    "”": '"',
    "“": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "‘‘": '"',
    "’’": '"',
}

# Sentence punctuation: ASCII marks map to themselves so the NORM is set
# explicitly; non-Latin full stops (CJK 。, Devanagari ।) map to ".".
_PUNCT = {
    "?": "?",
    "!": "!",
    ",": ",",
    ";": ";",
    ":": ":",
    "。": ".",
    "।": ".",
    "…": "...",
}

# Dashes of any length or repetition normalised to a single ASCII hyphen.
_DASHES = {
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "——": "-",
}

# Currency symbols all collapse to "$" so they share one representation.
_CURRENCIES = {
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$",
}

# Merged lookup table consumed by the language classes. Built with
# dict.update (rather than {**a, **b}) to stay Python 2 compatible,
# consistent with the __future__ import above.
BASE_NORMS = {}
for _table in (_APOSTROPHES, _QUOTES, _PUNCT, _DASHES, _CURRENCIES):
    BASE_NORMS.update(_table)
 |