# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them,
# for example, British vs. American spelling in English.

# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.


# Mapping of ORTH string -> NORM string. Keys are matched as exact strings;
# every value is a plain-ASCII representative for its group.
BASE_NORMS = {
    # "'s" variants: upper/lower case, straight and curly apostrophe.
    "'s": "'s",
    "'S": "'s",
    "’s": "'s",
    "’S": "'s",
    # Single-quote look-alikes -> straight apostrophe.
    "’": "'",
    "‘": "'",
    "´": "'",
    "`": "'",
    # Double-quote look-alikes (including doubled single quotes) -> '"'.
    "”": '"',
    "“": '"',
    "''": '"',
    "``": '"',
    "´´": '"',
    "„": '"',
    "»": '"',
    "«": '"',
    "‘‘": '"',
    "’’": '"',
    # Common punctuation maps to itself, which gives these tokens an explicit
    # entry in the table.
    "?": "?",
    "!": "!",
    ",": ",",
    ";": ";",
    ":": ":",
    # Sentence-final marks from other scripts -> ".".
    "。": ".",
    "।": ".",
    # Ellipsis character -> three ASCII dots.
    "…": "...",
    # Dash variants (em/en dash, repeated hyphens) -> single hyphen.
    "—": "-",
    "–": "-",
    "--": "-",
    "---": "-",
    "——": "-",
    # Currency symbols and prefixed dollar forms -> "$".
    "€": "$",
    "£": "$",
    "¥": "$",
    "฿": "$",
    "US$": "$",
    "C$": "$",
    "A$": "$",
    "₺": "$",
    "₹": "$",
    "৳": "$",
    "₩": "$",
    "Mex$": "$",
    "₣": "$",
    "E£": "$",
}