# coding: utf8 from __future__ import unicode_literals # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. # Norms are only set if no alternative is provided in the tokenizer exceptions. # Note that this does not change any other token attributes. Its main purpose # is to normalise the word representations so that equivalent tokens receive # similar representations. For example: $ and € are very different, but they're # both currency symbols. By normalising currency symbols to $, all symbols are # seen as similar, no matter how common they are in the training data. BASE_NORMS = { "'s": "'s", "'S": "'s", "’s": "'s", "’S": "'s", "’": "'", "‘": "'", "´": "'", "`": "'", "”": '"', "“": '"', "''": '"', "``": '"', "´´": '"', "„": '"', "»": '"', "«": '"', "…": "...", "—": "-", "–": "-", "--": "-", "---": "-", "€": "$", "£": "$", "¥": "$", "฿": "$", "US$": "$", "C$": "$", "A$": "$" }