Add base norm exceptions

2025-12-04 08:44:23 +03:00 · 2017-06-03 20:27:05 +02:00 · 2017-06-03 20:27:05 +02:00 · e5d426406a
commit e5d426406a
parent 4c2bbc3ccc
1 changed files with 46 additions and 0 deletions
--- a/spacy/lang/norm_exceptions.py
+++ b/spacy/lang/norm_exceptions.py
@ -0,0 +1,46 @@
 # coding: utf8
 from __future__ import unicode_literals
 # These exceptions are used to add NORM values based on a token's ORTH value.
 # Individual languages can also add their own exceptions and overwrite them -
 # for example, British vs. American spelling in English.
 # Norms are only set if no alternative is provided in the tokenizer exceptions.
 # Note that this does not change any other token attributes. Its main purpose
 # is to normalise the word representations so that equivalent tokens receive
 # similar representations. For example: $ and € are very different, but they're
 # both currency symbols. By normalising currency symbols to $, all symbols are
 # seen as similar, no matter how common they are in the training data.
 # Mapping of ORTH spelling -> NORM value. It is declared as groups of
 # (norm, spellings) so that every surface form collapsing to the same
 # normalised value sits on one line; the comprehension flattens the groups
 # into a plain dict, keeping the spellings in the order they are listed.
 BASE_NORMS = {
     spelling: norm
     for norm, spellings in (
         # Possessive clitic: straight or curly apostrophe, either case.
         ("'s", ("'s", "'S", "’s", "’S")),
         # Single-character apostrophe / single-quote look-alikes.
         ("'", ("’", "‘", "´", "`")),
         # Double-quote look-alikes, including doubled single-quote forms,
         # the low-9 quote and guillemets.
         ('"', ("”", "“", "''", "``", "´´", "„", "»", "«")),
         # Ellipsis character becomes three full stops.
         ("...", ("…",)),
         # Em dash, en dash and runs of hyphens become a single hyphen.
         ("-", ("—", "–", "--", "---")),
         # All currency symbols are folded into the dollar sign.
         ("$", ("€", "£", "¥", "฿", "US$", "C$", "A$")),
     )
     for spelling in spellings
 }