spaCy/spacy/lang/norm_exceptions.py

47 lines
1.2 KiB
Python
Raw Normal View History

2017-06-03 21:27:05 +03:00
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
"'s": "'s",
"'S": "'s",
"s": "'s",
"S": "'s",
"": "'",
"": "'",
"´": "'",
"`": "'",
"": '"',
"": '"',
"''": '"',
"``": '"',
"´´": '"',
"": '"',
"»": '"',
"«": '"',
"": "...",
"": "-",
"": "-",
"--": "-",
"---": "-",
"": "$",
"£": "$",
"¥": "$",
"฿": "$",
"US$": "$",
"C$": "$",
"A$": "$"
}