Add base norm exceptions

This commit is contained in:
ines 2017-06-03 20:27:05 +02:00
parent 4c2bbc3ccc
commit e5d426406a

View File

@ -0,0 +1,46 @@
# coding: utf8
from __future__ import unicode_literals
# These exceptions are used to add NORM values based on a token's ORTH value.
# Individual languages can also add their own exceptions and overwrite them -
# for example, British vs. American spelling in English.
# Norms are only set if no alternative is provided in the tokenizer exceptions.
# Note that this does not change any other token attributes. Its main purpose
# is to normalise the word representations so that equivalent tokens receive
# similar representations. For example: $ and € are very different, but they're
# both currency symbols. By normalising currency symbols to $, all symbols are
# seen as similar, no matter how common they are in the training data.
BASE_NORMS = {
"'s": "'s",
"'S": "'s",
"s": "'s",
"S": "'s",
"": "'",
"": "'",
"´": "'",
"`": "'",
"": '"',
"": '"',
"''": '"',
"``": '"',
"´´": '"',
"": '"',
"»": '"',
"«": '"',
"": "...",
"": "-",
"": "-",
"--": "-",
"---": "-",
"": "$",
"£": "$",
"¥": "$",
"฿": "$",
"US$": "$",
"C$": "$",
"A$": "$"
}