# spaCy/spacy/lang/lij/tokenizer_exceptions.py
# (50 lines, 870 B, Python)

from ...symbols import ORTH
from ...util import update_exc
from ..tokenizer_exceptions import BASE_EXCEPTIONS
# Maps an exception string to the list of token dicts it is tokenized into.
_exc = {}

# Hyphenated forms that are kept as one single token. Every form is
# registered twice: as written and with an initial capital
# (e.g. "co-a" and "Co-a").
for raw in [
    "a-e",
    "a-o",
    "a-i",
    "a-a",
    "co-a",
    "co-e",
    "co-i",
    "co-o",
    "da-a",
    "da-e",
    "da-i",
    "da-o",
    "pe-a",
    "pe-e",
    "pe-i",
    "pe-o",
]:
    for orth in (raw, raw.capitalize()):
        # One-element list: the whole string is a single token.
        _exc[orth] = [{ORTH: orth}]

# Prefix + prepositions with à (e.g. "sott'a-o"): the elided prefix and the
# preposition that follows it are split into two tokens.
#
# Each prefix stem is registered with both the ASCII apostrophe and the
# typographic apostrophe (U+2019). The latter is spelled as an escape so the
# character cannot be silently dropped when the file is copied or re-encoded;
# without it the bare stems would match unrelated strings such as "sa-a".
for prep in [
    "a-a",
    "a-e",
    "a-o",
    "a-i",
]:
    for stem in ["sott", "contr", "ch", "s"]:
        for apostrophe in ["'", "\u2019"]:
            prefix = stem + apostrophe
            # Register the lowercase and the capitalized spelling.
            for prefix_orth in [prefix, prefix.capitalize()]:
                _exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]

# Exception table exported by this module: the entries collected in `_exc`
# are combined with the shared BASE_EXCEPTIONS through `update_exc`.
# NOTE(review): `update_exc` is defined outside this file; presumably the
# language-specific entries are layered on top of the base table — confirm.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)