2020-07-22 23:18:46 +03:00
|
|
|
|
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
2020-07-23 00:09:01 +03:00
|
|
|
|
from ...symbols import ORTH
|
2020-07-22 23:18:46 +03:00
|
|
|
|
from ...util import update_exc
|
|
|
|
|
|
2020-03-20 07:20:17 +03:00
|
|
|
|
|
|
|
|
|
# Hyphenated forms that get an exception entry of their own. Every form is
# registered twice -- as written and capitalized -- and maps to a one-element
# list whose ORTH is the form itself.
_exc = {
    form: [{ORTH: form}]
    for base in (
        "a-e", "a-o", "a-i", "a-a",
        "co-a", "co-e", "co-i", "co-o",
        "da-a", "da-e", "da-i", "da-o",
        "pe-a", "pe-e", "pe-i", "pe-o",
    )
    for form in (base, base.capitalize())
}
|
2020-03-20 07:20:17 +03:00
|
|
|
|
|
|
|
|
|
# Prefix + prepositions with à (e.g. "sott'a-o").
# Each combination is keyed by the concatenated string and mapped to two
# entries: the prefix and the preposition. Prefixes are listed with both the
# straight (') and the typographic (’) apostrophe, and every prefix is also
# added in its capitalized spelling.
for a_prep in ("a-a", "a-e", "a-o", "a-i"):
    for elided in (
        "sott'", "sott’",
        "contr'", "contr’",
        "ch'", "ch’",
        "s'", "s’",
    ):
        for head in (elided, elided.capitalize()):
            _exc[head + a_prep] = [{ORTH: head}, {ORTH: a_prep}]

# Hand the shared base exceptions and the entries collected above to
# update_exc; its result is the table this module exposes.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|