mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-29 11:26:28 +03:00
51 lines
871 B
Python
51 lines
871 B
Python
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
||
from ...symbols import ORTH
|
||
from ...util import update_exc
|
||
|
||
|
||
_exc = {}
|
||
|
||
for raw in [
|
||
"a-e",
|
||
"a-o",
|
||
"a-i",
|
||
"a-a",
|
||
"co-a",
|
||
"co-e",
|
||
"co-i",
|
||
"co-o",
|
||
"da-a",
|
||
"da-e",
|
||
"da-i",
|
||
"da-o",
|
||
"pe-a",
|
||
"pe-e",
|
||
"pe-i",
|
||
"pe-o",
|
||
]:
|
||
for orth in [raw, raw.capitalize()]:
|
||
_exc[orth] = [{ORTH: orth}]
|
||
|
||
# Prefix + prepositions with à (e.g. "sott'a-o")
|
||
|
||
for prep in [
|
||
"a-a",
|
||
"a-e",
|
||
"a-o",
|
||
"a-i",
|
||
]:
|
||
for prefix in [
|
||
"sott'",
|
||
"sott’",
|
||
"contr'",
|
||
"contr’",
|
||
"ch'",
|
||
"ch’",
|
||
"s'",
|
||
"s’",
|
||
]:
|
||
for prefix_orth in [prefix, prefix.capitalize()]:
|
||
_exc[prefix_orth + prep] = [{ORTH: prefix_orth}, {ORTH: prep}]
|
||
|
||
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|