mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
71884d0942
Co-authored-by: explosion-bot <explosion-bot@users.noreply.github.com>
77 lines
1.1 KiB
Python
77 lines
1.1 KiB
Python
from ..tokenizer_exceptions import BASE_EXCEPTIONS
|
|
from ...symbols import ORTH
|
|
from ...util import update_exc
|
|
|
|
|
|
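# TODO: Look into handling u/v spelling variants systematically (e.g. "vobiscum"/"uobiscum", "Nov."/"Nou.") instead of listing both forms by hand.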
|
|
# Forms with "cum" attached to the end. Each key maps to two ORTH entries:
# the leading form and a separate "cum".
# Keys produced: mecum, tecum, nobiscum, vobiscum, uobiscum.
_exc = {
    stem + "cum": [{ORTH: stem}, {ORTH: "cum"}]
    for stem in ("me", "te", "nobis", "vobis", "uobis")
}
|
|
|
|
# Register every abbreviation below as an exception whose single ORTH entry is
# the abbreviation itself (trailing period included).
# NOTE(review): the group labels are inferred from the strings themselves;
# confirm with someone who knows the intended categories.
for orth in (
    # Looks like abbreviated personal names, including u-for-v spellings.
    "A.",
    "Agr.",
    "Ap.",
    "C.",
    "Cn.",
    "D.",
    "F.",
    "K.",
    "L.",
    "M'.",
    "M.",
    "Mam.",
    "N.",
    "Oct.",
    "Opet.",
    "P.",
    "Paul.",
    "Post.",
    "Pro.",
    "Q.",
    "S.",
    "Ser.",
    "Sert.",
    "Sex.",
    "St.",
    "Sta.",
    "T.",
    "Ti.",
    "V.",
    "Vol.",
    "Vop.",
    "U.",
    "Uol.",
    "Uop.",
    # Looks like month abbreviations. "Oct." is already listed above; this
    # second occurrence just reassigns the same key with an equal value.
    "Ian.",
    "Febr.",
    "Mart.",
    "Apr.",
    "Mai.",
    "Iun.",
    "Iul.",
    "Aug.",
    "Sept.",
    "Oct.",
    "Nov.",
    "Nou.",
    "Dec.",
    # Remaining abbreviations (presumably calendar and administrative terms).
    "Non.",
    "Id.",
    "A.D.",
    "Coll.",
    "Cos.",
    "Ord.",
    "Pl.",
    "S.C.",
    "Suff.",
    "Trib.",
):
    _exc[orth] = [{ORTH: orth}]
|
|
|
|
# Public exception table for this language, built by passing the shared
# BASE_EXCEPTIONS and the entries collected in _exc to update_exc.
# NOTE(review): presumably _exc entries take precedence over base entries with
# the same key — confirm against update_exc in the package's util module.
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
|