from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...symbols import ORTH
from ...util import update_exc

## TODO: Look into systematically handling u/v
_exc = {
    # Enclitic -cum forms of personal pronouns: split into pronoun + "cum"
    "mecum": [{ORTH: "me"}, {ORTH: "cum"}],
    "tecum": [{ORTH: "te"}, {ORTH: "cum"}],
    "nobiscum": [{ORTH: "nobis"}, {ORTH: "cum"}],
    "vobiscum": [{ORTH: "vobis"}, {ORTH: "cum"}],
    "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}

# Abbreviations kept as single tokens (the trailing period is not split off):
# Roman praenomina, month names, and other common abbreviations.
for orth in [
    "A.",
    "Agr.",
    "Ap.",
    "C.",
    "Cn.",
    "D.",
    "F.",
    "K.",
    "L.",
    "M'.",
    "M.",
    "Mam.",
    "N.",
    "Oct.",
    "Opet.",
    "P.",
    "Paul.",
    "Post.",
    "Pro.",
    "Q.",
    "S.",
    "Ser.",
    "Sert.",
    "Sex.",
    "St.",
    "Sta.",
    "T.",
    "Ti.",
    "V.",
    "Vol.",
    "Vop.",
    "U.",
    "Uol.",
    "Uop.",
    "Ian.",
    "Febr.",
    "Mart.",
    "Apr.",
    "Mai.",
    "Iun.",
    "Iul.",
    "Aug.",
    "Sept.",
    "Oct.",
    "Nov.",
    "Nou.",
    "Dec.",
    "Non.",
    "Id.",
    "A.D.",
    "Coll.",
    "Cos.",
    "Ord.",
    "Pl.",
    "S.C.",
    "Suff.",
    "Trib.",
]:
    _exc[orth] = [{ORTH: orth}]

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)
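
# A minimal usage sketch (an illustration added here, not part of the original
# module), assuming a spaCy installation where this Latin language class is
# registered under the code "la":
#
#     import spacy
#
#     nlp = spacy.blank("la")
#     assert [t.text for t in nlp("vobiscum")] == ["vobis", "cum"]
#     assert [t.text for t in nlp("Ian.")] == ["Ian."]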