Expand abbreviations in la tokenizer_exceptions

This commit is contained in:
Patrick J. Burns 2023-04-09 08:12:44 -04:00
parent 1a2cedeab1
commit f8d0dc1ef8

View File

@ -12,65 +12,15 @@ _exc = {
"uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
}
for orth in [
"A.",
"Agr.",
"Ap.",
"C.",
"Cn.",
"D.",
"F.",
"K.",
"L.",
"M'.",
"M.",
"Mam.",
"N.",
"Oct.",
"Opet.",
"P.",
"Paul.",
"Post.",
"Pro.",
"Q.",
"S.",
"Ser.",
"Sert.",
"Sex.",
"St.",
"Sta.",
"T.",
"Ti.",
"V.",
"Vol.",
"Vop.",
"U.",
"Uol.",
"Uop.",
"Ian.",
"Febr.",
"Mart.",
"Apr.",
"Mai.",
"Iun.",
"Iul.",
"Aug.",
"Sept.",
"Oct.",
"Nov.",
"Nou.",
"Dec.",
"Non.",
"Id.",
"A.D.",
"Coll.",
"Cos.",
"Ord.",
"Pl.",
"S.C.",
"Suff.",
"Trib.",
]:
_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
_abbrev_exc += [item.lower() for item in _abbrev_exc]
_abbrev_exc += [item.upper() for item in _abbrev_exc]
_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
_abbrev_exc += ["d.N."]
for orth in _abbrev_exc:
_exc[orth] = [{ORTH: orth}]
TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)