diff --git a/spacy/lang/la/tokenizer_exceptions.py b/spacy/lang/la/tokenizer_exceptions.py
index 060f6e085..e9934fcef 100644
--- a/spacy/lang/la/tokenizer_exceptions.py
+++ b/spacy/lang/la/tokenizer_exceptions.py
@@ -12,65 +12,15 @@ _exc = {
     "uobiscum": [{ORTH: "uobis"}, {ORTH: "cum"}],
 }
 
-for orth in [
-    "A.",
-    "Agr.",
-    "Ap.",
-    "C.",
-    "Cn.",
-    "D.",
-    "F.",
-    "K.",
-    "L.",
-    "M'.",
-    "M.",
-    "Mam.",
-    "N.",
-    "Oct.",
-    "Opet.",
-    "P.",
-    "Paul.",
-    "Post.",
-    "Pro.",
-    "Q.",
-    "S.",
-    "Ser.",
-    "Sert.",
-    "Sex.",
-    "St.",
-    "Sta.",
-    "T.",
-    "Ti.",
-    "V.",
-    "Vol.",
-    "Vop.",
-    "U.",
-    "Uol.",
-    "Uop.",
-    "Ian.",
-    "Febr.",
-    "Mart.",
-    "Apr.",
-    "Mai.",
-    "Iun.",
-    "Iul.",
-    "Aug.",
-    "Sept.",
-    "Oct.",
-    "Nov.",
-    "Nou.",
-    "Dec.",
-    "Non.",
-    "Id.",
-    "A.D.",
-    "Coll.",
-    "Cos.",
-    "Ord.",
-    "Pl.",
-    "S.C.",
-    "Suff.",
-    "Trib.",
-]:
+_abbrev_exc = """A. A.D. Aa. Aaa. Acc. Agr. Ap. Apr. April. A.U.C. Aug. C. Caes. Caess. Cc. Cn. Coll. Cons. Conss. Cos. Coss. D. D.N. Dat. Dd. Dec. Decemb. Decembr. F. Feb. Febr. Februar. Ian. Id. Imp. Impp. Imppp. Iul. Iun. K. Kal. L. M'. M. Mai. Mam. Mar. Mart. Med. N. Nn. Nob. Non. Nov. Novemb. Oct. Octob. Opet. Ord. P. Paul. Pf. Pl. Plur. Post. Pp. Prid. Pro. Procos. Q. Quint. S. S.C. Scr. Sept. Septemb. Ser. Sert. Sex. Sext. St. Sta. Suff. T. Ti. Trib. V. Vol. Vop. Vv.""".split()
+
+_abbrev_exc += [item.lower() for item in _abbrev_exc]
+_abbrev_exc += [item.upper() for item in _abbrev_exc]
+_abbrev_exc += [item.replace("v", "u").replace("V", "U") for item in _abbrev_exc]
+
+_abbrev_exc += ["d.N."]
+
+for orth in _abbrev_exc:
     _exc[orth] = [{ORTH: orth}]
 
 TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)