diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
index 1571e13d7..2a4587856 100644
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -5,11 +5,14 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_
 
 ELISION = " ' ’ ".strip().replace(" ", "")
 
+# Letters that, with an elision mark, form a token-initial prefix to split off.
+abbrev = ("d", "D")
+
 _infixes = (
     LIST_ELLIPSES
     + LIST_ICONS
     + [
-        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab="".join(abbrev), a=ALPHA, el=ELISION),
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
index b32daa58c..1c9b2dde3 100644
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -10,6 +10,8 @@ _exc = {}
 
 # translate / delete what is not necessary
 for exc_data in [
+    {ORTH: "’t", LEMMA: "et", NORM: "et"},
+    {ORTH: "’T", LEMMA: "et", NORM: "et"},
     {ORTH: "'t", LEMMA: "et", NORM: "et"},
     {ORTH: "'T", LEMMA: "et", NORM: "et"},
     {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},