Merge pull request #5019 from questoph/master

Optimizing tokenization for Luxembourgish (dealing with apostrophe infixes)
2026-02-18 05:00:41 +03:00 · 2020-02-25 14:48:50 +01:00 · 2020-02-25 14:48:50 +01:00 · d50152b917
commit d50152b917
parent 4440a072d2 5352fc8fc3
2 changed files with 5 additions and 1 deletion
--- a/spacy/lang/lb/punctuation.py
+++ b/spacy/lang/lb/punctuation.py
@@ -5,11 +5,13 @@ from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_

 ELISION = " ' ’ ".strip().replace(" ", "")

+abbrev = ("d", "D")
+
 _infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
-        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
+        r"(?<=^[{ab}][{el}])(?=[{a}])".format(ab=abbrev, a=ALPHA, el=ELISION),
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
--- a/spacy/lang/lb/tokenizer_exceptions.py
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -10,6 +10,8 @@ _exc = {}

 # translate / delete what is not necessary
 for exc_data in [
+    {ORTH: "’t", LEMMA: "et", NORM: "et"},
+    {ORTH: "’T", LEMMA: "et", NORM: "et"},
    {ORTH: "'t", LEMMA: "et", NORM: "et"},
    {ORTH: "'T", LEMMA: "et", NORM: "et"},
    {ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},