new tests & tokenization fixes (#4734)

- added some tests for tokenization issues
- fixed some issues with the tokenization of words with a hyphen infix
- rewrote the "tokenizer_exceptions.py" file (adapted from the German version)
Christoph Purschke 2019-12-01 23:08:21 +01:00 committed by Ines Montani
parent 48ea2e8d0f
commit a7ee4b6f17
5 changed files with 38 additions and 12 deletions
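To make the intended behaviour concrete, here is a minimal usage sketch (not part of the commit). It assumes spaCy v2.2+, where the Luxembourgish language class is available from spacy.lang.lb:

# Usage sketch (assumption: spaCy v2.2+ with spacy.lang.lb available); not part of the diff.
from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
doc = nlp("Mee 't ass net evident, d'Liewen am Grand-Duché.")
print([t.text for t in doc])
# Expected: "'t" stays a single token via the tokenizer exceptions, "d'" is split
# off "Liewen" by the elision infix rule, and "Grand-Duché" remains one token
# because only digit-digit hyphens are treated as infixes.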

View File

@@ -6,7 +6,7 @@ from __future__ import unicode_literals
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes
_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
_exc = {"dass": "datt", "viläicht": "vläicht"}
NORM_EXCEPTIONS = {}
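For context, NORM_EXCEPTIONS is normally filled from _exc a few lines further down in the file; that loop sits outside the hunk shown above. A sketch of the usual pattern, assuming lb follows the other spaCy languages:

# Assumed continuation of norm_exceptions.py (not shown in the hunk above):
# copy each entry of _exc into NORM_EXCEPTIONS, plus its title-cased variant.
for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm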

View File

@@ -1,16 +1,23 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
ELISION = " ' ’ ".strip().replace(" ", "")
HYPHENS = r"- ".strip().replace(" ", "")
_infixes = TOKENIZER_INFIXES + [
    r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
]
_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION),
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[0-9])-(?=[0-9])",
    ]
)
TOKENIZER_INFIXES = _infixes
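The elision rule can be sanity-checked in isolation with plain re; a small illustrative sketch (the ELISION value is restated locally rather than imported from the file above):

# Standalone check of the elision infix pattern (illustrative only; not part of the diff).
import re

from spacy.lang.char_classes import ALPHA

ELISION = "'’"  # straight and typographic apostrophes
infix_re = re.compile(r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION))

print(bool(infix_re.search("d'Saach")))  # True: a split point is found after "d'"
print(bool(infix_re.search("dSaach")))   # False: no elision character, no split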

View File

@@ -10,7 +10,9 @@ _exc = {}
# translate / delete what is not necessary
for exc_data in [
{ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
{ORTH: "'t", LEMMA: "et", NORM: "et"},
{ORTH: "'T", LEMMA: "et", NORM: "et"},
{ORTH: "wgl.", LEMMA: "wannechgelift", NORM: "wannechgelift"},
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
@@ -18,7 +20,7 @@ for exc_data in [
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}
]:
_exc[exc_data[ORTH]] = [exc_data]
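For reference, such an exceptions dict is normally merged with the shared base exceptions before being exposed; a sketch of the usual spaCy v2 pattern (assumed here, not shown in this hunk):

# Assumed surrounding code following the common spaCy v2 pattern (not part of the diff):
# the imports sit at the top of the file, the merge at the bottom.
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ...util import update_exc

TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc)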

View File

@@ -3,8 +3,24 @@ from __future__ import unicode_literals
import pytest


@pytest.mark.parametrize("text", ["z.B.", "Jan."])
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 1

@pytest.mark.parametrize("text", ["d'Saach", "d'Kanner", "dWelt", "dSuen"])
def test_lb_tokenizer_splits_contractions(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 2


def test_lb_tokenizer_handles_exc_in_text(lb_tokenizer):
    text = "Mee 't ass net evident, d'Liewen."
    tokens = lb_tokenizer(text)
    assert len(tokens) == 9
    assert tokens[1].text == "'t"
    assert tokens[1].lemma_ == "et"


@pytest.mark.parametrize("text,norm", [("dass", "datt"), ("viläicht", "vläicht")])
def test_lb_norm_exceptions(lb_tokenizer, text, norm):
    tokens = lb_tokenizer(text)
    assert tokens[0].norm_ == norm
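The lb_tokenizer fixture used by these tests comes from the shared test conftest; a minimal stand-in for running them in isolation could look like this (an assumption based on spaCy's usual conftest pattern, not part of the commit):

# Minimal stand-in for the shared lb_tokenizer fixture (assumed conftest pattern).
import pytest

from spacy.util import get_lang_class


@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb").Defaults.create_tokenizer()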

View File

@@ -16,6 +16,7 @@ def test_lb_tokenizer_handles_long_text(lb_tokenizer):
    [
        ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
        ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
        ("Am Grand-Duché ass d'Liewen schéin, mee 't gëtt ze vill Autoen.", 14)
    ],
)
def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):