Remove English exceptions with mismatched features (#10873)

Remove English contraction exceptions with mismatched features that lead
to exceptions like "theses" and "thisre".
This commit is contained in:
Adriane Boyd 2022-06-03 09:44:04 +02:00 committed by GitHub
parent 41389ffe1e
commit 727ce6d1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 45 additions and 34 deletions

View File

@@ -35,7 +35,7 @@ for pron in ["i"]:
_exc[orth + "m"] = [ _exc[orth + "m"] = [
{ORTH: orth, NORM: pron}, {ORTH: orth, NORM: pron},
{ORTH: "m", "tenspect": 1, "number": 1}, {ORTH: "m"},
] ]
_exc[orth + "'ma"] = [ _exc[orth + "'ma"] = [
@@ -139,26 +139,27 @@ for pron in ["he", "she", "it"]:
# W-words, relative pronouns, prepositions etc. # W-words, relative pronouns, prepositions etc.
for word in [ for word, morph in [
"who", ("who", None),
"what", ("what", None),
"when", ("when", None),
"where", ("where", None),
"why", ("why", None),
"how", ("how", None),
"there", ("there", None),
"that", ("that", "Number=Sing|Person=3"),
"this", ("this", "Number=Sing|Person=3"),
"these", ("these", "Number=Plur|Person=3"),
"those", ("those", "Number=Plur|Person=3"),
]: ]:
for orth in [word, word.title()]: for orth in [word, word.title()]:
_exc[orth + "'s"] = [ if morph != "Number=Plur|Person=3":
{ORTH: orth, NORM: word}, _exc[orth + "'s"] = [
{ORTH: "'s", NORM: "'s"}, {ORTH: orth, NORM: word},
] {ORTH: "'s", NORM: "'s"},
]
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}] _exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
_exc[orth + "'ll"] = [ _exc[orth + "'ll"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
@@ -182,25 +183,26 @@ for word in [
{ORTH: "ve", NORM: "have"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'re"] = [ if morph != "Number=Sing|Person=3":
{ORTH: orth, NORM: word}, _exc[orth + "'re"] = [
{ORTH: "'re", NORM: "are"}, {ORTH: orth, NORM: word},
] {ORTH: "'re", NORM: "are"},
]
_exc[orth + "re"] = [ _exc[orth + "re"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "re", NORM: "are"}, {ORTH: "re", NORM: "are"},
] ]
_exc[orth + "'ve"] = [ _exc[orth + "'ve"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},
{ORTH: "'ve"}, {ORTH: "'ve"},
] ]
_exc[orth + "ve"] = [ _exc[orth + "ve"] = [
{ORTH: orth}, {ORTH: orth},
{ORTH: "ve", NORM: "have"}, {ORTH: "ve", NORM: "have"},
] ]
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, NORM: word}, {ORTH: orth, NORM: word},

View File

@@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1] tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms # 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop assert tok.is_stop
@pytest.mark.issue(10699)
@pytest.mark.parametrize("text", ["theses", "thisre"])
def test_issue10699(en_tokenizer, text):
    """Mismatched-feature contractions such as 'theses' and 'thisre'
    must not be produced by the English tokenizer exceptions, so each
    input should remain a single token."""
    doc = en_tokenizer(text)
    assert len(doc) == 1