Add warning when Matcher subpattern is discarded (#5873)

* Add a warning when a subpattern is not processed and discarded
* Normalize subpattern attribute/operator keys to upper case like top-level attributes
2025-07-31 18:39:49 +03:00 · 2020-08-05 14:56:14 +02:00 · 2020-08-05 14:56:14 +02:00 · 4193402c47
commit 4193402c47
parent 9e45d064bb
3 changed files with 18 additions and 2 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -133,6 +133,8 @@ class Warnings(object):
            "normalization table, please ignore this warning.")
    W034 = ("Please install the package spacy-lookups-data in order to include "
            "the default lexeme normalization table for the language '{lang}'.")
+    W035 = ('Discarding subpattern "{pattern}" due to an unrecognized '
+            "attribute or operator.")


@add_codes
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -811,9 +811,11 @@ def _get_extra_predicates(spec, extra_predicates):
                attr = "ORTH"
            attr = IDS.get(attr.upper())
        if isinstance(value, dict):
+            processed = False
+            value_with_upper_keys = {k.upper(): v for k, v in value.items()}
            for type_, cls in predicate_types.items():
-                if type_ in value:
-                    predicate = cls(len(extra_predicates), attr, value[type_], type_)
+                if type_ in value_with_upper_keys:
+                    predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_)
                    # Don't create a redundant predicates.
                    # This helps with efficiency, as we're caching the results.
                    if predicate.key in seen_predicates:
@@ -822,6 +824,9 @@ def _get_extra_predicates(spec, extra_predicates):
                        extra_predicates.append(predicate)
                        output.append(predicate.i)
                        seen_predicates[predicate.key] = predicate.i
+                    processed = True
+            if not processed:
+                warnings.warn(Warnings.W035.format(pattern=value))
    return output


--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -76,3 +76,12 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
            matcher.add("TEST", [pattern])
    elif n_errors == 0:
        matcher.add("TEST", [pattern])
+
+
+def test_pattern_warnings(en_vocab):
+    matcher = Matcher(en_vocab)
+    # normalize "regex" to upper like "text"
+    matcher.add("TEST1", [[{"text": {"regex": "regex"}}]])
+    # warn if subpattern attribute isn't recognized and processed
+    with pytest.warns(UserWarning):
+        matcher.add("TEST2", [[{"TEXT": {"XX": "xx"}}]])