diff --git a/spacy/errors.py b/spacy/errors.py index fe59453c0..7f9164694 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -133,6 +133,8 @@ class Warnings(object): "normalization table, please ignore this warning.") W034 = ("Please install the package spacy-lookups-data in order to include " "the default lexeme normalization table for the language '{lang}'.") + W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' + "attribute or operator.") @add_codes diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 0c1a56187..8fbfe305a 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -811,9 +811,11 @@ def _get_extra_predicates(spec, extra_predicates): attr = "ORTH" attr = IDS.get(attr.upper()) if isinstance(value, dict): + processed = False + value_with_upper_keys = {k.upper(): v for k, v in value.items()} for type_, cls in predicate_types.items(): - if type_ in value: - predicate = cls(len(extra_predicates), attr, value[type_], type_) + if type_ in value_with_upper_keys: + predicate = cls(len(extra_predicates), attr, value_with_upper_keys[type_], type_) # Don't create a redundant predicates. # This helps with efficiency, as we're caching the results. if predicate.key in seen_predicates: @@ -822,6 +824,9 @@ def _get_extra_predicates(spec, extra_predicates): extra_predicates.append(predicate) output.append(predicate.i) seen_predicates[predicate.key] = predicate.i + processed = True + if not processed: + warnings.warn(Warnings.W035.format(pattern=value)) return output diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index c536698d0..ec2660ab4 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -76,3 +76,12 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors): matcher.add("TEST", [pattern]) elif n_errors == 0: matcher.add("TEST", [pattern]) + + +def test_pattern_warnings(en_vocab): + matcher = Matcher(en_vocab) + # normalize "regex" to upper like "text" + matcher.add("TEST1", [[{"text": {"regex": "regex"}}]]) + # warn if subpattern attribute isn't recognized and processed + with pytest.warns(UserWarning): + matcher.add("TEST2", [[{"TEXT": {"XX": "xx"}}]])