Add option to disable Matcher errors (#6125)

* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a particular type of annotation

  Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH` values

* Rename suppress_errors to allow_missing

  Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

  Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2025-07-16 03:02:41 +03:00 · 2020-09-24 16:54:39 +02:00 · 2020-09-24 16:54:39 +02:00 · 59340606b7
commit 59340606b7
parent c7eedd3534
5 changed files with 42 additions and 30 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -401,10 +401,6 @@ class Errors:
            "Matcher or PhraseMatcher with the attribute {attr}. "
            "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
            "instead of list(nlp.tokenizer.pipe()).")
-    E156 = ("The pipeline needs to include a parser in order to use "
-            "Matcher or PhraseMatcher with the attribute DEP. Try using "
-            "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
-            "list(nlp.tokenizer.pipe()).")
    E157 = ("Can't render negative values for dependency arc start or end. "
            "Make sure that you're passing in absolute token indices, not "
            "relative token offsets.\nstart: {start}, end: {end}, label: "
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -195,7 +195,7 @@ cdef class Matcher:
                else:
                    yield doc

-    def __call__(self, object doclike, *, as_spans=False):
+    def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
        """Find all token sequences matching the supplied pattern.

        doclike (Doc or Span): The document to match over.
@ -215,16 +215,19 @@ cdef class Matcher:
        else:
            raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
        cdef Pool tmp_pool = Pool()
-        if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
-            raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-        if POS in self._seen_attrs and not doc.has_annotation("POS"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-        if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
-            raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-        if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
-            raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-        if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
-            raise ValueError(Errors.E156.format())
+        if not allow_missing:
+            for attr in (TAG, POS, MORPH, LEMMA, DEP):
+                if attr in self._seen_attrs and not doc.has_annotation(attr):
+                    if attr == TAG:
+                        pipe = "tagger"
+                    elif attr in (POS, MORPH):
+                        pipe = "morphologizer"
+                    elif attr == LEMMA:
+                        pipe = "lemmatizer"
+                    elif attr == DEP:
+                        pipe = "parser"
+                    error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                    raise ValueError(error_msg)
        matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
                                extensions=self._extensions, predicates=self._extra_predicates)
        final_matches = []
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -186,16 +186,18 @@ cdef class PhraseMatcher:
            if isinstance(doc, Doc):
                attrs = (TAG, POS, MORPH, LEMMA, DEP)
                has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
-                if self.attr == TAG and not has_annotation[TAG]:
-                    raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
-                if self.attr == POS and not has_annotation[POS]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
-                if self.attr == MORPH and not has_annotation[MORPH]:
-                    raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
-                if self.attr == LEMMA and not has_annotation[LEMMA]:
-                    raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
-                if self.attr == DEP and not has_annotation[DEP]:
-                    raise ValueError(Errors.E156.format())
+                for attr in attrs:
+                    if self.attr == attr and not has_annotation[attr]:
+                        if attr == TAG:
+                            pipe = "tagger"
+                        elif attr in (POS, MORPH):
+                            pipe = "morphologizer"
+                        elif attr == LEMMA:
+                            pipe = "lemmatizer"
+                        elif attr == DEP:
+                            pipe = "parser"
+                        error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
+                        raise ValueError(error_msg)
                if self._validate and any(has_annotation.values()) \
                        and self.attr not in attrs:
                    string_attr = self.vocab.strings[self.attr]
--- a/spacy/pipeline/attributeruler.py
+++ b/spacy/pipeline/attributeruler.py
@ -79,7 +79,7 @@ class AttributeRuler(Pipe):

        DOCS: https://nightly.spacy.io/api/attributeruler#call
        """
-        matches = sorted(self.matcher(doc))
+        matches = sorted(self.matcher(doc, allow_missing=True))

        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
        for tag, attrs in tag_map.items():
            pattern = [{"TAG": tag}]
            attrs, morph_attrs = _split_morph_attrs(attrs)
+            if "MORPH" not in attrs:
                morph = self.vocab.morphology.add(morph_attrs)
                attrs["MORPH"] = self.vocab.strings[morph]
+            else:
+                morph = self.vocab.morphology.add(attrs["MORPH"])
+                attrs["MORPH"] = self.vocab.strings[morph]
            self.add([pattern], attrs)

    def load_from_morph_rules(
@ -146,6 +150,10 @@ class AttributeRuler(Pipe):
                pattern = [{"ORTH": word, "TAG": tag}]
                attrs = morph_rules[tag][word]
                attrs, morph_attrs = _split_morph_attrs(attrs)
+                if "MORPH" in attrs:
+                    morph = self.vocab.morphology.add(attrs["MORPH"])
+                    attrs["MORPH"] = self.vocab.strings[morph]
+                elif morph_attrs:
                    morph = self.vocab.morphology.add(morph_attrs)
                    attrs["MORPH"] = self.vocab.strings[morph]
                self.add([pattern], attrs)
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
        matcher(doc2)
    with pytest.raises(ValueError):
        matcher(doc3)
+    # errors can be suppressed if desired
+    matcher(doc2, allow_missing=True)
+    matcher(doc3, allow_missing=True)
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = Matcher(en_vocab)