From 59340606b7881928c924e4c11bc59192522fedb8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 24 Sep 2020 16:54:39 +0200 Subject: [PATCH] Add option to disable Matcher errors (#6125) * Add option to disable Matcher errors * Add option to disable Matcher errors when a doc doesn't contain a particular type of annotation Minor additional change: * Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH` values * Rename suppress_errors to allow_missing Co-authored-by: Matthew Honnibal * Refactor annotation checks in Matcher and PhraseMatcher Co-authored-by: Matthew Honnibal --- spacy/errors.py | 4 ---- spacy/matcher/matcher.pyx | 25 ++++++++++++++----------- spacy/matcher/phrasematcher.pyx | 22 ++++++++++++---------- spacy/pipeline/attributeruler.py | 18 +++++++++++++----- spacy/tests/matcher/test_matcher_api.py | 3 +++ 5 files changed, 42 insertions(+), 30 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 50d2fea5f..4216e3936 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -401,10 +401,6 @@ class Errors: "Matcher or PhraseMatcher with the attribute {attr}. " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "instead of list(nlp.tokenizer.pipe()).") - E156 = ("The pipeline needs to include a parser in order to use " - "Matcher or PhraseMatcher with the attribute DEP. Try using " - "nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of " - "list(nlp.tokenizer.pipe()).") E157 = ("Can't render negative values for dependency arc start or end. " "Make sure that you're passing in absolute token indices, not " "relative token offsets.\nstart: {start}, end: {end}, label: " diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index d83f58181..39c7168e4 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -195,7 +195,7 @@ cdef class Matcher: else: yield doc - def __call__(self, object doclike, *, as_spans=False): + def __call__(self, object doclike, *, as_spans=False, allow_missing=False): """Find all token sequences matching the supplied pattern. doclike (Doc or Span): The document to match over. @@ -215,16 +215,19 @@ cdef class Matcher: else: raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) cdef Pool tmp_pool = Pool() - if TAG in self._seen_attrs and not doc.has_annotation("TAG"): - raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) - if POS in self._seen_attrs and not doc.has_annotation("POS"): - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) - if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) - if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): - raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) - if DEP in self._seen_attrs and not doc.has_annotation("DEP"): - raise ValueError(Errors.E156.format()) + if not allow_missing: + for attr in (TAG, POS, MORPH, LEMMA, DEP): + if attr in self._seen_attrs and not doc.has_annotation(attr): + if attr == TAG: + pipe = "tagger" + elif attr in (POS, MORPH): + pipe = "morphologizer" + elif attr == LEMMA: + pipe = "lemmatizer" + elif attr == DEP: + pipe = "parser" + error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) + raise ValueError(error_msg) matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, extensions=self._extensions, predicates=self._extra_predicates) final_matches = [] diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index b00ba157f..7e99859b5 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -186,16 +186,18 @@ cdef class PhraseMatcher: if isinstance(doc, Doc): attrs = (TAG, POS, MORPH, LEMMA, DEP) has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} - if self.attr == TAG and not has_annotation[TAG]: - raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) - if self.attr == POS and not has_annotation[POS]: - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) - if self.attr == MORPH and not has_annotation[MORPH]: - raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) - if self.attr == LEMMA and not has_annotation[LEMMA]: - raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) - if self.attr == DEP and not has_annotation[DEP]: - raise ValueError(Errors.E156.format()) + for attr in attrs: + if self.attr == attr and not has_annotation[attr]: + if attr == TAG: + pipe = "tagger" + elif attr in (POS, MORPH): + pipe = "morphologizer" + elif attr == LEMMA: + pipe = "lemmatizer" + elif attr == DEP: + pipe = "parser" + error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr)) + raise ValueError(error_msg) if self._validate and any(has_annotation.values()) \ and self.attr not in attrs: string_attr = self.vocab.strings[self.attr] diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index f64fcbc54..0d59a1ba0 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -79,7 +79,7 @@ class AttributeRuler(Pipe): DOCS: https://nightly.spacy.io/api/attributeruler#call """ - matches = sorted(self.matcher(doc)) + matches = sorted(self.matcher(doc, allow_missing=True)) for match_id, start, end in matches: span = Span(doc, start, end, label=match_id) @@ -126,8 +126,12 @@ class AttributeRuler(Pipe): for tag, attrs in tag_map.items(): pattern = [{"TAG": tag}] attrs, morph_attrs = _split_morph_attrs(attrs) - morph = self.vocab.morphology.add(morph_attrs) - attrs["MORPH"] = self.vocab.strings[morph] + if "MORPH" not in attrs: + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] + else: + morph = self.vocab.morphology.add(attrs["MORPH"]) + attrs["MORPH"] = self.vocab.strings[morph] self.add([pattern], attrs) def load_from_morph_rules( @@ -146,8 +150,12 @@ class AttributeRuler(Pipe): pattern = [{"ORTH": word, "TAG": tag}] attrs = morph_rules[tag][word] attrs, morph_attrs = _split_morph_attrs(attrs) - morph = self.vocab.morphology.add(morph_attrs) - attrs["MORPH"] = self.vocab.strings[morph] + if "MORPH" in attrs: + morph = self.vocab.morphology.add(attrs["MORPH"]) + attrs["MORPH"] = self.vocab.strings[morph] + elif morph_attrs: + morph = self.vocab.morphology.add(morph_attrs) + attrs["MORPH"] = self.vocab.strings[morph] self.add([pattern], attrs) def add( diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 04f9585f1..c407595e5 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab): matcher(doc2) with pytest.raises(ValueError): matcher(doc3) + # errors can be suppressed if desired + matcher(doc2, allow_missing=True) + matcher(doc3, allow_missing=True) # TAG, POS, LEMMA require those values for attr in ("TAG", "POS", "LEMMA"): matcher = Matcher(en_vocab)