Add option to disable Matcher errors (#6125)

* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a
particular type of annotation

Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` (and the tag-map loading
loop) to allow direct `MORPH` values

* Rename suppress_errors to allow_missing

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Adriane Boyd 2020-09-24 16:54:39 +02:00 committed by GitHub
parent c7eedd3534
commit 59340606b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 42 additions and 30 deletions

View File

@ -401,10 +401,6 @@ class Errors:
"Matcher or PhraseMatcher with the attribute {attr}. " "Matcher or PhraseMatcher with the attribute {attr}. "
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) " "Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
"instead of list(nlp.tokenizer.pipe()).") "instead of list(nlp.tokenizer.pipe()).")
E156 = ("The pipeline needs to include a parser in order to use "
"Matcher or PhraseMatcher with the attribute DEP. Try using "
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
"list(nlp.tokenizer.pipe()).")
E157 = ("Can't render negative values for dependency arc start or end. " E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not " "Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: " "relative token offsets.\nstart: {start}, end: {end}, label: "

View File

@ -195,7 +195,7 @@ cdef class Matcher:
else: else:
yield doc yield doc
def __call__(self, object doclike, *, as_spans=False): def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
"""Find all token sequences matching the supplied pattern. """Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over. doclike (Doc or Span): The document to match over.
@ -215,16 +215,19 @@ cdef class Matcher:
else: else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__)) raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool() cdef Pool tmp_pool = Pool()
if TAG in self._seen_attrs and not doc.has_annotation("TAG"): if not allow_missing:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) for attr in (TAG, POS, MORPH, LEMMA, DEP):
if POS in self._seen_attrs and not doc.has_annotation("POS"): if attr in self._seen_attrs and not doc.has_annotation(attr):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) if attr == TAG:
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"): pipe = "tagger"
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) elif attr in (POS, MORPH):
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"): pipe = "morphologizer"
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) elif attr == LEMMA:
if DEP in self._seen_attrs and not doc.has_annotation("DEP"): pipe = "lemmatizer"
raise ValueError(Errors.E156.format()) elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length, matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates) extensions=self._extensions, predicates=self._extra_predicates)
final_matches = [] final_matches = []

View File

@ -186,16 +186,18 @@ cdef class PhraseMatcher:
if isinstance(doc, Doc): if isinstance(doc, Doc):
attrs = (TAG, POS, MORPH, LEMMA, DEP) attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs} has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
if self.attr == TAG and not has_annotation[TAG]: for attr in attrs:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG")) if self.attr == attr and not has_annotation[attr]:
if self.attr == POS and not has_annotation[POS]: if attr == TAG:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS")) pipe = "tagger"
if self.attr == MORPH and not has_annotation[MORPH]: elif attr in (POS, MORPH):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH")) pipe = "morphologizer"
if self.attr == LEMMA and not has_annotation[LEMMA]: elif attr == LEMMA:
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA")) pipe = "lemmatizer"
if self.attr == DEP and not has_annotation[DEP]: elif attr == DEP:
raise ValueError(Errors.E156.format()) pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
if self._validate and any(has_annotation.values()) \ if self._validate and any(has_annotation.values()) \
and self.attr not in attrs: and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr] string_attr = self.vocab.strings[self.attr]

View File

@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call DOCS: https://nightly.spacy.io/api/attributeruler#call
""" """
matches = sorted(self.matcher(doc)) matches = sorted(self.matcher(doc, allow_missing=True))
for match_id, start, end in matches: for match_id, start, end in matches:
span = Span(doc, start, end, label=match_id) span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
for tag, attrs in tag_map.items(): for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}] pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs) attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs) if "MORPH" not in attrs:
attrs["MORPH"] = self.vocab.strings[morph] morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
else:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs) self.add([pattern], attrs)
def load_from_morph_rules( def load_from_morph_rules(
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
pattern = [{"ORTH": word, "TAG": tag}] pattern = [{"ORTH": word, "TAG": tag}]
attrs = morph_rules[tag][word] attrs = morph_rules[tag][word]
attrs, morph_attrs = _split_morph_attrs(attrs) attrs, morph_attrs = _split_morph_attrs(attrs)
morph = self.vocab.morphology.add(morph_attrs) if "MORPH" in attrs:
attrs["MORPH"] = self.vocab.strings[morph] morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
elif morph_attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs) self.add([pattern], attrs)
def add( def add(

View File

@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2) matcher(doc2)
with pytest.raises(ValueError): with pytest.raises(ValueError):
matcher(doc3) matcher(doc3)
# errors can be suppressed if desired
matcher(doc2, allow_missing=True)
matcher(doc3, allow_missing=True)
# TAG, POS, LEMMA require those values # TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"): for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab) matcher = Matcher(en_vocab)