Add option to disable Matcher errors (#6125)

* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a
particular type of annotation

Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` (and the tag-map loader) to
allow direct `MORPH` values

* Rename `suppress_errors` to `allow_missing`

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Adriane Boyd 2020-09-24 16:54:39 +02:00 committed by GitHub
parent c7eedd3534
commit 59340606b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 42 additions and 30 deletions

View File

@ -401,10 +401,6 @@ class Errors:
"Matcher or PhraseMatcher with the attribute {attr}. "
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
"instead of list(nlp.tokenizer.pipe()).")
E156 = ("The pipeline needs to include a parser in order to use "
"Matcher or PhraseMatcher with the attribute DEP. Try using "
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
"list(nlp.tokenizer.pipe()).")
E157 = ("Can't render negative values for dependency arc start or end. "
"Make sure that you're passing in absolute token indices, not "
"relative token offsets.\nstart: {start}, end: {end}, label: "

View File

@ -195,7 +195,7 @@ cdef class Matcher:
else:
yield doc
def __call__(self, object doclike, *, as_spans=False):
def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
"""Find all token sequences matching the supplied pattern.
doclike (Doc or Span): The document to match over.
@ -215,16 +215,19 @@ cdef class Matcher:
else:
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
cdef Pool tmp_pool = Pool()
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
if POS in self._seen_attrs and not doc.has_annotation("POS"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
raise ValueError(Errors.E156.format())
if not allow_missing:
for attr in (TAG, POS, MORPH, LEMMA, DEP):
if attr in self._seen_attrs and not doc.has_annotation(attr):
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
extensions=self._extensions, predicates=self._extra_predicates)
final_matches = []

View File

@ -186,16 +186,18 @@ cdef class PhraseMatcher:
if isinstance(doc, Doc):
attrs = (TAG, POS, MORPH, LEMMA, DEP)
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
if self.attr == TAG and not has_annotation[TAG]:
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
if self.attr == POS and not has_annotation[POS]:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
if self.attr == MORPH and not has_annotation[MORPH]:
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
if self.attr == LEMMA and not has_annotation[LEMMA]:
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
if self.attr == DEP and not has_annotation[DEP]:
raise ValueError(Errors.E156.format())
for attr in attrs:
if self.attr == attr and not has_annotation[attr]:
if attr == TAG:
pipe = "tagger"
elif attr in (POS, MORPH):
pipe = "morphologizer"
elif attr == LEMMA:
pipe = "lemmatizer"
elif attr == DEP:
pipe = "parser"
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
raise ValueError(error_msg)
if self._validate and any(has_annotation.values()) \
and self.attr not in attrs:
string_attr = self.vocab.strings[self.attr]

View File

@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
DOCS: https://nightly.spacy.io/api/attributeruler#call
"""
matches = sorted(self.matcher(doc))
matches = sorted(self.matcher(doc, allow_missing=True))
for match_id, start, end in matches:
span = Span(doc, start, end, label=match_id)
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
for tag, attrs in tag_map.items():
pattern = [{"TAG": tag}]
attrs, morph_attrs = _split_morph_attrs(attrs)
if "MORPH" not in attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
else:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)
def load_from_morph_rules(
@ -146,6 +150,10 @@ class AttributeRuler(Pipe):
pattern = [{"ORTH": word, "TAG": tag}]
attrs = morph_rules[tag][word]
attrs, morph_attrs = _split_morph_attrs(attrs)
if "MORPH" in attrs:
morph = self.vocab.morphology.add(attrs["MORPH"])
attrs["MORPH"] = self.vocab.strings[morph]
elif morph_attrs:
morph = self.vocab.morphology.add(morph_attrs)
attrs["MORPH"] = self.vocab.strings[morph]
self.add([pattern], attrs)

View File

@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc2)
with pytest.raises(ValueError):
matcher(doc3)
# errors can be suppressed if desired
matcher(doc2, allow_missing=True)
matcher(doc3, allow_missing=True)
# TAG, POS, LEMMA require those values
for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab)