mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Add option to disable Matcher errors (#6125)
* Add option to disable Matcher errors

  * Add option to disable Matcher errors when a doc doesn't contain a particular type of annotation

  Minor additional change:

  * Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH` values

* Rename suppress_errors to allow_missing

  Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

  Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
c7eedd3534
commit
59340606b7
|
@ -401,10 +401,6 @@ class Errors:
|
|||
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
||||
"instead of list(nlp.tokenizer.pipe()).")
|
||||
E156 = ("The pipeline needs to include a parser in order to use "
|
||||
"Matcher or PhraseMatcher with the attribute DEP. Try using "
|
||||
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
|
||||
"list(nlp.tokenizer.pipe()).")
|
||||
E157 = ("Can't render negative values for dependency arc start or end. "
|
||||
"Make sure that you're passing in absolute token indices, not "
|
||||
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
||||
|
|
|
@ -195,7 +195,7 @@ cdef class Matcher:
|
|||
else:
|
||||
yield doc
|
||||
|
||||
def __call__(self, object doclike, *, as_spans=False):
|
||||
def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
|
||||
"""Find all token sequences matching the supplied pattern.
|
||||
|
||||
doclike (Doc or Span): The document to match over.
|
||||
|
@ -215,16 +215,19 @@ cdef class Matcher:
|
|||
else:
|
||||
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||
cdef Pool tmp_pool = Pool()
|
||||
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
|
||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
||||
if POS in self._seen_attrs and not doc.has_annotation("POS"):
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
||||
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
||||
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
|
||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
||||
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
|
||||
raise ValueError(Errors.E156.format())
|
||||
if not allow_missing:
|
||||
for attr in (TAG, POS, MORPH, LEMMA, DEP):
|
||||
if attr in self._seen_attrs and not doc.has_annotation(attr):
|
||||
if attr == TAG:
|
||||
pipe = "tagger"
|
||||
elif attr in (POS, MORPH):
|
||||
pipe = "morphologizer"
|
||||
elif attr == LEMMA:
|
||||
pipe = "lemmatizer"
|
||||
elif attr == DEP:
|
||||
pipe = "parser"
|
||||
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
|
||||
raise ValueError(error_msg)
|
||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||
extensions=self._extensions, predicates=self._extra_predicates)
|
||||
final_matches = []
|
||||
|
|
|
@ -186,16 +186,18 @@ cdef class PhraseMatcher:
|
|||
if isinstance(doc, Doc):
|
||||
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
||||
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
if self.attr == TAG and not has_annotation[TAG]:
|
||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
||||
if self.attr == POS and not has_annotation[POS]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
||||
if self.attr == MORPH and not has_annotation[MORPH]:
|
||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
||||
if self.attr == LEMMA and not has_annotation[LEMMA]:
|
||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
||||
if self.attr == DEP and not has_annotation[DEP]:
|
||||
raise ValueError(Errors.E156.format())
|
||||
for attr in attrs:
|
||||
if self.attr == attr and not has_annotation[attr]:
|
||||
if attr == TAG:
|
||||
pipe = "tagger"
|
||||
elif attr in (POS, MORPH):
|
||||
pipe = "morphologizer"
|
||||
elif attr == LEMMA:
|
||||
pipe = "lemmatizer"
|
||||
elif attr == DEP:
|
||||
pipe = "parser"
|
||||
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
|
||||
raise ValueError(error_msg)
|
||||
if self._validate and any(has_annotation.values()) \
|
||||
and self.attr not in attrs:
|
||||
string_attr = self.vocab.strings[self.attr]
|
||||
|
|
|
@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
|
|||
|
||||
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
||||
"""
|
||||
matches = sorted(self.matcher(doc))
|
||||
matches = sorted(self.matcher(doc, allow_missing=True))
|
||||
|
||||
for match_id, start, end in matches:
|
||||
span = Span(doc, start, end, label=match_id)
|
||||
|
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
|
|||
for tag, attrs in tag_map.items():
|
||||
pattern = [{"TAG": tag}]
|
||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||
morph = self.vocab.morphology.add(morph_attrs)
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
if "MORPH" not in attrs:
|
||||
morph = self.vocab.morphology.add(morph_attrs)
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
else:
|
||||
morph = self.vocab.morphology.add(attrs["MORPH"])
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
self.add([pattern], attrs)
|
||||
|
||||
def load_from_morph_rules(
|
||||
|
@ -146,8 +150,12 @@ class AttributeRuler(Pipe):
|
|||
pattern = [{"ORTH": word, "TAG": tag}]
|
||||
attrs = morph_rules[tag][word]
|
||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||
morph = self.vocab.morphology.add(morph_attrs)
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
if "MORPH" in attrs:
|
||||
morph = self.vocab.morphology.add(attrs["MORPH"])
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
elif morph_attrs:
|
||||
morph = self.vocab.morphology.add(morph_attrs)
|
||||
attrs["MORPH"] = self.vocab.strings[morph]
|
||||
self.add([pattern], attrs)
|
||||
|
||||
def add(
|
||||
|
|
|
@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
|
|||
matcher(doc2)
|
||||
with pytest.raises(ValueError):
|
||||
matcher(doc3)
|
||||
# errors can be suppressed if desired
|
||||
matcher(doc2, allow_missing=True)
|
||||
matcher(doc3, allow_missing=True)
|
||||
# TAG, POS, LEMMA require those values
|
||||
for attr in ("TAG", "POS", "LEMMA"):
|
||||
matcher = Matcher(en_vocab)
|
||||
|
|
Loading…
Reference in New Issue
Block a user