mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add option to disable Matcher errors (#6125)
* Add option to disable Matcher errors

* Add option to disable Matcher errors when a doc doesn't contain a particular type of annotation

Minor additional change:

* Update `AttributeRuler.load_from_morph_rules` to allow direct `MORPH` values

* Rename suppress_errors to allow_missing

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>

* Refactor annotation checks in Matcher and PhraseMatcher

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
parent
c7eedd3534
commit
59340606b7
|
@ -401,10 +401,6 @@ class Errors:
|
||||||
"Matcher or PhraseMatcher with the attribute {attr}. "
|
"Matcher or PhraseMatcher with the attribute {attr}. "
|
||||||
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
"Try using nlp() instead of nlp.make_doc() or list(nlp.pipe()) "
|
||||||
"instead of list(nlp.tokenizer.pipe()).")
|
"instead of list(nlp.tokenizer.pipe()).")
|
||||||
E156 = ("The pipeline needs to include a parser in order to use "
|
|
||||||
"Matcher or PhraseMatcher with the attribute DEP. Try using "
|
|
||||||
"nlp() instead of nlp.make_doc() or list(nlp.pipe()) instead of "
|
|
||||||
"list(nlp.tokenizer.pipe()).")
|
|
||||||
E157 = ("Can't render negative values for dependency arc start or end. "
|
E157 = ("Can't render negative values for dependency arc start or end. "
|
||||||
"Make sure that you're passing in absolute token indices, not "
|
"Make sure that you're passing in absolute token indices, not "
|
||||||
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
||||||
|
|
|
@ -195,7 +195,7 @@ cdef class Matcher:
|
||||||
else:
|
else:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def __call__(self, object doclike, *, as_spans=False):
|
def __call__(self, object doclike, *, as_spans=False, allow_missing=False):
|
||||||
"""Find all token sequences matching the supplied pattern.
|
"""Find all token sequences matching the supplied pattern.
|
||||||
|
|
||||||
doclike (Doc or Span): The document to match over.
|
doclike (Doc or Span): The document to match over.
|
||||||
|
@ -215,16 +215,19 @@ cdef class Matcher:
|
||||||
else:
|
else:
|
||||||
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
raise ValueError(Errors.E195.format(good="Doc or Span", got=type(doclike).__name__))
|
||||||
cdef Pool tmp_pool = Pool()
|
cdef Pool tmp_pool = Pool()
|
||||||
if TAG in self._seen_attrs and not doc.has_annotation("TAG"):
|
if not allow_missing:
|
||||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
for attr in (TAG, POS, MORPH, LEMMA, DEP):
|
||||||
if POS in self._seen_attrs and not doc.has_annotation("POS"):
|
if attr in self._seen_attrs and not doc.has_annotation(attr):
|
||||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
if attr == TAG:
|
||||||
if MORPH in self._seen_attrs and not doc.has_annotation("MORPH"):
|
pipe = "tagger"
|
||||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
elif attr in (POS, MORPH):
|
||||||
if LEMMA in self._seen_attrs and not doc.has_annotation("LEMMA"):
|
pipe = "morphologizer"
|
||||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
elif attr == LEMMA:
|
||||||
if DEP in self._seen_attrs and not doc.has_annotation("DEP"):
|
pipe = "lemmatizer"
|
||||||
raise ValueError(Errors.E156.format())
|
elif attr == DEP:
|
||||||
|
pipe = "parser"
|
||||||
|
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
|
||||||
|
raise ValueError(error_msg)
|
||||||
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
matches = find_matches(&self.patterns[0], self.patterns.size(), doclike, length,
|
||||||
extensions=self._extensions, predicates=self._extra_predicates)
|
extensions=self._extensions, predicates=self._extra_predicates)
|
||||||
final_matches = []
|
final_matches = []
|
||||||
|
|
|
@ -186,16 +186,18 @@ cdef class PhraseMatcher:
|
||||||
if isinstance(doc, Doc):
|
if isinstance(doc, Doc):
|
||||||
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
attrs = (TAG, POS, MORPH, LEMMA, DEP)
|
||||||
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
has_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||||
if self.attr == TAG and not has_annotation[TAG]:
|
for attr in attrs:
|
||||||
raise ValueError(Errors.E155.format(pipe="tagger", attr="TAG"))
|
if self.attr == attr and not has_annotation[attr]:
|
||||||
if self.attr == POS and not has_annotation[POS]:
|
if attr == TAG:
|
||||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="POS"))
|
pipe = "tagger"
|
||||||
if self.attr == MORPH and not has_annotation[MORPH]:
|
elif attr in (POS, MORPH):
|
||||||
raise ValueError(Errors.E155.format(pipe="morphologizer", attr="MORPH"))
|
pipe = "morphologizer"
|
||||||
if self.attr == LEMMA and not has_annotation[LEMMA]:
|
elif attr == LEMMA:
|
||||||
raise ValueError(Errors.E155.format(pipe="lemmatizer", attr="LEMMA"))
|
pipe = "lemmatizer"
|
||||||
if self.attr == DEP and not has_annotation[DEP]:
|
elif attr == DEP:
|
||||||
raise ValueError(Errors.E156.format())
|
pipe = "parser"
|
||||||
|
error_msg = Errors.E155.format(pipe=pipe, attr=self.vocab.strings.as_string(attr))
|
||||||
|
raise ValueError(error_msg)
|
||||||
if self._validate and any(has_annotation.values()) \
|
if self._validate and any(has_annotation.values()) \
|
||||||
and self.attr not in attrs:
|
and self.attr not in attrs:
|
||||||
string_attr = self.vocab.strings[self.attr]
|
string_attr = self.vocab.strings[self.attr]
|
||||||
|
|
|
@ -79,7 +79,7 @@ class AttributeRuler(Pipe):
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
DOCS: https://nightly.spacy.io/api/attributeruler#call
|
||||||
"""
|
"""
|
||||||
matches = sorted(self.matcher(doc))
|
matches = sorted(self.matcher(doc, allow_missing=True))
|
||||||
|
|
||||||
for match_id, start, end in matches:
|
for match_id, start, end in matches:
|
||||||
span = Span(doc, start, end, label=match_id)
|
span = Span(doc, start, end, label=match_id)
|
||||||
|
@ -126,8 +126,12 @@ class AttributeRuler(Pipe):
|
||||||
for tag, attrs in tag_map.items():
|
for tag, attrs in tag_map.items():
|
||||||
pattern = [{"TAG": tag}]
|
pattern = [{"TAG": tag}]
|
||||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||||
|
if "MORPH" not in attrs:
|
||||||
morph = self.vocab.morphology.add(morph_attrs)
|
morph = self.vocab.morphology.add(morph_attrs)
|
||||||
attrs["MORPH"] = self.vocab.strings[morph]
|
attrs["MORPH"] = self.vocab.strings[morph]
|
||||||
|
else:
|
||||||
|
morph = self.vocab.morphology.add(attrs["MORPH"])
|
||||||
|
attrs["MORPH"] = self.vocab.strings[morph]
|
||||||
self.add([pattern], attrs)
|
self.add([pattern], attrs)
|
||||||
|
|
||||||
def load_from_morph_rules(
|
def load_from_morph_rules(
|
||||||
|
@ -146,6 +150,10 @@ class AttributeRuler(Pipe):
|
||||||
pattern = [{"ORTH": word, "TAG": tag}]
|
pattern = [{"ORTH": word, "TAG": tag}]
|
||||||
attrs = morph_rules[tag][word]
|
attrs = morph_rules[tag][word]
|
||||||
attrs, morph_attrs = _split_morph_attrs(attrs)
|
attrs, morph_attrs = _split_morph_attrs(attrs)
|
||||||
|
if "MORPH" in attrs:
|
||||||
|
morph = self.vocab.morphology.add(attrs["MORPH"])
|
||||||
|
attrs["MORPH"] = self.vocab.strings[morph]
|
||||||
|
elif morph_attrs:
|
||||||
morph = self.vocab.morphology.add(morph_attrs)
|
morph = self.vocab.morphology.add(morph_attrs)
|
||||||
attrs["MORPH"] = self.vocab.strings[morph]
|
attrs["MORPH"] = self.vocab.strings[morph]
|
||||||
self.add([pattern], attrs)
|
self.add([pattern], attrs)
|
||||||
|
|
|
@ -316,6 +316,9 @@ def test_attr_pipeline_checks(en_vocab):
|
||||||
matcher(doc2)
|
matcher(doc2)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
matcher(doc3)
|
matcher(doc3)
|
||||||
|
# errors can be suppressed if desired
|
||||||
|
matcher(doc2, allow_missing=True)
|
||||||
|
matcher(doc3, allow_missing=True)
|
||||||
# TAG, POS, LEMMA require those values
|
# TAG, POS, LEMMA require those values
|
||||||
for attr in ("TAG", "POS", "LEMMA"):
|
for attr in ("TAG", "POS", "LEMMA"):
|
||||||
matcher = Matcher(en_vocab)
|
matcher = Matcher(en_vocab)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user