Throw an informative error when running the components with the wrong type of objects

This commit is contained in:
svlandeg 2020-06-18 10:36:05 +02:00
parent 6712d0b5db
commit 9f43ba839a
4 changed files with 54 additions and 24 deletions

View File

@ -579,6 +579,8 @@ class Errors(object):
"table, which contains {n_rows} vectors.")
# TODO: fix numbering after merging develop into master
E978 = ("The {method} method of component {name} takes a list of Example objects, "
"but found {types} instead.")
E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.")

View File

@ -286,14 +286,16 @@ class Tagger(Pipe):
doc.is_tagged = True
def update(self, examples, drop=0., sgd=None, losses=None, set_annotations=False):
for eg in examples:
assert isinstance(eg, Example)
if losses is not None and self.name not in losses:
losses[self.name] = 0.
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
try:
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="Tagger", method="update", types=types))
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(
[eg.predicted for eg in examples])
@ -315,7 +317,11 @@ class Tagger(Pipe):
"""Perform a 'rehearsal' update, where we try to match the output of
an initial model.
"""
docs = [eg.predicted for eg in examples]
try:
docs = [eg.predicted for eg in examples]
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="Tagger", method="rehearse", types=types))
if self._rehearsal_model is None:
return
if not any(len(doc) for doc in docs):
@ -347,7 +353,11 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = {}
for example in get_examples():
for token in example.y:
try:
y = example.y
except AttributeError:
raise ValueError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example)))
for token in y:
tag = token.tag_
if tag in orig_tag_map:
new_tag_map[tag] = orig_tag_map[tag]
@ -771,10 +781,12 @@ class ClozeMultitask(Pipe):
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if losses is not None and self.name not in losses:
losses[self.name] = 0.
docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
predictions, bp_predictions = self.model.begin_update(
[eg.predicted for eg in examples])
try:
predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples])
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types))
loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions)
bp_predictions(d_predictions)
if sgd is not None:
@ -834,11 +846,13 @@ class TextCategorizer(Pipe):
doc.cats[label] = float(scores[i, j])
def update(self, examples, state=None, drop=0., set_annotations=False, sgd=None, losses=None):
for eg in examples:
assert isinstance(eg, Example)
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
try:
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
# Handle cases where there are no tokens in any docs.
return
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=types))
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(
[eg.predicted for eg in examples]
@ -857,9 +871,11 @@ class TextCategorizer(Pipe):
def rehearse(self, examples, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
return
for eg in examples:
assert isinstance(eg, Example)
docs = [eg.predicted for eg in examples]
try:
docs = [eg.predicted for eg in examples]
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="TextCategorizer", method="rehearse", types=types))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
return
@ -917,7 +933,11 @@ class TextCategorizer(Pipe):
# TODO: begin_training is not guaranteed to see all data / labels ?
examples = list(get_examples())
for example in examples:
for cat in example.y.cats:
try:
y = example.y
except AttributeError:
raise ValueError(Errors.E978.format(name="TextCategorizer", method="update", types=type(example)))
for cat in y.cats:
self.add_label(cat)
self.require_labels()
docs = [Doc(Vocab(), words=["hello"])]
@ -1074,10 +1094,12 @@ class EntityLinker(Pipe):
losses.setdefault(self.name, 0.0)
if not examples:
return 0
for eg in examples:
assert isinstance(eg, Example)
sentence_docs = []
docs = [eg.predicted for eg in examples]
try:
docs = [eg.predicted for eg in examples]
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="EntityLinker", method="update", types=types))
if set_annotations:
# This seems simpler than other ways to get that exact output -- but
# it does run the model twice :(

View File

@ -268,7 +268,11 @@ cdef class Parser:
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)
set_dropout_rate(self.model, drop)
states, golds, max_steps = self._init_gold_batch_no_cut(examples)
try:
states, golds, max_steps = self._init_gold_batch_no_cut(examples)
except AttributeError:
types = set([type(eg) for eg in examples])
raise ValueError(Errors.E978.format(name="Parser", method="update", types=types))
states_golds = [(s, g) for (s, g) in zip(states, golds)
if not s.is_final() and g is not None]
# Prepare the stepwise model, and get the callback for finishing the batch

View File

@ -3,6 +3,7 @@ from thinc.api import Adam, NumpyOps
from spacy.attrs import NORM
from spacy.vocab import Vocab
from spacy.gold import Example
from spacy.pipeline.defaults import default_parser, default_ner
from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer
@ -38,7 +39,8 @@ def _train_parser(parser):
"heads": [1, 1, 3, 3],
"deps": ["left", "ROOT", "left", "ROOT"]
}
parser.update((doc, gold), sgd=sgd, losses=losses)
example = Example.from_dict(doc, gold)
parser.update([example], sgd=sgd, losses=losses)
return parser