mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-23 07:44:12 +03:00
Prevent tok2vec to broadcast to listeners when predicting (#11385)
* replicate bug with tok2vec in annotating components * add overfitting test with a frozen tok2vec * remove broadcast from predict and check doc.tensor instead * remove broadcast * proper error * slight rephrase of documentation
This commit is contained in:
parent
1f23c615d7
commit
cc10a27c59
|
@ -538,6 +538,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
|||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||
E200 = ("Can't set {attr} from Span.")
|
||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||
E203 = ("If the {name} embedding layer is not updated "
|
||||
"during training, make sure to include it in 'annotating components'")
|
||||
|
||||
# New errors added in v3.x
|
||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||
|
|
|
@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
|
|||
width = self.model.get_dim("nO")
|
||||
return [self.model.ops.alloc((0, width)) for doc in docs]
|
||||
tokvecs = self.model.predict(docs)
|
||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
||||
for listener in self.listeners:
|
||||
listener.receive(batch_id, tokvecs, _empty_backprop)
|
||||
return tokvecs
|
||||
|
||||
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
|
||||
|
@ -286,8 +283,19 @@ class Tok2VecListener(Model):
|
|||
def forward(model: Tok2VecListener, inputs, is_train: bool):
|
||||
"""Supply the outputs from the upstream Tok2Vec component."""
|
||||
if is_train:
|
||||
model.verify_inputs(inputs)
|
||||
return model._outputs, model._backprop
|
||||
# This might occur during training when the tok2vec layer is frozen / hasn't been updated.
|
||||
# In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
|
||||
if model._batch_id is None:
|
||||
outputs = []
|
||||
for doc in inputs:
|
||||
if doc.tensor.size == 0:
|
||||
raise ValueError(Errors.E203.format(name="tok2vec"))
|
||||
else:
|
||||
outputs.append(doc.tensor)
|
||||
return outputs, _empty_backprop
|
||||
else:
|
||||
model.verify_inputs(inputs)
|
||||
return model._outputs, model._backprop
|
||||
else:
|
||||
# This is pretty grim, but it's hard to do better :(.
|
||||
# It's hard to avoid relying on the doc.tensor attribute, because the
|
||||
|
@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
|
|||
outputs.append(model.ops.alloc2f(len(doc), width))
|
||||
else:
|
||||
outputs.append(doc.tensor)
|
||||
return outputs, lambda dX: []
|
||||
return outputs, _empty_backprop
|
||||
|
||||
|
||||
def _empty_backprop(dX): # for pickling
|
||||
|
|
|
@ -230,6 +230,87 @@ def test_tok2vec_listener_callback():
|
|||
assert get_dX(Y) is not None
|
||||
|
||||
|
||||
def test_tok2vec_listener_overfitting():
|
||||
""" Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
|
||||
assert losses["tagger"] < 0.00001
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like blue eggs"
|
||||
doc = nlp(test_text)
|
||||
assert doc[0].tag_ == "N"
|
||||
assert doc[1].tag_ == "V"
|
||||
assert doc[2].tag_ == "J"
|
||||
assert doc[3].tag_ == "N"
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
assert doc2[0].tag_ == "N"
|
||||
assert doc2[1].tag_ == "V"
|
||||
assert doc2[2].tag_ == "J"
|
||||
assert doc2[3].tag_ == "N"
|
||||
|
||||
|
||||
def test_tok2vec_frozen_not_annotating():
|
||||
""" Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(2):
|
||||
losses = {}
|
||||
with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
|
||||
|
||||
|
||||
def test_tok2vec_frozen_overfitting():
|
||||
""" Test that a pipeline with a frozen & annotating tok2vec can still overfit """
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
train_examples = []
|
||||
for t in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
optimizer = nlp.initialize(get_examples=lambda: train_examples)
|
||||
|
||||
for i in range(100):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
|
||||
assert losses["tagger"] < 0.0001
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like blue eggs"
|
||||
doc = nlp(test_text)
|
||||
assert doc[0].tag_ == "N"
|
||||
assert doc[1].tag_ == "V"
|
||||
assert doc[2].tag_ == "J"
|
||||
assert doc[3].tag_ == "N"
|
||||
|
||||
# Also test the results are still the same after IO
|
||||
with make_tempdir() as tmp_dir:
|
||||
nlp.to_disk(tmp_dir)
|
||||
nlp2 = util.load_model_from_path(tmp_dir)
|
||||
doc2 = nlp2(test_text)
|
||||
assert doc2[0].tag_ == "N"
|
||||
assert doc2[1].tag_ == "V"
|
||||
assert doc2[2].tag_ == "J"
|
||||
assert doc2[3].tag_ == "N"
|
||||
|
||||
|
||||
def test_replace_listeners():
|
||||
orig_config = Config().from_str(cfg_string)
|
||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||
|
|
|
@ -480,7 +480,7 @@ as-is. They are also excluded when calling
|
|||
> parse. So the evaluation results should always reflect what your pipeline will
|
||||
> produce at runtime. If you want a frozen component to run (without updating)
|
||||
> during training as well, so that downstream components can use its
|
||||
> **predictions**, you can add it to the list of
|
||||
> **predictions**, you should add it to the list of
|
||||
> [`annotating_components`](/usage/training#annotating-components).
|
||||
|
||||
```ini
|
||||
|
|
Loading…
Reference in New Issue
Block a user