mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Prevent tok2vec from broadcasting to listeners when predicting (#11385)
* replicate bug with tok2vec in annotating components
* add overfitting test with a frozen tok2vec
* remove broadcast from predict and check doc.tensor instead
* remove broadcast
* proper error
* slight rephrase of documentation
This commit is contained in:
parent
1f23c615d7
commit
cc10a27c59
|
@ -538,6 +538,8 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
|
||||||
E200 = ("Can't set {attr} from Span.")
|
E200 = ("Can't set {attr} from Span.")
|
||||||
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
E203 = ("If the {name} embedding layer is not updated "
|
||||||
|
"during training, make sure to include it in 'annotating components'")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
|
||||||
|
|
|
@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
|
||||||
width = self.model.get_dim("nO")
|
width = self.model.get_dim("nO")
|
||||||
return [self.model.ops.alloc((0, width)) for doc in docs]
|
return [self.model.ops.alloc((0, width)) for doc in docs]
|
||||||
tokvecs = self.model.predict(docs)
|
tokvecs = self.model.predict(docs)
|
||||||
batch_id = Tok2VecListener.get_batch_id(docs)
|
|
||||||
for listener in self.listeners:
|
|
||||||
listener.receive(batch_id, tokvecs, _empty_backprop)
|
|
||||||
return tokvecs
|
return tokvecs
|
||||||
|
|
||||||
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
|
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
|
||||||
|
@ -286,8 +283,19 @@ class Tok2VecListener(Model):
|
||||||
def forward(model: Tok2VecListener, inputs, is_train: bool):
|
def forward(model: Tok2VecListener, inputs, is_train: bool):
|
||||||
"""Supply the outputs from the upstream Tok2Vec component."""
|
"""Supply the outputs from the upstream Tok2Vec component."""
|
||||||
if is_train:
|
if is_train:
|
||||||
model.verify_inputs(inputs)
|
# This might occur during training when the tok2vec layer is frozen / hasn't been updated.
|
||||||
return model._outputs, model._backprop
|
# In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
|
||||||
|
if model._batch_id is None:
|
||||||
|
outputs = []
|
||||||
|
for doc in inputs:
|
||||||
|
if doc.tensor.size == 0:
|
||||||
|
raise ValueError(Errors.E203.format(name="tok2vec"))
|
||||||
|
else:
|
||||||
|
outputs.append(doc.tensor)
|
||||||
|
return outputs, _empty_backprop
|
||||||
|
else:
|
||||||
|
model.verify_inputs(inputs)
|
||||||
|
return model._outputs, model._backprop
|
||||||
else:
|
else:
|
||||||
# This is pretty grim, but it's hard to do better :(.
|
# This is pretty grim, but it's hard to do better :(.
|
||||||
# It's hard to avoid relying on the doc.tensor attribute, because the
|
# It's hard to avoid relying on the doc.tensor attribute, because the
|
||||||
|
@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
|
||||||
outputs.append(model.ops.alloc2f(len(doc), width))
|
outputs.append(model.ops.alloc2f(len(doc), width))
|
||||||
else:
|
else:
|
||||||
outputs.append(doc.tensor)
|
outputs.append(doc.tensor)
|
||||||
return outputs, lambda dX: []
|
return outputs, _empty_backprop
|
||||||
|
|
||||||
|
|
||||||
def _empty_backprop(dX): # for pickling
|
def _empty_backprop(dX): # for pickling
|
||||||
|
|
|
@ -230,6 +230,87 @@ def test_tok2vec_listener_callback():
|
||||||
assert get_dX(Y) is not None
|
assert get_dX(Y) is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_tok2vec_listener_overfitting():
    """Check that a listener pipeline overfits with 'tok2vec' among the annotating components.

    The pipeline is built from ``cfg_string``, updated 50 times on
    ``TRAIN_DATA`` with ``annotates=["tok2vec"]``, and must then reproduce the
    expected tags both directly and after a save/load round trip.
    """
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for _ in range(50):
        losses = {}
        nlp.update(
            train_examples,
            sgd=optimizer,
            losses=losses,
            annotates=["tok2vec"],
        )
    # Only the loss of the final update step is checked.
    assert losses["tagger"] < 0.00001

    # The trained pipeline must predict the expected tags.
    test_text = "I like blue eggs"
    expected_tags = ["N", "V", "J", "N"]
    doc = nlp(test_text)
    for position, tag in enumerate(expected_tags):
        assert doc[position].tag_ == tag

    # Predictions must be unchanged after serializing to disk and reloading.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded = util.load_model_from_path(tmp_dir)
        doc2 = reloaded(test_text)
        for position, tag in enumerate(expected_tags):
            assert doc2[position].tag_ == tag
|
||||||
|
|
||||||
|
|
||||||
|
def test_tok2vec_frozen_not_annotating():
    """Check that freezing 'tok2vec' without annotating it makes the update fail.

    Each update call excludes "tok2vec" and does not list it in ``annotates``;
    it is expected to raise a ValueError whose message matches the E203 text.
    """
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    expected_message = r"the tok2vec embedding layer is not updated"
    for _ in range(2):
        losses = {}
        with pytest.raises(ValueError, match=expected_message):
            nlp.update(
                train_examples,
                sgd=optimizer,
                losses=losses,
                exclude=["tok2vec"],
            )
|
||||||
|
|
||||||
|
|
||||||
|
def test_tok2vec_frozen_overfitting():
    """Check that a pipeline with a frozen but annotating 'tok2vec' can still overfit.

    The pipeline is updated 100 times with "tok2vec" both excluded from the
    update and listed in ``annotates``. It must then reproduce the expected
    tags both directly and after a save/load round trip.
    """
    config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=True)
    train_examples = [
        Example.from_dict(nlp.make_doc(item[0]), item[1]) for item in TRAIN_DATA
    ]
    optimizer = nlp.initialize(get_examples=lambda: train_examples)

    for _ in range(100):
        losses = {}
        nlp.update(
            train_examples,
            sgd=optimizer,
            losses=losses,
            exclude=["tok2vec"],
            annotates=["tok2vec"],
        )
    # Only the loss of the final update step is checked.
    assert losses["tagger"] < 0.0001

    # The trained pipeline must predict the expected tags.
    test_text = "I like blue eggs"
    expected_tags = ["N", "V", "J", "N"]
    doc = nlp(test_text)
    for position, tag in enumerate(expected_tags):
        assert doc[position].tag_ == tag

    # Predictions must be unchanged after serializing to disk and reloading.
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        reloaded = util.load_model_from_path(tmp_dir)
        doc2 = reloaded(test_text)
        for position, tag in enumerate(expected_tags):
            assert doc2[position].tag_ == tag
|
||||||
|
|
||||||
|
|
||||||
def test_replace_listeners():
|
def test_replace_listeners():
|
||||||
orig_config = Config().from_str(cfg_string)
|
orig_config = Config().from_str(cfg_string)
|
||||||
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
|
||||||
|
|
|
@ -480,7 +480,7 @@ as-is. They are also excluded when calling
|
||||||
> parse. So the evaluation results should always reflect what your pipeline will
|
> parse. So the evaluation results should always reflect what your pipeline will
|
||||||
> produce at runtime. If you want a frozen component to run (without updating)
|
> produce at runtime. If you want a frozen component to run (without updating)
|
||||||
> during training as well, so that downstream components can use its
|
> during training as well, so that downstream components can use its
|
||||||
> **predictions**, you can add it to the list of
|
> **predictions**, you should add it to the list of
|
||||||
> [`annotating_components`](/usage/training#annotating-components).
|
> [`annotating_components`](/usage/training#annotating-components).
|
||||||
|
|
||||||
```ini
|
```ini
|
||||||
|
|
Loading…
Reference in New Issue
Block a user