Prevent tok2vec from broadcasting to listeners when predicting (#11385)
* replicate bug with tok2vec in annotating components
* add overfitting test with a frozen tok2vec
* remove broadcast from predict and check doc.tensor instead
* remove broadcast
* proper error
* slight rephrase of documentation
parent 1f23c615d7
commit cc10a27c59
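In short: a tok2vec component that is frozen during training must also be listed as an annotating component, so that listeners can read its output from `doc.tensor`. A condensed sketch of the training call, following the tests added in this commit (`nlp`, `train_examples`, and `optimizer` are assumed to be set up as in those tests):

```python
losses = {}
# Frozen but annotating: tok2vec's weights stay fixed, yet the component
# still runs during nlp.update() and stores its output on doc.tensor,
# where downstream listeners can find it.
nlp.update(
    train_examples,
    sgd=optimizer,
    losses=losses,
    exclude=["tok2vec"],    # don't update tok2vec's weights
    annotates=["tok2vec"],  # but do run it and set doc.tensor
)
```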
@@ -538,6 +538,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
     E200 = ("Can't set {attr} from Span.")
     E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+    E203 = ("If the {name} embedding layer is not updated "
+            "during training, make sure to include it in 'annotating components'")

     # New errors added in v3.x
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
@@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
             width = self.model.get_dim("nO")
             return [self.model.ops.alloc((0, width)) for doc in docs]
         tokvecs = self.model.predict(docs)
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners:
-            listener.receive(batch_id, tokvecs, _empty_backprop)
         return tokvecs

     def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
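With the broadcast removed, `predict` no longer pushes tensors to listeners eagerly; listeners instead read whatever `set_annotations` stored on `doc.tensor`. A simplified sketch of the resulting inference-time flow (an illustration, not the actual spaCy internals):

```python
# Simplified sketch: after this change, the only way predictions reach
# downstream listeners at inference time is via doc.tensor, which
# set_annotations() fills in.
def run_tok2vec(tok2vec, docs):
    tokvecs = tok2vec.predict(docs)         # no listener.receive() broadcast
    tok2vec.set_annotations(docs, tokvecs)  # writes each tensor to doc.tensor
    return docs
```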
@@ -286,6 +283,17 @@ class Tok2VecListener(Model):
 def forward(model: Tok2VecListener, inputs, is_train: bool):
     """Supply the outputs from the upstream Tok2Vec component."""
     if is_train:
-        model.verify_inputs(inputs)
-        return model._outputs, model._backprop
+        # This might occur during training when the tok2vec layer is frozen / hasn't been updated.
+        # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
+        if model._batch_id is None:
+            outputs = []
+            for doc in inputs:
+                if doc.tensor.size == 0:
+                    raise ValueError(Errors.E203.format(name="tok2vec"))
+                else:
+                    outputs.append(doc.tensor)
+            return outputs, _empty_backprop
+        else:
+            model.verify_inputs(inputs)
+            return model._outputs, model._backprop
     else:
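The `model._batch_id is None` branch covers the case where no batch was broadcast during `nlp.update`, i.e. the upstream tok2vec was frozen. If it was also not annotating, `doc.tensor` is empty and E203 is raised, as the new tests below demonstrate. Roughly (assumed setup: `nlp` contains a tok2vec plus a tagger with a `Tok2VecListener`):

```python
# Frozen and NOT annotating: the listener finds an empty doc.tensor.
nlp.update(train_examples, sgd=optimizer, losses={}, exclude=["tok2vec"])
# -> ValueError: [E203] If the tok2vec embedding layer is not updated during
#    training, make sure to include it in 'annotating components'
```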
@@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
                 outputs.append(model.ops.alloc2f(len(doc), width))
             else:
                 outputs.append(doc.tensor)
-        return outputs, lambda dX: []
+        return outputs, _empty_backprop


 def _empty_backprop(dX):  # for pickling
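Returning the module-level `_empty_backprop` instead of `lambda dX: []` keeps the listener picklable, since pickle serializes functions by their importable qualified name. A standalone illustration (plain Python, not spaCy code):

```python
import pickle

def _empty_backprop(dX):  # module-level: pickled by reference, so this works
    return []

pickle.dumps(_empty_backprop)  # fine
try:
    pickle.dumps(lambda dX: [])  # lambdas have no importable qualified name
except (pickle.PicklingError, AttributeError) as err:
    print(err)
```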
@@ -230,6 +230,87 @@ def test_tok2vec_listener_callback():
     assert get_dX(Y) is not None


+def test_tok2vec_listener_overfitting():
+    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
+    assert losses["tagger"] < 0.00001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
+def test_tok2vec_frozen_not_annotating():
+    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(2):
+        losses = {}
+        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
+            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+
+
+def test_tok2vec_frozen_overfitting():
+    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(100):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+    assert losses["tagger"] < 0.0001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
 def test_replace_listeners():
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
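For context, `cfg_string` and `TRAIN_DATA` are defined earlier in this test file: `cfg_string` builds a tok2vec + tagger pipeline in which the tagger embeds a `Tok2VecListener`, and `TRAIN_DATA` is a tiny tagging set along these lines (approximate reproduction, not part of this diff):

```python
# As defined earlier in the test file (approximate):
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
```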
@@ -480,7 +480,7 @@ as-is. They are also excluded when calling
 > parse. So the evaluation results should always reflect what your pipeline will
 > produce at runtime. If you want a frozen component to run (without updating)
 > during training as well, so that downstream components can use its
-> **predictions**, you can add it to the list of
+> **predictions**, you should add it to the list of
 > [`annotating_components`](/usage/training#annotating-components).

 ```ini
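The docs section continues with a config example at that ```ini fence. For illustration, a `[training]` block in this spirit (a sketch using the standard `frozen_components` and `annotating_components` settings, not the exact example from the docs) looks like:

```ini
[training]
# Weights of these components are not updated during training:
frozen_components = ["tok2vec"]
# But they still run on each batch, so listeners can read doc.tensor:
annotating_components = ["tok2vec"]
```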