Prevent tok2vec to broadcast to listeners when predicting (#11385)

* replicate bug with tok2vec in annotating components * add overfitting test with a frozen tok2vec * remove broadcast from predict and check doc.tensor instead * remove broadcast * proper error * slight rephrase of documentation
2025-09-09 13:49:41 +03:00 · 2022-09-12 15:36:48 +02:00 · 2022-09-12 15:36:48 +02:00 · cc10a27c59
commit cc10a27c59
parent 1f23c615d7
4 changed files with 98 additions and 7 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -538,6 +538,8 @@ class Errors(metaclass=ErrorsWithCodes):
    E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
    E200 = ("Can't set {attr} from Span.")
    E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+    E203 = ("If the {name} embedding layer is not updated "
+            "during training, make sure to include it in 'annotating components'")

    # New errors added in v3.x
    E853 = ("Unsupported component factory name '{name}'. The character '.' is "
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
            width = self.model.get_dim("nO")
            return [self.model.ops.alloc((0, width)) for doc in docs]
        tokvecs = self.model.predict(docs)
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners:
-            listener.receive(batch_id, tokvecs, _empty_backprop)
        return tokvecs

    def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
@ -286,6 +283,17 @@ class Tok2VecListener(Model):
 def forward(model: Tok2VecListener, inputs, is_train: bool):
    """Supply the outputs from the upstream Tok2Vec component."""
    if is_train:
+        # This might occur during training when the tok2vec layer is frozen / hasn't been updated.
+        # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
+        if model._batch_id is None:
+            outputs = []
+            for doc in inputs:
+                if doc.tensor.size == 0:
+                    raise ValueError(Errors.E203.format(name="tok2vec"))
+                else:
+                    outputs.append(doc.tensor)
+            return outputs, _empty_backprop
+        else:
            model.verify_inputs(inputs)
            return model._outputs, model._backprop
    else:
@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
                outputs.append(model.ops.alloc2f(len(doc), width))
            else:
                outputs.append(doc.tensor)
-        return outputs, lambda dX: []
+        return outputs, _empty_backprop


 def _empty_backprop(dX):  # for pickling
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@ -230,6 +230,87 @@ def test_tok2vec_listener_callback():
    assert get_dX(Y) is not None


+def test_tok2vec_listener_overfitting():
+    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
+    assert losses["tagger"] < 0.00001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
+def test_tok2vec_frozen_not_annotating():
+    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(2):
+        losses = {}
+        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
+            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+
+
+def test_tok2vec_frozen_overfitting():
+    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(100):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+    assert losses["tagger"] < 0.0001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
 def test_replace_listeners():
    orig_config = Config().from_str(cfg_string)
    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -480,7 +480,7 @@ as-is. They are also excluded when calling
 > parse. So the evaluation results should always reflect what your pipeline will
 > produce at runtime. If you want a frozen component to run (without updating)
 > during training as well, so that downstream components can use its
-> **predictions**, you can add it to the list of
+> **predictions**, you should add it to the list of
 > [`annotating_components`](/usage/training#annotating-components).

 ```ini