From cc10a27c59a3e5fe3c2d08667534fcbf22908f06 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 12 Sep 2022 15:36:48 +0200
Subject: [PATCH] Prevent tok2vec to broadcast to listeners when predicting
 (#11385)

* replicate bug with tok2vec in annotating components

* add overfitting test with a frozen tok2vec

* remove broadcast from predict and check doc.tensor instead

* remove broadcast

* proper error

* slight rephrase of documentation
---
 spacy/errors.py                      |  2 +
 spacy/pipeline/tok2vec.py            | 20 ++++++--
 spacy/tests/pipeline/test_tok2vec.py | 81 ++++++++++++++++++++++++++++
 website/docs/usage/training.md       |  2 +-
 4 files changed, 98 insertions(+), 7 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index e2201284f..7e63dc76c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -538,6 +538,8 @@ class Errors(metaclass=ErrorsWithCodes):
     E199 = ("Unable to merge 0-length span at `doc[{start}:{end}]`.")
     E200 = ("Can't set {attr} from Span.")
     E202 = ("Unsupported {name} mode '{mode}'. Supported modes: {modes}.")
+    E203 = ("If the {name} embedding layer is not updated "
+            "during training, make sure to include it in 'annotating components'")
 
     # New errors added in v3.x
     E853 = ("Unsupported component factory name '{name}'. The character '.' is "
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 2e3dde3cb..c742aaeaa 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -123,9 +123,6 @@ class Tok2Vec(TrainablePipe):
             width = self.model.get_dim("nO")
             return [self.model.ops.alloc((0, width)) for doc in docs]
         tokvecs = self.model.predict(docs)
-        batch_id = Tok2VecListener.get_batch_id(docs)
-        for listener in self.listeners:
-            listener.receive(batch_id, tokvecs, _empty_backprop)
         return tokvecs
 
     def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
@@ -286,8 +283,19 @@ class Tok2VecListener(Model):
 def forward(model: Tok2VecListener, inputs, is_train: bool):
     """Supply the outputs from the upstream Tok2Vec component."""
     if is_train:
-        model.verify_inputs(inputs)
-        return model._outputs, model._backprop
+        # This might occur during training when the tok2vec layer is frozen / hasn't been updated.
+        # In that case, it should be set to "annotating" so we can retrieve the embeddings from the doc.
+        if model._batch_id is None:
+            outputs = []
+            for doc in inputs:
+                if doc.tensor.size == 0:
+                    raise ValueError(Errors.E203.format(name="tok2vec"))
+                else:
+                    outputs.append(doc.tensor)
+            return outputs, _empty_backprop
+        else:
+            model.verify_inputs(inputs)
+            return model._outputs, model._backprop
     else:
         # This is pretty grim, but it's hard to do better :(.
         # It's hard to avoid relying on the doc.tensor attribute, because the
@@ -306,7 +314,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool):
                 outputs.append(model.ops.alloc2f(len(doc), width))
             else:
                 outputs.append(doc.tensor)
-    return outputs, lambda dX: []
+    return outputs, _empty_backprop
 
 
 def _empty_backprop(dX):  # for pickling
diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py
index 64faf133d..659274db9 100644
--- a/spacy/tests/pipeline/test_tok2vec.py
+++ b/spacy/tests/pipeline/test_tok2vec.py
@@ -230,6 +230,87 @@ def test_tok2vec_listener_callback():
     assert get_dX(Y) is not None
 
 
+def test_tok2vec_listener_overfitting():
+    """ Test that a pipeline with a listener properly overfits, even if 'tok2vec' is in the annotating components """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, annotates=["tok2vec"])
+    assert losses["tagger"] < 0.00001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
+def test_tok2vec_frozen_not_annotating():
+    """ Test that a pipeline with a frozen tok2vec raises an error when the tok2vec is not annotating """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(2):
+        losses = {}
+        with pytest.raises(ValueError, match=r"the tok2vec embedding layer is not updated"):
+            nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"])
+
+
+def test_tok2vec_frozen_overfitting():
+    """ Test that a pipeline with a frozen & annotating tok2vec can still overfit """
+    orig_config = Config().from_str(cfg_string)
+    nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    for i in range(100):
+        losses = {}
+        nlp.update(train_examples, sgd=optimizer, losses=losses, exclude=["tok2vec"], annotates=["tok2vec"])
+    assert losses["tagger"] < 0.0001
+
+    # test the trained model
+    test_text = "I like blue eggs"
+    doc = nlp(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+    # Also test the results are still the same after IO
+    with make_tempdir() as tmp_dir:
+        nlp.to_disk(tmp_dir)
+        nlp2 = util.load_model_from_path(tmp_dir)
+        doc2 = nlp2(test_text)
+        assert doc2[0].tag_ == "N"
+        assert doc2[1].tag_ == "V"
+        assert doc2[2].tag_ == "J"
+        assert doc2[3].tag_ == "N"
+
+
 def test_replace_listeners():
     orig_config = Config().from_str(cfg_string)
     nlp = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 5e064b269..27a8bbca7 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -480,7 +480,7 @@ as-is. They are also excluded when calling
 > parse. So the evaluation results should always reflect what your pipeline will
 > produce at runtime. If you want a frozen component to run (without updating)
 > during training as well, so that downstream components can use its
-> **predictions**, you can add it to the list of
+> **predictions**, you should add it to the list of
 > [`annotating_components`](/usage/training#annotating-components).
 
 ```ini
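
As background for the training.md change above: when training from a config file rather than calling `nlp.update` directly (as the new tests do with `exclude` and `annotates`), the equivalent setup is to list the frozen component under both `frozen_components` and `annotating_components` in the `[training]` block. A minimal, illustrative excerpt follows; the component names are assumed and all other config sections are omitted:

```ini
[nlp]
# assumed pipeline: a shared tok2vec feeding a tagger through a listener
pipeline = ["tok2vec","tagger"]

[training]
# skip updating the tok2vec weights during training ...
frozen_components = ["tok2vec"]
# ... but still run it each step so downstream listeners can read doc.tensor;
# without this, the new E203 error is raised for the frozen tok2vec
annotating_components = ["tok2vec"]
```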