Language.update: ensure that tok2vec gets updated (#12136)

* `Language.update`: ensure that tok2vec gets updated

The components in a pipeline can be updated independently. However,
tok2vec implementations are an exception to this, since they depend on
listeners for their gradients. The update method of a tok2vec
implementation computes the tok2vec forward pass and passes its output,
along with a backprop function, to the listeners. This backprop function
accumulates gradients from all the listeners. There are two ways in
which the accumulated gradients can be used to update the tok2vec
weights:

1. Call the `finish_update` method of tok2vec *after* the `update`
   method is called on all of the pipes that use a tok2vec listener.
2. Pass an optimizer to the `update` method of tok2vec. In this
   case, tok2vec will give the last listener a special backprop
   function that calls `finish_update` on the tok2vec (see the
   sketch after this list).
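
This contract can be sketched with a toy implementation (hypothetical,
simplified classes and numbers; not spaCy's actual code):

```python
# A toy sketch of the listener contract described above.
class Tok2Vec:
    def __init__(self, n_listeners):
        self.weight = 1.0   # stand-in for the model's parameters
        self.grad = 0.0     # gradients accumulated by listeners
        self.n_listeners = n_listeners

    def update(self, sgd=None):
        """Run the forward pass and hand a backprop callback to listeners."""
        seen = {"n": 0}

        def backprop(d_output):
            self.grad += d_output
            seen["n"] += 1
            # Strategy 2: with an optimizer, the callback given to the
            # last listener also finishes the update.
            if sgd is not None and seen["n"] == self.n_listeners:
                self.finish_update(sgd)

        return backprop

    def finish_update(self, sgd):
        """Apply all accumulated gradients in a single optimizer step."""
        self.weight = sgd(self.weight, self.grad)
        self.grad = 0.0


def sgd(weight, grad, lr=0.1):  # a bare-bones optimizer
    return weight - lr * grad


tok2vec = Tok2Vec(n_listeners=2)
backprop = tok2vec.update(sgd=None)  # no optimizer: caller must finish
backprop(0.5)                # listener 1 (e.g. a tagger) backpropagates
backprop(0.25)               # listener 2 (e.g. a parser) backpropagates
tok2vec.finish_update(sgd)   # strategy 1: finish after all pipe updates
print(tok2vec.weight)        # 0.925: one step over the summed gradients
```

With plain SGD both strategies produce the same step; what matters is
that `finish_update` runs only once, after every listener has
contributed its gradients.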

Unfortunately, `Language.update` did neither of these. Instead, it
immediately called `finish_update` on every pipe after `update`. As a
result, the tok2vec weights were updated before any gradients had been
accumulated from the listeners, and the listeners' gradients were only
used in the next call to `Language.update` (when `finish_update` was
called on tok2vec again).

This change fixes this issue by passing the optimizer to the `update`
method of trainable pipes, leading to use of the second strategy
outlined above.

The main updating loop in `Language.update` is also simplified by using
the `TrainableComponent` protocol consistently.
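
For reference, a sketch of what the `TrainableComponent` protocol
requires, inferred from how it is used in this change (the exact
definition lives in `spacy/ty.py` and may differ in detail):

```python
from typing import Any, Dict, Iterable, Optional, Protocol, runtime_checkable

from spacy.training import Example
from thinc.api import Optimizer


@runtime_checkable  # Language.update checks it with isinstance()
class TrainableComponent(Protocol):
    model: Any
    is_trainable: bool

    def update(
        self,
        examples: Iterable[Example],
        *,
        drop: float = 0.0,
        sgd: Optional[Optimizer] = None,
        losses: Optional[Dict[str, float]] = None,
    ) -> Dict[str, float]:
        ...

    def finish_update(self, sgd: Optimizer) -> None:
        ...
```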

* Train loop: `sgd` is `Optional[Optimizer]`, do not pass `False`

* Language.update: call pipe finish_update after all pipe updates

This keeps updates correct and fast when multiple components update the
same parameters.
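
A runnable toy (illustrative numbers and classes, not spaCy code)
contrasting the old interleaved order with the new two-loop order:

```python
class SharedModel:
    """Stand-in for a tok2vec whose gradients come from listeners."""

    def __init__(self):
        self.weight, self.grad = 1.0, 0.0

    def finish_update(self, lr=0.1):
        self.weight -= lr * self.grad
        self.grad = 0.0


# Old order: tok2vec's finish_update ran right after its own update,
# before any downstream pipe (e.g. a tagger) had backpropagated.
old = SharedModel()
old.finish_update()  # grad is still 0.0, so this step is a no-op
old.grad += 0.6      # the tagger's gradients arrive too late: they are
                     # only applied on the *next* Language.update call

# New order: run every pipe's update first, then every finish_update.
new = SharedModel()
new.grad += 0.6      # accumulated during the update loop
new.finish_update()  # applied within the same Language.update call

print(old.weight, new.weight)  # 1.0 (unchanged) vs 0.94
```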

* Add a comment explaining why `finish_update` was moved to a separate loop
Daniël de Kok, 2023-02-03 15:22:25 +01:00, committed by GitHub
commit eec5ccd72f (parent c47ec5b5c6)
4 changed files with 95 additions and 15 deletions

spacy/language.py

@@ -1248,17 +1248,12 @@ class Language:
             component_cfg[name].setdefault("drop", drop)
             pipe_kwargs[name].setdefault("batch_size", self.batch_size)
         for name, proc in self.pipeline:
-            # ignore statements are used here because mypy ignores hasattr
-            if name not in exclude and hasattr(proc, "update"):
-                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])  # type: ignore
-            if sgd not in (None, False):
-                if (
-                    name not in exclude
-                    and isinstance(proc, ty.TrainableComponent)
-                    and proc.is_trainable
-                    and proc.model not in (True, False, None)
-                ):
-                    proc.finish_update(sgd)
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
             if name in annotates:
                 for doc, eg in zip(
                     _pipe(
@@ -1271,6 +1266,17 @@ class Language:
                     examples,
                 ):
                     eg.predicted = doc
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for name, proc in self.pipeline:
+            if (
+                name not in exclude
+                and isinstance(proc, ty.TrainableComponent)
+                and proc.is_trainable
+            ):
+                proc.finish_update(sgd)
+
         return losses

     def rehearse(


@@ -54,9 +54,11 @@ def test_annotates_on_update():
         return AssertSents(name)

     class AssertSents:
+        model = None
+        is_trainable = True
+
         def __init__(self, name, **cfg):
             self.name = name
-            pass

         def __call__(self, doc):
             if not doc.has_annotation("SENT_START"):
@@ -64,10 +66,16 @@ def test_annotates_on_update():
             return doc

         def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            losses.setdefault(self.name, 0.0)
+
             for example in examples:
                 if not example.predicted.has_annotation("SENT_START"):
                     raise ValueError("No sents")
-            return {}
+
+            return losses
+
+        def finish_update(self, sgd=None):
+            pass

     nlp = English()
     nlp.add_pipe("sentencizer")

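The added stub members are exactly what lets `AssertSents` pass the
`isinstance(proc, ty.TrainableComponent)` check in the new update loop.
A quick sketch (assuming the protocol is declared runtime-checkable,
which that `isinstance` check implies):

```python
from spacy import ty


class Stub:
    """Duck-typed like the AssertSents test component above."""

    model = None
    is_trainable = True

    def update(self, examples, *, drop=0.0, sgd=None, losses=None):
        return losses

    def finish_update(self, sgd=None):
        pass


# Structural check only: no inheritance from ty.TrainableComponent needed.
print(isinstance(Stub(), ty.TrainableComponent))  # True
```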
spacy/tests/test_language.py

@@ -10,8 +10,9 @@ from spacy.training import Example
 from spacy.lang.en import English
 from spacy.lang.de import German
 from spacy.util import registry, ignore_error, raise_error, find_matching_language
+from spacy.util import load_model_from_config
 import spacy
-from thinc.api import CupyOps, NumpyOps, get_current_ops
+from thinc.api import Config, CupyOps, NumpyOps, get_array_module, get_current_ops

 from .util import add_vecs_to_vocab, assert_docs_equal
@@ -25,6 +26,51 @@ try:
 except ImportError:
     pass

+TAGGER_CFG_STRING = """
+[nlp]
+lang = "en"
+pipeline = ["tok2vec","tagger"]
+
+[components]
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v2"
+nO = null
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tok2vec.model.encode.width}
+rows = [2000, 1000, 1000, 1000]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+include_static_vectors = false
+
+[components.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 96
+depth = 4
+window_size = 1
+maxout_pieces = 3
+"""
+
+TAGGER_TRAIN_DATA = [
+    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
+    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
+]
+
 TAGGER_TRAIN_DATA = [
     ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
     ("Eat blue ham", {"tags": ["V", "J", "N"]}),
@@ -91,6 +137,26 @@ def test_language_update(nlp):
         example = Example.from_dict(doc, wrongkeyannots)


+def test_language_update_updates():
+    config = Config().from_str(TAGGER_CFG_STRING)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+
+    train_examples = []
+    for t in TAGGER_TRAIN_DATA:
+        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+    optimizer = nlp.initialize(get_examples=lambda: train_examples)
+
+    docs_before_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+    nlp.update(train_examples, sgd=optimizer)
+    docs_after_update = list(nlp.pipe([eg.predicted.copy() for eg in train_examples]))
+
+    xp = get_array_module(docs_after_update[0].tensor)
+    assert xp.any(
+        xp.not_equal(docs_before_update[0].tensor, docs_after_update[0].tensor)
+    )
+
+
 def test_language_evaluate(nlp):
     text = "hello world"
     annots = {"doc_annotation": {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}}

spacy/training/loop.py

@@ -210,7 +210,7 @@ def train_while_improving(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=False,  # type: ignore[arg-type]
+                sgd=None,
                 exclude=exclude,
                 annotates=annotating_components,
             )
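
As context for this one-line fix: `Language.update` declares `sgd` as
`Optional[Optimizer]`, so `False` was never a valid argument (hence the
old `# type: ignore[arg-type]`). A condensed sketch of the relevant
signature (abbreviated; the real method in `spacy/language.py` takes
more parameters):

```python
from typing import Dict, Iterable, Optional

from spacy.training import Example
from thinc.api import Optimizer


def update(
    examples: Iterable[Example],
    *,
    drop: float = 0.0,
    sgd: Optional[Optimizer] = None,  # None is valid, False never was
    losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
    ...
```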