Mirror of https://github.com/explosion/spaCy.git
student: do not request use of optimizer in student pipe

We finish up the updates once in the training loop instead. Also add the necessary logic to `Language.distill` to mirror `Language.update`.
Parent: b6544f50ec
Commit: add1a21657
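The net effect: inside `Language.distill`, each student pipe only accumulates gradients, and the weight update is applied once per component after all pipes have run. A minimal sketch of the calling pattern this enables, with hypothetical `teacher`, `student`, and `batches` placeholders that are not part of this commit:

```python
from thinc.api import Adam

# Hypothetical driver code; `teacher` and `student` (two Language objects)
# and `batches` (an iterable of Example batches) are assumed placeholders.
def distill_epoch(teacher, student, batches):
    optimizer = Adam(0.001)
    losses = {}
    for batch in batches:
        # sgd=False: accumulate gradients in the student pipes, but do not
        # touch any weights inside the call.
        student.distill(teacher, batch, losses=losses, sgd=False)
        # Apply the accumulated gradients exactly once per component, so
        # that shared weights (e.g. a shared tok2vec) are only updated
        # after every component has contributed its gradient.
        for name, proc in student.pipeline:
            if getattr(proc, "is_distillable", False):
                proc.finish_update(optimizer)
    return losses
```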
In `Language.distill` (spacy/language.py):

@@ -1024,7 +1024,7 @@ class Language:
         examples: Iterable[Example],
         *,
         drop: float = 0.0,
-        sgd: Optional[Optimizer] = None,
+        sgd: Union[Optimizer, None, Literal[False]] = None,
         losses: Optional[Dict[str, float]] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         exclude: Iterable[str] = SimpleFrozenList(),
@@ -1037,7 +1037,9 @@ class Language:
         (teacher) and predicted (student) docs must have the same number of
         tokens and the same orthography.
         drop (float): The dropout rate.
-        sgd (Optional[Optimizer]): An optimizer.
+        sgd (Union[Optimizer, None, Literal[False]]): An optimizer. Will
+            be created via create_optimizer if 'None'. No optimizer will
+            be used when set to 'False'.
         losses (Optional(Dict[str, float])): Dictionary to update with the loss,
             keyed by component.
         component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
@@ -1107,11 +1109,23 @@ class Language:
                 student_proc.distill(
                     teacher_pipe,
                     examples,
-                    sgd=sgd,
+                    sgd=None,
                     losses=losses,
                     **component_cfg[student_name],
                 )
 
+        # Only finish the update after all component updates are done. Some
+        # components may share weights (such as tok2vec) and we only want
+        # to apply weight updates after all gradients are accumulated.
+        for student_name, student_proc in self.pipeline:
+            if (
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and sgd not in (None, False)
+            ):
+                student_proc.finish_update(sgd)
+
         return losses
 
     def disable_pipes(self, *names) -> "DisabledPipes":
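For reference, the three accepted values of `sgd` under the new signature, as a hypothetical usage sketch (`teacher_nlp`, `student_nlp`, and `examples` are placeholders, not from the commit):

```python
from thinc.api import Adam

optimizer = Adam(0.001)
losses = {}

# 1. An Optimizer instance: weight updates are finished with this
#    optimizer once all components have accumulated their gradients.
student_nlp.distill(teacher_nlp, examples, sgd=optimizer, losses=losses)

# 2. None (the default): an optimizer is created via create_optimizer.
student_nlp.distill(teacher_nlp, examples, sgd=None, losses=losses)

# 3. False: gradients are accumulated, but no weights are updated; the
#    caller must invoke finish_update() on the pipes itself (this is
#    what the training loop below does).
student_nlp.distill(teacher_nlp, examples, sgd=False, losses=losses)
```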
In the training loop module (imports and `_distill_loop`):

@@ -8,10 +8,12 @@ import random
 import sys
 import shutil
 
 from .example import Example
 from ..schemas import ConfigSchemaDistill, ConfigSchemaTraining
 from ..errors import Errors
 from ..tokens.doc import Doc
+from .. import ty
 from ..util import resolve_dot_names, registry, logger
 
 if TYPE_CHECKING:
@@ -340,20 +342,20 @@ def _distill_loop(
                 subbatch,
                 drop=dropout,
                 losses=losses,
-                sgd=None,
+                sgd=False,
                 exclude=exclude,
                 annotates=annotating_components,
                 student_to_teacher=student_to_teacher,
             )
         # TODO: refactor this so we don't have to run it separately in here
-        for name, proc in student.pipeline:
+        for student_name, student_proc in student.pipeline:
             if (
-                name not in exclude
-                and hasattr(proc, "is_trainable")
-                and proc.is_trainable
-                and proc.model not in (True, False, None)  # type: ignore[attr-defined]
+                student_name not in exclude
+                and isinstance(student_proc, ty.DistillableComponent)
+                and student_proc.is_distillable
+                and student_proc.model not in (False, None)  # type: ignore[attr-defined]
             ):
-                proc.finish_update(optimizer)  # type: ignore[attr-defined]
+                student_proc.finish_update(optimizer)  # type: ignore[attr-defined]
         optimizer.step_schedules()
         if not (step % eval_frequency):
             if optimizer.averages:
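The old `hasattr`/`is_trainable` duck-typing check becomes an `isinstance` check against `ty.DistillableComponent`, which implies a runtime-checkable protocol. A minimal sketch of how such a check behaves; the member names mirror the diff, but the actual protocol in spaCy's `ty` module may differ:

```python
from typing import Iterable, Protocol, runtime_checkable

@runtime_checkable
class DistillableComponent(Protocol):
    """Sketch of a protocol in the spirit of ty.DistillableComponent."""

    is_distillable: bool

    def distill(self, teacher_pipe, examples: Iterable, **kwargs): ...

    def finish_update(self, sgd) -> None: ...

class MyPipe:
    is_distillable = True

    def distill(self, teacher_pipe, examples, **kwargs):
        pass

    def finish_update(self, sgd):
        pass

# isinstance() only verifies that all protocol members are present; it
# does not check signatures or attribute values.
assert isinstance(MyPipe(), DistillableComponent)
assert not isinstance(object(), DistillableComponent)
```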