mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
warn when frozen components break listener pattern (#6766)
* warn when frozen components break listener pattern
* few notes in the documentation
* update arg name
* formatting
* cleanup
* specify listeners return type
This commit is contained in:
parent 88acbfc050
commit 57640aa838
@@ -1496,8 +1496,7 @@ class Language:
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
                 for name2, proc2 in self.pipeline[i + 1 :]:
-                    if isinstance(getattr(proc2, "model", None), Model):
-                        proc1.find_listeners(proc2.model)
+                    proc1.find_listeners(proc2)
 
     @classmethod
     def from_config(
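Before this change, `Language` unwrapped `proc2.model` itself; now it hands the whole component to `find_listeners`, so the `Tok2Vec` side can record *which* component is listening. Below is a standalone sketch of that wiring loop, using toy stand-ins rather than spaCy's real classes (only the loop's logic mirrors the diff):

```python
# Toy stand-ins for thinc's Model.walk() and spaCy's Tok2VecListener.
class ToyListener:
    upstream_name = "*"  # wildcard: listen to any upstream Tok2Vec

class ToyModel:
    def walk(self):  # thinc Models expose walk() over their layer tree
        yield from [ToyListener()]

class ToyNER:
    name = "ner"
    model = ToyModel()

class ToyTok2Vec:
    name = "tok2vec"

    def __init__(self):
        self.listener_map = {}

    def find_listeners(self, component):
        # The model check now lives here, not in Language
        model = getattr(component, "model", None)
        if model is None:  # the real code checks isinstance(model, thinc.api.Model)
            return
        for node in model.walk():
            if isinstance(node, ToyListener) and node.upstream_name in ("*", self.name):
                self.listener_map.setdefault(component.name, []).append(node)

pipeline = [("tok2vec", ToyTok2Vec()), ("ner", ToyNER())]
# The loop from the diff: each earlier component inspects each later one
for i, (name1, proc1) in enumerate(pipeline):
    if hasattr(proc1, "find_listeners"):
        for name2, proc2 in pipeline[i + 1 :]:
            proc1.find_listeners(proc2)

print(pipeline[0][1].listener_map)  # {'ner': [<ToyListener object ...>]}
```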
@@ -62,28 +62,42 @@ class Tok2Vec(TrainablePipe):
         self.vocab = vocab
         self.model = model
         self.name = name
-        self.listeners = []
+        self.listener_map = {}
         self.cfg = {}
 
-    def add_listener(self, listener: "Tok2VecListener") -> None:
-        """Add a listener for a downstream component. Usually internals."""
-        self.listeners.append(listener)
+    @property
+    def listeners(self) -> List["Tok2VecListener"]:
+        """RETURNS (List[Tok2VecListener]): The listener models listening to this
+        component. Usually internals.
+        """
+        return [m for c in self.listening_components for m in self.listener_map[c]]
 
-    def find_listeners(self, model: Model) -> None:
-        """Walk over a model, looking for layers that are Tok2vecListener
-        subclasses that have an upstream_name that matches this component.
-        Listeners can also set their upstream_name attribute to the wildcard
-        string '*' to match any `Tok2Vec`.
+    @property
+    def listening_components(self) -> List[str]:
+        """RETURNS (List[str]): The downstream components listening to this
+        component. Usually internals.
+        """
+        return list(self.listener_map.keys())
+
+    def add_listener(self, listener: "Tok2VecListener", component_name: str) -> None:
+        """Add a listener for a downstream component. Usually internals."""
+        self.listener_map.setdefault(component_name, [])
+        self.listener_map[component_name].append(listener)
+
+    def find_listeners(self, component) -> None:
+        """Walk over a model of a processing component, looking for layers that
+        are Tok2vecListener subclasses that have an upstream_name that matches
+        this component. Listeners can also set their upstream_name attribute to
+        the wildcard string '*' to match any `Tok2Vec`.
 
         You're unlikely to ever need multiple `Tok2Vec` components, so it's
         fine to leave your listeners upstream_name on '*'.
         """
-        for node in model.walk():
-            if isinstance(node, Tok2VecListener) and node.upstream_name in (
-                "*",
-                self.name,
-            ):
-                self.add_listener(node)
+        names = ("*", self.name)
+        if isinstance(getattr(component, "model", None), Model):
+            for node in component.model.walk():
+                if isinstance(node, Tok2VecListener) and node.upstream_name in names:
+                    self.add_listener(node, component.name)
 
     def __call__(self, doc: Doc) -> Doc:
         """Add context-sensitive embeddings to the Doc.tensor attribute, allowing
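With the `listener_map` bookkeeping in place, the two new properties make the listener relationships inspectable from outside. A hedged usage sketch — it assumes an installed v3 pipeline (here `en_core_web_sm`) whose components are configured with `Tok2VecListener` layers pointing at a shared `tok2vec`:

```python
import spacy

# Assumption: en_core_web_sm shares its "tok2vec" component with the
# trainable components downstream of it.
nlp = spacy.load("en_core_web_sm")
tok2vec = nlp.get_pipe("tok2vec")

# Which components listen to this Tok2Vec (the keys of listener_map):
print(tok2vec.listening_components)  # e.g. ['tagger', 'parser', 'ner']
# The flattened Tok2VecListener models themselves:
print(len(tok2vec.listeners))
```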
@@ -66,6 +66,20 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        if getattr(proc, "listening_components", None):
+            for listener in proc.listening_components:
+                if listener in frozen_components and name not in frozen_components:
+                    logger.warn(f"Component '{name}' will be (re)trained, but the "
+                                f"'{listener}' depends on it and is frozen. This means "
+                                f"that the performance of the '{listener}' will be degraded. "
+                                f"You should either freeze both, or neither of the two.")
+
+                if listener not in frozen_components and name in frozen_components:
+                    logger.warn(f"Component '{listener}' will be (re)trained, but it needs the "
+                                f"'{name}' which is frozen. "
+                                f"You should either freeze both, or neither of the two.")
     return nlp
 
 
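The two warning branches reduce to one rule: a `Tok2Vec` and its listeners must be frozen together or not at all. A minimal standalone sketch of that rule, with hypothetical component names and a plain `logging` logger standing in for spaCy's:

```python
import logging

logging.basicConfig()
logger = logging.getLogger("init")

# Hypothetical pipeline state: "tok2vec" feeds listeners in "tagger" and "ner"
listening_map = {"tok2vec": ["tagger", "ner"]}
frozen_components = ["ner"]  # inconsistent: its upstream "tok2vec" still trains

for name, listeners in listening_map.items():
    for listener in listeners:
        # Warn whenever exactly one side of the pair is frozen
        if (name in frozen_components) != (listener in frozen_components):
            logger.warning(
                "'%s' and '%s' are linked by a listener but only one of them "
                "is frozen; freeze both, or neither.", name, listener
            )
```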
@@ -400,7 +400,8 @@ vectors available – otherwise, it won't be able to make the same predictions.
 > ```
 >
 > By default, sourced components will be updated with your data during training.
-> If you want to preserve the component as-is, you can "freeze" it:
+> If you want to preserve the component as-is, you can "freeze" it if the pipeline
+> is not using a shared `Tok2Vec` layer:
 >
 > ```ini
 > [training]
@@ -419,6 +419,16 @@ pipeline = ["parser", "ner", "textcat", "custom"]
 frozen_components = ["parser", "custom"]
 ```
 
+<Infobox variant="warning" title="Shared Tok2Vec layer">
+
+When the components in your pipeline
+[share an embedding layer](/usage/embeddings-transformers#embedding-layers), the
+**performance** of your frozen component will be **degraded** if you continue training
+other layers with the same underlying `Tok2Vec` instance. As a rule of thumb,
+ensure that your frozen components are truly **independent** in the pipeline.
+
+</Infobox>
+
 ### Using registered functions {#config-functions}
 
 The training configuration defined in the config file doesn't have to only
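To make the Infobox's rule of thumb concrete, a config along these lines (the pipeline names are illustrative, not from the commit) freezes the shared encoder together with everything that listens to it, leaving only an independent component trainable:

```ini
[nlp]
pipeline = ["tok2vec", "parser", "ner", "textcat"]

[training]
# "parser" and "ner" listen to "tok2vec", so the three are frozen as a
# unit; "textcat" has its own embedding layer and keeps training.
frozen_components = ["tok2vec", "parser", "ner"]
```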