From 57640aa83889a7e0c76ab519337d8dd4d3cbddf7 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Wed, 20 Jan 2021 01:12:35 +0100
Subject: [PATCH] warn when frozen components break listener pattern (#6766)

* warn when frozen components break listener pattern

* few notes in the documentation

* update arg name

* formatting

* cleanup

* specify listeners return type
---
 spacy/language.py                          |  3 +-
 spacy/pipeline/tok2vec.py                  | 44 ++++++++++++++--------
 spacy/training/initialize.py               | 14 +++++++
 website/docs/usage/processing-pipelines.md |  3 +-
 website/docs/usage/training.md             | 10 +++++
 5 files changed, 56 insertions(+), 18 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index d98a0e7a1..07e7e4148 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1496,8 +1496,7 @@ class Language:
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
                 for name2, proc2 in self.pipeline[i + 1 :]:
-                    if isinstance(getattr(proc2, "model", None), Model):
-                        proc1.find_listeners(proc2.model)
+                    proc1.find_listeners(proc2)
 
     @classmethod
     def from_config(
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 0ad875035..1220611b8 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -62,28 +62,42 @@ class Tok2Vec(TrainablePipe):
         self.vocab = vocab
         self.model = model
         self.name = name
-        self.listeners = []
+        self.listener_map = {}
         self.cfg = {}
 
-    def add_listener(self, listener: "Tok2VecListener") -> None:
-        """Add a listener for a downstream component. Usually internals."""
-        self.listeners.append(listener)
+    @property
+    def listeners(self) -> List["Tok2VecListener"]:
+        """RETURNS (List[Tok2VecListener]): The listener models listening to this
+        component. Usually internals.
+        """
+        return [m for c in self.listening_components for m in self.listener_map[c]]
 
-    def find_listeners(self, model: Model) -> None:
-        """Walk over a model, looking for layers that are Tok2vecListener
-        subclasses that have an upstream_name that matches this component.
-        Listeners can also set their upstream_name attribute to the wildcard
-        string '*' to match any `Tok2Vec`.
+    @property
+    def listening_components(self) -> List[str]:
+        """RETURNS (List[str]): The downstream components listening to this
+        component. Usually internals.
+        """
+        return list(self.listener_map.keys())
+
+    def add_listener(self, listener: "Tok2VecListener", component_name: str) -> None:
+        """Add a listener for a downstream component. Usually internals."""
+        self.listener_map.setdefault(component_name, [])
+        self.listener_map[component_name].append(listener)
+
+    def find_listeners(self, component) -> None:
+        """Walk over a model of a processing component, looking for layers that
+        are Tok2vecListener subclasses that have an upstream_name that matches
+        this component. Listeners can also set their upstream_name attribute to
+        the wildcard string '*' to match any `Tok2Vec`.
 
         You're unlikely to ever need multiple `Tok2Vec` components, so it's
         fine to leave your listeners upstream_name on '*'.
         """
-        for node in model.walk():
-            if isinstance(node, Tok2VecListener) and node.upstream_name in (
-                "*",
-                self.name,
-            ):
-                self.add_listener(node)
+        names = ("*", self.name)
+        if isinstance(getattr(component, "model", None), Model):
+            for node in component.model.walk():
+                if isinstance(node, Tok2VecListener) and node.upstream_name in names:
+                    self.add_listener(node, component.name)
 
     def __call__(self, doc: Doc) -> Doc:
         """Add context-sensitive embeddings to the Doc.tensor attribute, allowing
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 1947e7c27..af3979e46 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -66,6 +66,20 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
         logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        if getattr(proc, "listening_components", None):
+            for listener in proc.listening_components:
+                if listener in frozen_components and name not in frozen_components:
+                    logger.warn(f"Component '{name}' will be (re)trained, but the "
+                                f"'{listener}' depends on it and is frozen. This means "
+                                f"that the performance of the '{listener}' will be degraded. "
+                                f"You should either freeze both, or neither of the two.")
+
+                if listener not in frozen_components and name in frozen_components:
+                    logger.warn(f"Component '{listener}' will be (re)trained, but it needs the "
+                                f"'{name}' which is frozen. "
+                                f"You should either freeze both, or neither of the two.")
     return nlp
 
 
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index e55fc9ebd..b9824ea04 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -400,7 +400,8 @@ vectors available – otherwise, it won't be able to make the same predictions.
 > ```
 >
 > By default, sourced components will be updated with your data during training.
-> If you want to preserve the component as-is, you can "freeze" it:
+> If you want to preserve the component as-is, you can "freeze" it, provided the
+> pipeline is not using a shared `Tok2Vec` layer:
 >
 > ```ini
 > [training]
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index ad5bec92a..16b2b0f5a 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -419,6 +419,16 @@ pipeline = ["parser", "ner", "textcat", "custom"]
 frozen_components = ["parser", "custom"]
 ```
 
+<Infobox variant="warning" title="Shared Tok2Vec layer">
+
+When the components in your pipeline
+[share an embedding layer](/usage/embeddings-transformers#embedding-layers), the
+**performance** of a frozen component will be **degraded** if you continue
+training other components on the same underlying `Tok2Vec` instance. As a rule
+of thumb, ensure your frozen components are truly **independent** in the pipeline.
+
+</Infobox>
+
 ### Using registered functions {#config-functions}
 
 The training configuration defined in the config file doesn't have to only