From 57640aa83889a7e0c76ab519337d8dd4d3cbddf7 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 20 Jan 2021 01:12:35 +0100 Subject: [PATCH] warn when frozen components break listener pattern (#6766) * warn when frozen components break listener pattern * few notes in the documentation * update arg name * formatting * cleanup * specify listeners return type --- spacy/language.py | 3 +- spacy/pipeline/tok2vec.py | 44 ++++++++++++++-------- spacy/training/initialize.py | 14 +++++++ website/docs/usage/processing-pipelines.md | 3 +- website/docs/usage/training.md | 10 +++++ 5 files changed, 56 insertions(+), 18 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index d98a0e7a1..07e7e4148 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1496,8 +1496,7 @@ class Language: for i, (name1, proc1) in enumerate(self.pipeline): if hasattr(proc1, "find_listeners"): for name2, proc2 in self.pipeline[i + 1 :]: - if isinstance(getattr(proc2, "model", None), Model): - proc1.find_listeners(proc2.model) + proc1.find_listeners(proc2) @classmethod def from_config( diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 0ad875035..1220611b8 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -62,28 +62,42 @@ class Tok2Vec(TrainablePipe): self.vocab = vocab self.model = model self.name = name - self.listeners = [] + self.listener_map = {} self.cfg = {} - def add_listener(self, listener: "Tok2VecListener") -> None: - """Add a listener for a downstream component. Usually internals.""" - self.listeners.append(listener) + @property + def listeners(self) -> List["Tok2VecListener"]: + """RETURNS (List[Tok2VecListener]): The listener models listening to this + component. Usually internals. + """ + return [m for c in self.listening_components for m in self.listener_map[c]] - def find_listeners(self, model: Model) -> None: - """Walk over a model, looking for layers that are Tok2vecListener - subclasses that have an upstream_name that matches this component. - Listeners can also set their upstream_name attribute to the wildcard - string '*' to match any `Tok2Vec`. + @property + def listening_components(self) -> List[str]: + """RETURNS (List[str]): The downstream components listening to this + component. Usually internals. + """ + return list(self.listener_map.keys()) + + def add_listener(self, listener: "Tok2VecListener", component_name: str) -> None: + """Add a listener for a downstream component. Usually internals.""" + self.listener_map.setdefault(component_name, []) + self.listener_map[component_name].append(listener) + + def find_listeners(self, component) -> None: + """Walk over a model of a processing component, looking for layers that + are Tok2vecListener subclasses that have an upstream_name that matches + this component. Listeners can also set their upstream_name attribute to + the wildcard string '*' to match any `Tok2Vec`. You're unlikely to ever need multiple `Tok2Vec` components, so it's fine to leave your listeners upstream_name on '*'. """ - for node in model.walk(): - if isinstance(node, Tok2VecListener) and node.upstream_name in ( - "*", - self.name, - ): - self.add_listener(node) + names = ("*", self.name) + if isinstance(getattr(component, "model", None), Model): + for node in component.model.walk(): + if isinstance(node, Tok2VecListener) and node.upstream_name in names: + self.add_listener(node, component.name) def __call__(self, doc: Doc) -> Doc: """Add context-sensitive embeddings to the Doc.tensor attribute, allowing diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index 1947e7c27..af3979e46 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -66,6 +66,20 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language": with nlp.select_pipes(disable=[*frozen_components, *resume_components]): nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) logger.info(f"Initialized pipeline components: {nlp.pipe_names}") + # Detect components with listeners that are not frozen consistently + for name, proc in nlp.pipeline: + if getattr(proc, "listening_components", None): + for listener in proc.listening_components: + if listener in frozen_components and name not in frozen_components: + logger.warn(f"Component '{name}' will be (re)trained, but the " + f"'{listener}' depends on it and is frozen. This means " + f"that the performance of the '{listener}' will be degraded. " + f"You should either freeze both, or neither of the two.") + + if listener not in frozen_components and name in frozen_components: + logger.warn(f"Component '{listener}' will be (re)trained, but it needs the " + f"'{name}' which is frozen. " + f"You should either freeze both, or neither of the two.") return nlp diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index e55fc9ebd..b9824ea04 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -400,7 +400,8 @@ vectors available – otherwise, it won't be able to make the same predictions. > ``` > > By default, sourced components will be updated with your data during training. -> If you want to preserve the component as-is, you can "freeze" it: +> If you want to preserve the component as-is, you can "freeze" it if the pipeline +> is not using a shared `Tok2Vec` layer: > > ```ini > [training] diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index ad5bec92a..16b2b0f5a 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -419,6 +419,16 @@ pipeline = ["parser", "ner", "textcat", "custom"] frozen_components = ["parser", "custom"] ``` + + +When the components in your pipeline +[share an embedding layer](/usage/embeddings-transformers#embedding-layers), the +**performance** of your frozen component will be **degraded** if you continue training +other layers with the same underlying `Tok2Vec` instance. As a rule of thumb, +ensure that your frozen components are truly **independent** in the pipeline. + + + ### Using registered functions {#config-functions} The training configuration defined in the config file doesn't have to only