mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
warn when frozen components break listener pattern (#6766)
* warn when frozen components break listener pattern
* few notes in the documentation
* update arg name
* formatting
* cleanup
* specify listeners return type
This commit is contained in:
parent 88acbfc050
commit 57640aa838
@@ -1496,8 +1496,7 @@ class Language:
         for i, (name1, proc1) in enumerate(self.pipeline):
             if hasattr(proc1, "find_listeners"):
                 for name2, proc2 in self.pipeline[i + 1 :]:
-                    if isinstance(getattr(proc2, "model", None), Model):
-                        proc1.find_listeners(proc2.model)
+                    proc1.find_listeners(proc2)
 
     @classmethod
     def from_config(
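Before this change, `Language` unwrapped `proc2.model` itself; now it hands the whole component to `find_listeners`, so the `Tok2Vec` side can record *which* component is listening. Below is a standalone sketch of that wiring loop, using toy stand-ins rather than spaCy's real classes (only the loop's logic mirrors the diff):

```python
# Toy stand-ins for thinc's Model.walk() and spaCy's Tok2VecListener.
class ToyListener:
    upstream_name = "*"  # wildcard: listen to any upstream Tok2Vec

class ToyModel:
    def walk(self):  # thinc Models expose walk() over their layer tree
        yield from [ToyListener()]

class ToyNER:
    name = "ner"
    model = ToyModel()

class ToyTok2Vec:
    name = "tok2vec"

    def __init__(self):
        self.listener_map = {}

    def find_listeners(self, component):
        # The model check now lives here, not in Language
        model = getattr(component, "model", None)
        if model is None:  # the real code checks isinstance(model, thinc.api.Model)
            return
        for node in model.walk():
            if isinstance(node, ToyListener) and node.upstream_name in ("*", self.name):
                self.listener_map.setdefault(component.name, []).append(node)

pipeline = [("tok2vec", ToyTok2Vec()), ("ner", ToyNER())]
# The loop from the diff: each earlier component inspects each later one
for i, (name1, proc1) in enumerate(pipeline):
    if hasattr(proc1, "find_listeners"):
        for name2, proc2 in pipeline[i + 1 :]:
            proc1.find_listeners(proc2)

print(pipeline[0][1].listener_map)  # {'ner': [<ToyListener object ...>]}
```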
@@ -62,28 +62,42 @@ class Tok2Vec(TrainablePipe):
         self.vocab = vocab
         self.model = model
         self.name = name
-        self.listeners = []
+        self.listener_map = {}
         self.cfg = {}
 
-    def add_listener(self, listener: "Tok2VecListener") -> None:
-        """Add a listener for a downstream component. Usually internals."""
-        self.listeners.append(listener)
+    @property
+    def listeners(self) -> List["Tok2VecListener"]:
+        """RETURNS (List[Tok2VecListener]): The listener models listening to this
+        component. Usually internals.
+        """
+        return [m for c in self.listening_components for m in self.listener_map[c]]
 
-    def find_listeners(self, model: Model) -> None:
-        """Walk over a model, looking for layers that are Tok2vecListener
-        subclasses that have an upstream_name that matches this component.
-        Listeners can also set their upstream_name attribute to the wildcard
-        string '*' to match any `Tok2Vec`.
+    @property
+    def listening_components(self) -> List[str]:
+        """RETURNS (List[str]): The downstream components listening to this
+        component. Usually internals.
+        """
+        return list(self.listener_map.keys())
+
+    def add_listener(self, listener: "Tok2VecListener", component_name: str) -> None:
+        """Add a listener for a downstream component. Usually internals."""
+        self.listener_map.setdefault(component_name, [])
+        self.listener_map[component_name].append(listener)
+
+    def find_listeners(self, component) -> None:
+        """Walk over a model of a processing component, looking for layers that
+        are Tok2vecListener subclasses that have an upstream_name that matches
+        this component. Listeners can also set their upstream_name attribute to
+        the wildcard string '*' to match any `Tok2Vec`.
 
         You're unlikely to ever need multiple `Tok2Vec` components, so it's
         fine to leave your listeners upstream_name on '*'.
         """
-        for node in model.walk():
-            if isinstance(node, Tok2VecListener) and node.upstream_name in (
-                "*",
-                self.name,
-            ):
-                self.add_listener(node)
+        names = ("*", self.name)
+        if isinstance(getattr(component, "model", None), Model):
+            for node in component.model.walk():
+                if isinstance(node, Tok2VecListener) and node.upstream_name in names:
+                    self.add_listener(node, component.name)
 
     def __call__(self, doc: Doc) -> Doc:
         """Add context-sensitive embeddings to the Doc.tensor attribute, allowing
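With the `listener_map` bookkeeping in place, the two new properties make the listener relationships inspectable from outside. A hedged usage sketch — it assumes an installed v3 pipeline (here `en_core_web_sm`) whose components are configured with `Tok2VecListener` layers pointing at a shared `tok2vec`:

```python
import spacy

# Assumption: en_core_web_sm shares its "tok2vec" component with the
# trainable components downstream of it.
nlp = spacy.load("en_core_web_sm")
tok2vec = nlp.get_pipe("tok2vec")

# Which components listen to this Tok2Vec (the keys of listener_map):
print(tok2vec.listening_components)  # e.g. ['tagger', 'parser', 'ner']
# The flattened Tok2VecListener models themselves:
print(len(tok2vec.listeners))
```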
@@ -66,6 +66,20 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
         nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
     logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    # Detect components with listeners that are not frozen consistently
+    for name, proc in nlp.pipeline:
+        if getattr(proc, "listening_components", None):
+            for listener in proc.listening_components:
+                if listener in frozen_components and name not in frozen_components:
+                    logger.warn(f"Component '{name}' will be (re)trained, but the "
+                                f"'{listener}' depends on it and is frozen. This means "
+                                f"that the performance of the '{listener}' will be degraded. "
+                                f"You should either freeze both, or neither of the two.")
+
+                if listener not in frozen_components and name in frozen_components:
+                    logger.warn(f"Component '{listener}' will be (re)trained, but it needs the "
+                                f"'{name}' which is frozen. "
+                                f"You should either freeze both, or neither of the two.")
     return nlp
 
 
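The two warning branches reduce to one rule: a `Tok2Vec` and its listeners must be frozen together or not at all. A minimal standalone sketch of that rule, with hypothetical component names and a plain `logging` logger standing in for spaCy's:

```python
import logging

logging.basicConfig()
logger = logging.getLogger("init")

# Hypothetical pipeline state: "tok2vec" feeds listeners in "tagger" and "ner"
listening_map = {"tok2vec": ["tagger", "ner"]}
frozen_components = ["ner"]  # inconsistent: its upstream "tok2vec" still trains

for name, listeners in listening_map.items():
    for listener in listeners:
        # Warn whenever exactly one side of the pair is frozen
        if (name in frozen_components) != (listener in frozen_components):
            logger.warning(
                "'%s' and '%s' are linked by a listener but only one of them "
                "is frozen; freeze both, or neither.", name, listener
            )
```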
@@ -400,7 +400,8 @@ vectors available – otherwise, it won't be able to make the same predictions.
 > ```
 >
 > By default, sourced components will be updated with your data during training.
-> If you want to preserve the component as-is, you can "freeze" it:
+> If you want to preserve the component as-is, you can "freeze" it if the pipeline
+> is not using a shared `Tok2Vec` layer:
 >
 > ```ini
 > [training]
@@ -419,6 +419,16 @@ pipeline = ["parser", "ner", "textcat", "custom"]
 frozen_components = ["parser", "custom"]
 ```
 
+<Infobox variant="warning" title="Shared Tok2Vec layer">
+
+When the components in your pipeline
+[share an embedding layer](/usage/embeddings-transformers#embedding-layers), the
+**performance** of your frozen component will be **degraded** if you continue training
+other layers with the same underlying `Tok2Vec` instance. As a rule of thumb,
+ensure that your frozen components are truly **independent** in the pipeline.
+
+</Infobox>
+
 ### Using registered functions {#config-functions}
 
 The training configuration defined in the config file doesn't have to only
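To make the Infobox's rule of thumb concrete, a config along these lines (the pipeline names are illustrative, not from the commit) freezes the shared encoder together with everything that listens to it, leaving only an independent component trainable:

```ini
[nlp]
pipeline = ["tok2vec", "parser", "ner", "textcat"]

[training]
# "parser" and "ner" listen to "tok2vec", so the three are frozen as a
# unit; "textcat" has its own embedding layer and keeps training.
frozen_components = ["tok2vec", "parser", "ner"]
```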