Add training option to set annotations on update (#7767)

* Add training option to set annotations on update

  Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call.

* Rename to annotates / annotating_components
* Add test for `annotating_components` when training from config
* Add documentation
2025-11-08 11:57:39 +03:00 · 2021-04-26 16:53:53 +02:00 · 2021-04-26 16:53:53 +02:00 · 95c0833656
commit 95c0833656
parent c105ed10fd
9 changed files with 289 additions and 38 deletions
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -80,6 +80,8 @@ eval_frequency = 200
 score_weights = {}
 # Names of pipeline components that shouldn't be updated during training
 frozen_components = []
+# Names of pipeline components that should set annotations during training
+annotating_components = []
 # Location in the config where the dev corpus is defined
 dev_corpus = "corpora.dev"
 # Location in the config where the train corpus is defined
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1074,6 +1074,7 @@ class Language:
        losses: Optional[Dict[str, float]] = None,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
    ):
        """Update the models in the pipeline.

@ -1081,10 +1082,13 @@ class Language:
        _: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
        sgd (Optimizer): An optimizer.
-        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
+            component.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
@ -1103,15 +1107,16 @@ class Language:
            sgd = self._optimizer
        if component_cfg is None:
            component_cfg = {}
+        pipe_kwargs = {}
        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
+            pipe_kwargs[name] = deepcopy(component_cfg[name])
            component_cfg[name].setdefault("drop", drop)
+            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
        for name, proc in self.pipeline:
-            if name in exclude or not hasattr(proc, "update"):
-                continue
-            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd not in (None, False):
-            for name, proc in self.pipeline:
+            if name not in exclude and hasattr(proc, "update"):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+            if sgd not in (None, False):
                if (
                    name not in exclude
                    and hasattr(proc, "is_trainable")
@ -1119,6 +1124,18 @@ class Language:
                    and proc.model not in (True, False, None)
                ):
                    proc.finish_update(sgd)
+            if name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=proc,
+                        name=name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
        return losses

    def rehearse(
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel):
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    logger: Logger = Field(..., title="The logger to track training progress")
    frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
+    annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
    before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
    # fmt: on

--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@ -0,0 +1,113 @@
+from typing import Callable, Iterable, Iterator
+import pytest
+import io
+
+from thinc.api import Config
+from spacy.language import Language
+from spacy.training import Example
+from spacy.training.loop import train
+from spacy.lang.en import English
+from spacy.util import registry, load_model_from_config
+
+
+@pytest.fixture
+def config_str():
+    """Return a minimal training config in which only the sentencizer annotates.
+
+    The pipeline is `sentencizer` followed by `assert_sents`, both corpora use
+    the `unannotated_corpus` reader, and `max_steps` is set to 2.
+    """
+    config = """
+    [nlp]
+    lang = "en"
+    pipeline = ["sentencizer","assert_sents"]
+    disabled = []
+    before_creation = null
+    after_creation = null
+    after_pipeline_creation = null
+    batch_size = 1000
+    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+    [components]
+
+    [components.assert_sents]
+    factory = "assert_sents"
+
+    [components.sentencizer]
+    factory = "sentencizer"
+    punct_chars = null
+
+    [training]
+    dev_corpus = "corpora.dev"
+    train_corpus = "corpora.train"
+    annotating_components = ["sentencizer"]
+    max_steps = 2
+
+    [corpora]
+
+    [corpora.dev]
+    @readers = "unannotated_corpus"
+
+    [corpora.train]
+    @readers = "unannotated_corpus"
+    """
+    return config
+
+
+def test_annotates_on_update():
+    """Check that `annotates` lets a later component see earlier predictions.
+
+    `assert_sents` raises a ValueError unless sentence starts are set on the
+    doc it receives, so `nlp.update` is only expected to succeed when the
+    preceding sentencizer is listed in `annotates`.
+    """
+    # The custom component checks for sentence annotation
+    @Language.factory("assert_sents", default_config={})
+    def assert_sents(nlp, name):
+        return AssertSents(name)
+
+    class AssertSents:
+        def __init__(self, name, **cfg):
+            self.name = name
+
+        def __call__(self, doc):
+            if not doc.has_annotation("SENT_START"):
+                raise ValueError("No sents")
+            return doc
+
+        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            for example in examples:
+                if not example.predicted.has_annotation("SENT_START"):
+                    raise ValueError("No sents")
+            return {}
+
+    nlp = English()
+    nlp.add_pipe("sentencizer")
+    nlp.add_pipe("assert_sents")
+
+    # When the full pipeline runs, the sentencizer sets annotations first, so
+    # assert_sents does not raise. The returned doc itself is not needed.
+    nlp("This is a sentence.")
+
+    # The predicted docs come from make_doc, i.e. without running the pipeline
+    examples = [
+        Example(nlp.make_doc(text), nlp(text)) for text in ["a a", "b b", "c c"]
+    ]
+    for example in examples:
+        assert not example.predicted.has_annotation("SENT_START")
+
+    # If updating without setting annotations, assert_sents will raise an error
+    with pytest.raises(ValueError):
+        nlp.update(examples)
+
+    # Updating while setting annotations for the sentencizer succeeds
+    nlp.update(examples, annotates=["sentencizer"])
+
+
+def test_annotating_components_from_config(config_str):
+    """Check that `[training] annotating_components` is honoured by `train`.
+
+    NOTE(review): the config's pipeline includes `assert_sents`, which this
+    test does not register itself; it appears to rely on the factory
+    registered inside `test_annotates_on_update` -- confirm test ordering.
+    """
+    class UnannotatedCorpus:
+        # Yields examples whose predicted and reference docs are the same
+        # doc created with make_doc.
+        def __call__(self, nlp: Language) -> Iterator[Example]:
+            for text in ("a a", "b b", "c c"):
+                tokenized = nlp.make_doc(text)
+                yield Example(tokenized, tokenized)
+
+    @registry.readers("unannotated_corpus")
+    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
+        return UnannotatedCorpus()
+
+    config = Config().from_str(config_str)
+    nlp = load_model_from_config(config, auto_fill=True, validate=True)
+    annotating = nlp.config["training"]["annotating_components"]
+    assert annotating == ["sentencizer"]
+    # Training with the sentencizer as annotating component must not raise
+    train(nlp)
+
+    # With no annotating components, training is expected to fail
+    nlp.config["training"]["annotating_components"] = []
+    with pytest.raises(ValueError):
+        train(nlp)
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@ -1,7 +1,9 @@
 import pytest
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names
+from spacy.lang.en import English


@pytest.fixture
@ -417,3 +419,41 @@ def test_pipe_methods_initialize():
    assert "test" in nlp.config["initialize"]["components"]
    nlp.remove_pipe("test")
    assert "test" not in nlp.config["initialize"]["components"]
+
+
+def test_update_with_annotates():
+    """Check that `update` only runs the components listed in `annotates`.
+
+    Each test component appends the text of every doc it processes to
+    `results`. After an update, the components named in `annotates` must have
+    seen all predicted docs and the remaining components none.
+    """
+    name = "test_with_annotates"
+    results = {}
+
+    def make_component(name):
+        results[name] = ""
+
+        def component(doc):
+            # `results` is only mutated, never rebound, so no `nonlocal` needed
+            results[name] += doc.text
+            return doc
+
+        return component
+
+    # Called for registration only; the return values are not used
+    Language.component(f"{name}1", func=make_component(f"{name}1"))
+    Language.component(f"{name}2", func=make_component(f"{name}2"))
+
+    components = {f"{name}1", f"{name}2"}
+
+    nlp = English()
+    texts = ["a", "bb", "ccc"]
+    examples = [Example(nlp.make_doc(text), nlp.make_doc(text)) for text in texts]
+
+    annotates_variants = [
+        [],
+        [f"{name}1"],
+        [f"{name}1", f"{name}2"],
+        [f"{name}2", f"{name}1"],
+    ]
+    for components_to_annotate in annotates_variants:
+        # Reset the recorded texts before every run
+        for key in results:
+            results[key] = ""
+        nlp = English(vocab=nlp.vocab)
+        nlp.add_pipe(f"{name}1")
+        nlp.add_pipe(f"{name}2")
+        nlp.update(examples, annotates=components_to_annotate)
+        expected = "".join(eg.predicted.text for eg in examples)
+        for component in components_to_annotate:
+            assert results[component] == expected
+        for component in components - set(components_to_annotate):
+            assert results[component] == ""
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@ -74,6 +74,8 @@ def train(

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
@ -86,11 +88,17 @@ def train(
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
+        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}")
+            + "\n"
+        )
    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
@ -142,6 +150,7 @@ def train_while_improving(
    patience: int,
    max_steps: int,
    exclude: List[str],
+    annotating_components: List[str],
 ):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@ -193,7 +202,12 @@ def train_while_improving(
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).

-| Name                  | Description                                                                                                                                                                                                                                                                                                                         |
-| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                              |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                                                                                                        |
-| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
-| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
-| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
-| `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
-| `max_epochs`          | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~                                                                                             |
-| `max_steps`           | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                   |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
-| `patience`            | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~                                                                                                                                                                                                        |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
-| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
+| Name                    | Description                                                                                                                                                                                                                                                                                                                         |
+| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient`   | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                              |
+| `batcher`               | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                                                                                                        |
+| `before_to_disk`        | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
+| `dev_corpus`            | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
+| `dropout`               | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
+| `eval_frequency`        | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
+| `frozen_components`     | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
+| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                          |
+| `gpu_allocator`         | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
+| `logger`                | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
+| `max_epochs`            | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~                                                                                             |
+| `max_steps`             | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                   |
+| `optimizer`             | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
+| `patience`              | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~                                                                                                                                                                                                        |
+| `score_weights`         | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
+| `seed`                  | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
+| `train_corpus`          | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |

 ### pretraining {#config-pretraining tag="section,optional"}

--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients.
 > losses = trf.update(examples, sgd=optimizer)
 > ```

-| Name              | Description                                                                                                                                                                      |
-| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
-| _keyword-only_    |                                                                                                                                                                                  |
-| `drop`            | The dropout rate. ~~float~~                                                                                                                                                      |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                    |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                         |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                            |
+| Name           | Description                                                                                                                                                                      |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`     | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                  |
+| `drop`         | The dropout rate. ~~float~~                                                                                                                                                      |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                    |
+| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                         |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                            |

 ## Transformer.create_optimizer {#create_optimizer tag="method"}

@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the
 depending on the sentence lengths. However, it does provide the transformer with
 more meaningful windows to attend over.

+To set sentence boundaries with the `sentencizer` during training, add a
+`sentencizer` to the beginning of the pipeline and include it in
+[`[training.annotating_components]`](/usage/training#annotating-components) to
+have it set the sentence boundaries before the `transformer` component runs.
+
 ### strided_spans.v1 {#strided_spans tag="registered function"}

 > #### Example config
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -414,11 +414,11 @@ as-is. They are also excluded when calling
 > #### Note on frozen components
 >
 > Even though frozen components are not **updated** during training, they will
-> still **run** during training and evaluation. This is very important, because
-> they may still impact your model's performance – for instance, a sentence
-> boundary detector can impact what the parser or entity recognizer considers a
-> valid parse. So the evaluation results should always reflect what your
-> pipeline will produce at runtime.
+> still **run** during evaluation. This is very important, because they may
+> still impact your model's performance – for instance, a sentence boundary
+> detector can impact what the parser or entity recognizer considers a valid
+> parse. So the evaluation results should always reflect what your pipeline will
+> produce at runtime.

 ```ini
 [nlp]
@ -455,6 +455,64 @@ replace_listeners = ["model.tok2vec"]

 </Infobox>

+### Using predictions from preceding components {#annotating-components new="3.1"}
+
+By default, components are updated in isolation during training, which means
+that they don't see the predictions of any earlier components in the pipeline. A
+component receives [`Example.predicted`](/api/example) as input and compares its
+predictions to [`Example.reference`](/api/example) without saving its
+annotations in the `predicted` doc.
+
+If certain components should instead **set their annotations** during training,
+list them in the `annotating_components` setting of the `[training]` block. For
+example, the feature `DEP` from the parser could be used as a tagger feature by
+including `DEP` in the tok2vec `attrs` and including `parser` in
+`annotating_components`:
+
+```ini
+### config.cfg (excerpt) {highlight="7,12"}
+[nlp]
+pipeline = ["parser", "tagger"]
+
+[components.tagger.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tagger.model.tok2vec.encode.width}
+attrs = ["NORM","DEP"]
+rows = [5000,2500]
+include_static_vectors = false
+
+[training]
+annotating_components = ["parser"]
+```
+
+Any component in the pipeline can be included as an annotating component,
+including frozen components. Frozen components can set annotations during
+training just as they would set annotations during evaluation or when the final
+pipeline is run. The config excerpt below shows how a frozen `ner` component and
+a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the
+entity linker during training:
+
+```ini
+### config.cfg (excerpt)
+[nlp]
+pipeline = ["sentencizer", "ner", "entity_linker"]
+
+[components.ner]
+source = "en_core_web_sm"
+
+[training]
+frozen_components = ["ner"]
+annotating_components = ["sentencizer", "ner"]
+```
+
+<Infobox variant="warning" title="Training speed with annotating components" id="annotating-components-speed">
+
+Be aware that non-frozen annotating components with statistical models will
+**run twice** on each batch, once to update the model and once to apply the
+now-updated model to the predicted docs.
+
+</Infobox>
+
 ### Using registered functions {#config-functions}

 The training configuration defined in the config file doesn't have to only