spaCy/spacy/tests/pipeline/test_annotates_on_update.py

from typing import Callable, Iterable, Iterator

import pytest
from thinc.api import Config

from spacy.lang.en import English
from spacy.language import Language
from spacy.training import Example
from spacy.training.loop import train
from spacy.util import load_model_from_config, registry


@pytest.fixture
def config_str():
    return """
    [nlp]
    lang = "en"
    pipeline = ["sentencizer","assert_sents"]
    disabled = []
    before_creation = null
    after_creation = null
    after_pipeline_creation = null
    batch_size = 1000
    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

    [components]

    [components.assert_sents]
    factory = "assert_sents"

    [components.sentencizer]
    factory = "sentencizer"
    punct_chars = null

    [training]
    dev_corpus = "corpora.dev"
    train_corpus = "corpora.train"
    annotating_components = ["sentencizer"]
    max_steps = 2

    [corpora]

    [corpora.dev]
    @readers = "unannotated_corpus"

    [corpora.train]
    @readers = "unannotated_corpus"
    """


def test_annotates_on_update():
    # The custom component checks for sentence annotation
    @Language.factory("assert_sents", default_config={})
    def assert_sents(nlp, name):
        return AssertSents(name)

    class AssertSents:
        model = None
        is_trainable = True

        def __init__(self, name, **cfg):
            self.name = name

        def __call__(self, doc):
            if not doc.has_annotation("SENT_START"):
                raise ValueError("No sents")
            return doc

        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
            losses.setdefault(self.name, 0.0)

            for example in examples:
                if not example.predicted.has_annotation("SENT_START"):
                    raise ValueError("No sents")

            return losses

        def finish_update(self, sgd=None):
            pass

    nlp = English()
    nlp.add_pipe("sentencizer")
    nlp.add_pipe("assert_sents")

    # When the pipeline runs, annotations are set
    nlp("This is a sentence.")

    examples = []
    for text in ["a a", "b b", "c c"]:
        examples.append(Example(nlp.make_doc(text), nlp(text)))

    for example in examples:
        assert not example.predicted.has_annotation("SENT_START")

    # If updating without setting annotations, assert_sents will raise an error
    with pytest.raises(ValueError):
        nlp.update(examples)

    # Updating while setting annotations for the sentencizer succeeds
    nlp.update(examples, annotates=["sentencizer"])


def test_annotating_components_from_config(config_str):
    @registry.readers("unannotated_corpus")
    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
        return UnannotatedCorpus()

    class UnannotatedCorpus:
        def __call__(self, nlp: Language) -> Iterator[Example]:
            for text in ["a a", "b b", "c c"]:
                doc = nlp.make_doc(text)
                yield Example(doc, doc)

    orig_config = Config().from_str(config_str)
    nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
    train(nlp)

    nlp.config["training"]["annotating_components"] = []
    with pytest.raises(ValueError):
        train(nlp)
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00			`from typing import Callable, Iterable, Iterator`

isort all the things 2023-06-26 12:41:03 +03:00			`import pytest`
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00			`from thinc.api import Config`
isort all the things 2023-06-26 12:41:03 +03:00
			`from spacy.lang.en import English`
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00			`from spacy.language import Language`
			`from spacy.training import Example`
			`from spacy.training.loop import train`
isort all the things 2023-06-26 12:41:03 +03:00			`from spacy.util import load_model_from_config, registry`
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00

			`@pytest.fixture`
			`def config_str():`
			`return """`
			`[nlp]`
			`lang = "en"`
			`pipeline = ["sentencizer","assert_sents"]`
			`disabled = []`
			`before_creation = null`
			`after_creation = null`
			`after_pipeline_creation = null`
			`batch_size = 1000`
			`tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}`

			`[components]`

			`[components.assert_sents]`
			`factory = "assert_sents"`

			`[components.sentencizer]`
			`factory = "sentencizer"`
			`punct_chars = null`

			`[training]`
			`dev_corpus = "corpora.dev"`
			`train_corpus = "corpora.train"`
			`annotating_components = ["sentencizer"]`
			`max_steps = 2`

			`[corpora]`

			`[corpora.dev]`
			`@readers = "unannotated_corpus"`

			`[corpora.train]`
			`@readers = "unannotated_corpus"`
			`"""`


			`def test_annotates_on_update():`
			`# The custom component checks for sentence annotation`
			`@Language.factory("assert_sents", default_config={})`
			`def assert_sents(nlp, name):`
			`return AssertSents(name)`

			`class AssertSents:`
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec after the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop 2023-02-03 17:22:25 +03:00			`model = None`
			`is_trainable = True`

Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00			`def __init__(self, name, **cfg):`
			`self.name = name`

			`def __call__(self, doc):`
			`if not doc.has_annotation("SENT_START"):`
			`raise ValueError("No sents")`
			`return doc`

			`def update(self, examples, *, drop=0.0, sgd=None, losses=None):`
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec after the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop 2023-02-03 17:22:25 +03:00			`losses.setdefault(self.name, 0.0)`

Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00			`for example in examples:`
			`if not example.predicted.has_annotation("SENT_START"):`
			`raise ValueError("No sents")`
`Language.update`: ensure that tok2vec gets updated (#12136) * `Language.update`: ensure that tok2vec gets updated The components in a pipeline can be updated independently. However, tok2vec implementations are an exception to this, since they depend on listeners for their gradients. The update method of a tok2vec implementation computes the tok2vec forward and passes this along with a backprop function to the listeners. This backprop function accumulates gradients for all the listeners. There are two ways in which the accumulated gradients can be used to update the tok2vec weights: 1. Call the `finish_update` method of tok2vec after the `update` method is called on all of the pipes that use a tok2vec listener. 2. Pass an optimizer to the `update` method of tok2vec. In this case, tok2vec will give the last listener a special backprop function that calls `finish_update` on the tok2vec. Unfortunately, `Language.update` did neither of these. Instead, it immediately called `finish_update` on every pipe after `update`. As a result, the tok2vec weights are updated when no gradients have been accumulated from listeners yet. And the gradients of the listeners are only used in the next call to `Language.update` (when `finish_update` is called on tok2vec again). This change fixes this issue by passing the optimizer to the `update` method of trainable pipes, leading to use of the second strategy outlined above. The main updating loop in `Language.update` is also simplified by using the `TrainableComponent` protocol consistently. * Train loop: `sgd` is `Optional[Optimizer]`, do not pass false * Language.update: call pipe finish_update after all pipe updates This does correct and fast updates if multiple components update the same parameters. * Add comment why we moved `finish_update` to a separate loop 2023-02-03 17:22:25 +03:00
			`return losses`

			`def finish_update(self, sgd=None):`
			`pass`
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00
			`nlp = English()`
			`nlp.add_pipe("sentencizer")`
			`nlp.add_pipe("assert_sents")`

			`# When the pipeline runs, annotations are set`
Tidy up and auto-format 2021-07-18 08:44:56 +03:00			`nlp("This is a sentence.")`
Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. * Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation 2021-04-26 17:53:53 +03:00
			`examples = []`
			`for text in ["a a", "b b", "c c"]:`
			`examples.append(Example(nlp.make_doc(text), nlp(text)))`

			`for example in examples:`
			`assert not example.predicted.has_annotation("SENT_START")`

			`# If updating without setting annotations, assert_sents will raise an error`
			`with pytest.raises(ValueError):`
			`nlp.update(examples)`

			`# Updating while setting annotations for the sentencizer succeeds`
			`nlp.update(examples, annotates=["sentencizer"])`


			`def test_annotating_components_from_config(config_str):`
			`@registry.readers("unannotated_corpus")`
			`def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:`
			`return UnannotatedCorpus()`

			`class UnannotatedCorpus:`
			`def __call__(self, nlp: Language) -> Iterator[Example]:`
			`for text in ["a a", "b b", "c c"]:`
			`doc = nlp.make_doc(text)`
			`yield Example(doc, doc)`

			`orig_config = Config().from_str(config_str)`
			`nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)`
			`assert nlp.config["training"]["annotating_components"] == ["sentencizer"]`
			`train(nlp)`

			`nlp.config["training"]["annotating_components"] = []`
			`with pytest.raises(ValueError):`
			`train(nlp)`