diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7f092d5f5..ceb7357fc 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -80,6 +80,8 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Names of pipeline components that should set annotations during training +annotating_components = [] # Location in the config where the dev corpus is defined dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined diff --git a/spacy/language.py b/spacy/language.py index 6f6470533..1a447c11b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1074,6 +1074,7 @@ class Language: losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), + annotates: Iterable[str] = SimpleFrozenList(), ): """Update the models in the pipeline. @@ -1081,10 +1082,13 @@ class Language: _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. - losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. + losses (Dict[str, float]): Dictionary to update with the loss, keyed by + component. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. exclude (Iterable[str]): Names of components that shouldn't be updated. + annotates (Iterable[str]): Names of components that should set + annotations on the predicted examples after updating. RETURNS (Dict[str, float]): The updated losses dictionary DOCS: https://spacy.io/api/language#update @@ -1103,15 +1107,16 @@ class Language: sgd = self._optimizer if component_cfg is None: component_cfg = {} + pipe_kwargs = {} for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) + pipe_kwargs[name] = deepcopy(component_cfg[name]) component_cfg[name].setdefault("drop", drop) + pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - if name in exclude or not hasattr(proc, "update"): - continue - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd not in (None, False): - for name, proc in self.pipeline: + if name not in exclude and hasattr(proc, "update"): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) + if sgd not in (None, False): if ( name not in exclude and hasattr(proc, "is_trainable") @@ -1119,6 +1124,18 @@ class Language: and proc.model not in (True, False, None) ): proc.finish_update(sgd) + if name in annotates: + for doc, eg in zip( + _pipe( + (eg.predicted for eg in examples), + proc=proc, + name=name, + default_error_handler=self.default_error_handler, + kwargs=pipe_kwargs[name], + ), + examples, + ): + eg.predicted = doc return losses def rehearse( diff --git a/spacy/schemas.py b/spacy/schemas.py index 2f25c785f..92315399d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training") before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py new file mode 100644 index 000000000..b17855d85 --- /dev/null +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -0,0 +1,113 @@ +from typing import Callable, Iterable, Iterator +import pytest +import io + +from thinc.api import Config +from spacy.language import Language +from spacy.training import Example +from spacy.training.loop import train +from spacy.lang.en import English +from spacy.util import registry, load_model_from_config + + +@pytest.fixture +def config_str(): + return """ + [nlp] + lang = "en" + pipeline = ["sentencizer","assert_sents"] + disabled = [] + before_creation = null + after_creation = null + after_pipeline_creation = null + batch_size = 1000 + tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + + [components] + + [components.assert_sents] + factory = "assert_sents" + + [components.sentencizer] + factory = "sentencizer" + punct_chars = null + + [training] + dev_corpus = "corpora.dev" + train_corpus = "corpora.train" + annotating_components = ["sentencizer"] + max_steps = 2 + + [corpora] + + [corpora.dev] + @readers = "unannotated_corpus" + + [corpora.train] + @readers = "unannotated_corpus" + """ + + +def test_annotates_on_update(): + # The custom component checks for sentence annotation + @Language.factory("assert_sents", default_config={}) + def assert_sents(nlp, name): + return AssertSents(name) + + class AssertSents: + def __init__(self, name, **cfg): + self.name = name + pass + + def __call__(self, doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("No sents") + return doc + + def update(self, examples, *, drop=0.0, sgd=None, losses=None): + for example in examples: + if not example.predicted.has_annotation("SENT_START"): + raise ValueError("No sents") + return {} + + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("assert_sents") + + # When the pipeline runs, annotations are set + doc = nlp("This is a sentence.") + + examples = [] + for text in ["a a", "b b", "c c"]: + examples.append(Example(nlp.make_doc(text), nlp(text))) + + for example in examples: + assert not example.predicted.has_annotation("SENT_START") + + # If updating without setting annotations, assert_sents will raise an error + with pytest.raises(ValueError): + nlp.update(examples) + + # Updating while setting annotations for the sentencizer succeeds + nlp.update(examples, annotates=["sentencizer"]) + + +def test_annotating_components_from_config(config_str): + @registry.readers("unannotated_corpus") + def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]: + return UnannotatedCorpus() + + class UnannotatedCorpus: + def __call__(self, nlp: Language) -> Iterator[Example]: + for text in ["a a", "b b", "c c"]: + doc = nlp.make_doc(text) + yield Example(doc, doc) + + orig_config = Config().from_str(config_str) + nlp = load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.config["training"]["annotating_components"] == ["sentencizer"] + train(nlp) + + nlp.config["training"]["annotating_components"] = [] + with pytest.raises(ValueError): + train(nlp) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9af8395a6..0b84db4c0 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,7 +1,9 @@ import pytest from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names +from spacy.lang.en import English @pytest.fixture @@ -417,3 +419,41 @@ def test_pipe_methods_initialize(): assert "test" in nlp.config["initialize"]["components"] nlp.remove_pipe("test") assert "test" not in nlp.config["initialize"]["components"] + + +def test_update_with_annotates(): + name = "test_with_annotates" + results = {} + + def make_component(name): + results[name] = "" + + def component(doc): + nonlocal results + results[name] += doc.text + return doc + + return component + + c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) + c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + + components = set([f"{name}1", f"{name}2"]) + + nlp = English() + texts = ["a", "bb", "ccc"] + examples = [] + for text in texts: + examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) + + for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]: + for key in results: + results[key] = "" + nlp = English(vocab=nlp.vocab) + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + nlp.update(examples, annotates=components_to_annotate) + for component in components_to_annotate: + assert results[component] == "".join(eg.predicted.text for eg in examples) + for component in components - set(components_to_annotate): + assert results[component] == "" diff --git a/spacy/training/loop.py b/spacy/training/loop.py index ecfa12fdb..85aa458f0 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -74,6 +74,8 @@ def train( # Components that shouldn't be updated during training frozen_components = T["frozen_components"] + # Components that should set annotations on update + annotating_components = T["annotating_components"] # Create iterator, which yields out info after each optimization step. training_step_iterator = train_while_improving( nlp, @@ -86,11 +88,17 @@ def train( max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], exclude=frozen_components, + annotating_components=annotating_components, ) clean_output_dir(output_path) stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") + if annotating_components: + stdout.write( + msg.info(f"Set annotations on update for: {annotating_components}") + + "\n" + ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) @@ -142,6 +150,7 @@ def train_while_improving( patience: int, max_steps: int, exclude: List[str], + annotating_components: List[str], ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -193,7 +202,12 @@ def train_while_improving( dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + subbatch, + drop=dropout, + losses=losses, + sgd=False, + exclude=exclude, + annotates=annotating_components, ) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 0c2a4c9f3..576ab8394 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| Name | Description | +| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5aaa1d23e..4698529a1 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients. > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the depending on the sentence lengths. However, it does provide the transformer with more meaningful windows to attend over. +To set sentence boundaries with the `sentencizer` during training, add a +`sentencizer` to the beginning of the pipeline and include it in +[`[training.annotating_components]`](/usage/training#annotating-components) to +have it set the sentence boundaries before the `transformer` component runs. + ### strided_spans.v1 {#strided_spans tag="registered function"} > #### Example config diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9f929fe19..1b345050c 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -414,11 +414,11 @@ as-is. They are also excluded when calling > #### Note on frozen components > > Even though frozen components are not **updated** during training, they will -> still **run** during training and evaluation. This is very important, because -> they may still impact your model's performance – for instance, a sentence -> boundary detector can impact what the parser or entity recognizer considers a -> valid parse. So the evaluation results should always reflect what your -> pipeline will produce at runtime. +> still **run** during evaluation. This is very important, because they may +> still impact your model's performance – for instance, a sentence boundary +> detector can impact what the parser or entity recognizer considers a valid +> parse. So the evaluation results should always reflect what your pipeline will +> produce at runtime. ```ini [nlp] @@ -455,6 +455,64 @@ replace_listeners = ["model.tok2vec"] +### Using predictions from preceding components {#annotating-components new="3.1"} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. A +component receives [`Example.predicted`](/api/example) as input and compares its +predictions to [`Example.reference`](/api/example) without saving its +annotations in the `predicted` doc. + +Instead, if certain components should **set their annotations** during training, +use the setting `annotating_components` in the `[training]` block to specify a +list of components. For example, the feature `DEP` from the parser could be used +as a tagger feature by including `DEP` in the tok2vec `attrs` and including +`parser` in `annotating_components`: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + +Any component in the pipeline can be included as an annotating component, +including frozen components. Frozen components can set annotations during +training just as they would set annotations during evaluation or when the final +pipeline is run. The config excerpt below shows how a frozen `ner` component and +a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the +entity linker during training: + +```ini +### config.cfg (excerpt) +[nlp] +pipeline = ["sentencizer", "ner", "entity_linker"] + +[components.ner] +source = "en_core_web_sm" + +[training] +frozen_components = ["ner"] +annotating_components = ["sentencizer", "ner"] +``` + + + +Be aware that non-frozen annotating components with statistical models will +**run twice** on each batch, once to update the model and once to apply the +now-updated model to the predicted docs. + + + ### Using registered functions {#config-functions} The training configuration defined in the config file doesn't have to only