From 8c1d86ea9262adb6ed4cd73da8e7baa5748eddc2 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 26 Aug 2021 09:50:35 +0200
Subject: [PATCH] Document use-case of freezing tok2vec (#8992)

* update error msg

* add sentence to docs

* expand note on frozen components
---
 spacy/errors.py                | 12 +++++-------
 spacy/training/initialize.py   |  3 ++-
 website/docs/usage/training.md | 25 ++++++++++++++++---------
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/spacy/errors.py b/spacy/errors.py
index 36331fe15..a206826ff 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -116,13 +116,11 @@ class Warnings:
 
     # New warnings added in v3.x
     W086 = ("Component '{listener}' will be (re)trained, but it needs the component "
-            "'{name}' which is frozen. You can either freeze both, or neither "
-            "of the two. If you're sourcing the component from "
-            "an existing pipeline, you can use the `replace_listeners` setting in "
-            "the config block to replace its token-to-vector listener with a copy "
-            "and make it independent. For example, `replace_listeners = "
-            "[\"model.tok2vec\"]` See the documentation for details: "
-            "https://spacy.io/usage/training#config-components-listeners")
+            "'{name}' which is frozen. If you want to prevent retraining '{name}' "
+            "but want to train '{listener}' on top of it, you should add '{name}' to the "
+            "list of 'annotating_components' in the 'training' block in the config. "
+            "See the documentation for details: "
+            "https://spacy.io/usage/training#annotating-components")
     W087 = ("Component '{name}' will be (re)trained, but the component '{listener}' "
             "depends on it via a listener and is frozen. This means that the "
             "performance of '{listener}' will be degraded. You can either freeze "
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 04d030964..bd014f75f 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -95,7 +95,8 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
             logger.warning(Warnings.W087.format(name=name, listener=listener))
         # We always check this regardless, in case user freezes tok2vec
         if listener not in frozen_components and name in frozen_components:
-            logger.warning(Warnings.W086.format(name=name, listener=listener))
+            if name not in T["annotating_components"]:
+                logger.warning(Warnings.W086.format(name=name, listener=listener))
     return nlp
 
 
diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md
index 0fe34f2a2..94fdad209 100644
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@@ -480,7 +480,10 @@ as-is. They are also excluded when calling
 > still impact your model's performance – for instance, a sentence boundary
 > detector can impact what the parser or entity recognizer considers a valid
 > parse. So the evaluation results should always reflect what your pipeline will
-> produce at runtime.
+> produce at runtime. If you want a frozen component to run (without updating)
+> during training as well, so that downstream components can use its
+> **predictions**, you can add it to the list of
+> [`annotating_components`](/usage/training#annotating-components).
 
 ```ini
 [nlp]
@@ -567,6 +570,10 @@ frozen_components = ["ner"]
 annotating_components = ["sentencizer", "ner"]
 ```
 
+Similarly, a pretrained `tok2vec` layer can be frozen and specified in the list
+of `annotating_components` to ensure that a downstream component can use the
+embedding layer without updating it.
+
 <Infobox variant="warning">
 
 Be aware that non-frozen annotating components with statistical models will
@@ -699,14 +706,14 @@ excluded from the logs and the score won't be weighted.
 
 <Accordion title="Understanding the training output and score types" id="score-types">
 
-| Name                       | Description                                                                                                             |
-| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
-| **Loss**                   | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`.       |
-| **Precision** (P)          | Percentage of predicted annotations that were correct. Should increase.                                                  |
-| **Recall** (R)             | Percentage of reference annotations recovered. Should increase.                                                          |
-| **F-Score** (F)            | Harmonic mean of precision and recall. Should increase.                                                                  |
-| **UAS** / **LAS**          | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase.  |
-| **Speed**                  | Prediction speed in words per second (WPS). Should stay stable.                                                          |
+| Name              | Description                                                                                                              |
+| ----------------- | ------------------------------------------------------------------------------------------------------------------------ |
+| **Loss**          | The training loss representing the amount of work left for the optimizer. Should decrease, but usually not to `0`.        |
+| **Precision** (P) | Percentage of predicted annotations that were correct. Should increase.                                                   |
+| **Recall** (R)    | Percentage of reference annotations recovered. Should increase.                                                           |
+| **F-Score** (F)   | Harmonic mean of precision and recall. Should increase.                                                                   |
+| **UAS** / **LAS** | Unlabeled and labeled attachment score for the dependency parser, i.e. the percentage of correct arcs. Should increase.   |
+| **Speed**         | Prediction speed in words per second (WPS). Should stay stable.                                                           |
 
 Note that if the development data has raw text, some of the gold-standard
 entities might not align to the predicted tokenization. These tokenization
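
For reference, the use-case this patch documents can be expressed in a config along the following lines. This is a minimal sketch, not part of the diff above: the `en_core_web_sm` source, the explicit `Tok2VecListener` wiring for `ner` and the `width` value are illustrative assumptions.

```ini
# Hypothetical config.cfg excerpt: freeze a sourced tok2vec, but keep it
# running during training so a downstream listener can use its predictions.
[nlp]
lang = "en"
pipeline = ["tok2vec", "ner"]

# Reuse the embedding layer from an existing pipeline (illustrative source).
[components.tok2vec]
source = "en_core_web_sm"

[components.ner]
factory = "ner"

# Make the ner model listen to the shared tok2vec component.
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
# Assumed to match the sourced layer's output width.
width = 96
upstream = "tok2vec"

[training]
# Never update the tok2vec weights during training ...
frozen_components = ["tok2vec"]
# ... but still run the component on each batch, so 'ner' trains on top of
# the frozen embeddings and the guarded check added to init_nlp above
# no longer emits W086.
annotating_components = ["tok2vec"]
```

With `tok2vec` listed in both `frozen_components` and `annotating_components`, the embedding layer runs (without updating) during training and its listener receives real predictions, which is exactly the condition under which the new `if name not in T["annotating_components"]` check suppresses the warning.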