Add callback to copy vocab/tokenizer from model (#7750)

* Add callback to copy vocab/tokenizer from model

Add callback `spacy.copy_from_base_model.v1` to copy the tokenizer
settings and/or vocab (including vectors) from a base model.

* Move spacy.copy_from_base_model.v1 to spacy.training.callbacks

* Add documentation

* Modify to specify model as tokenizer and vocab params
Adriane Boyd 2021-04-22 12:36:50 +02:00 committed by GitHub
parent f68fc29130
commit bdb485cc80
4 changed files with 66 additions and 0 deletions

spacy/errors.py

@@ -501,6 +501,9 @@ class Errors:
    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
    # New errors added in v3.x
    E872 = ("Unable to copy tokenizer from base model due to different "
            'tokenizer settings: current tokenizer config "{curr_config}" '
            'vs. base model "{base_config}"')
    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
            "issue: https://github.com/explosion/spaCy/issues")

spacy/training/__init__.py

@@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
from .loggers import console_logger, wandb_logger # noqa: F401
from .callbacks import create_copy_from_base_model # noqa: F401

spacy/training/callbacks.py

@@ -0,0 +1,32 @@
from typing import Callable, Optional

from ..errors import Errors
from ..language import Language
from ..util import load_model, registry, logger


@registry.callbacks("spacy.copy_from_base_model.v1")
def create_copy_from_base_model(
    tokenizer: Optional[str] = None,
    vocab: Optional[str] = None,
) -> Callable[[Language], None]:
    def copy_from_base_model(nlp):
        if tokenizer:
            logger.info(f"Copying tokenizer from: {tokenizer}")
            base_nlp = load_model(tokenizer)
            if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
            else:
                raise ValueError(
                    Errors.E872.format(
                        curr_config=nlp.config["nlp"]["tokenizer"],
                        base_config=base_nlp.config["nlp"]["tokenizer"],
                    )
                )
        if vocab:
            logger.info(f"Copying vocab from: {vocab}")
            # only reload if the vocab is from a different model
            if tokenizer != vocab:
                base_nlp = load_model(vocab)
            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())
    return copy_from_base_model
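
A minimal usage sketch (not part of the commit): the factory can also be called directly and applied to an existing pipeline. The package name `en_core_web_md` is only illustrative; any installed pipeline whose tokenizer settings match can serve as the base model.

```python
import spacy
from spacy.training.callbacks import create_copy_from_base_model

# Build the callback and apply it to a blank English pipeline
nlp = spacy.blank("en")
copy_from_base = create_copy_from_base_model(
    tokenizer="en_core_web_md", vocab="en_core_web_md"
)
copy_from_base(nlp)  # nlp now uses the base model's tokenizer settings and vocab
```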

website/docs/api/top-level.md

@@ -8,6 +8,7 @@ menu:
  - ['Readers', 'readers']
  - ['Batchers', 'batchers']
  - ['Augmenters', 'augmenters']
  - ['Callbacks', 'callbacks']
  - ['Training & Alignment', 'gold']
  - ['Utility Functions', 'util']
---
@@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}

The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
several points in the lifecycle that can be used to modify the `nlp` object.

### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}

> #### Example config
>
> ```ini
> [initialize.before_init]
> @callbacks = "spacy.copy_from_base_model.v1"
> tokenizer = "en_core_sci_md"
> vocab = "en_core_sci_md"
> ```

Copy the tokenizer and/or vocab from the specified models. It's similar to the
v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
combination with
[sourced components](/usage/processing-pipelines#sourced-components) when
fine-tuning an existing pipeline. The vocab includes the lookups and the
vectors from the specified model. Intended for use in
`[initialize.before_init]`.

| Name        | Description                                                                                                               |
| ----------- | ------------------------------------------------------------------------------------------------------------------------- |
| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~                                             |
| `vocab`     | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~     |
| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~    |
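
As a rough sketch of the same setup driven from Python (assuming an installed
package such as `en_core_web_md` as the base model), the callback can be set on
the config and is then run by [`nlp.initialize`](/api/language#initialize)
before the pipeline components are initialized:

```python
import spacy

nlp = spacy.blank("en")
# Equivalent of the [initialize.before_init] block above, set in Python;
# "en_core_web_md" is only an example package name
nlp.config["initialize"]["before_init"] = {
    "@callbacks": "spacy.copy_from_base_model.v1",
    "tokenizer": "en_core_web_md",
    "vocab": "en_core_web_md",
}
nlp.initialize()  # copies the tokenizer settings and vocab before training
```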

## Training data and alignment {#gold source="spacy/training"}

### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}