Add initialize.before_init and after_init callbacks

Add `initialize.before_init` and `initialize.after_init` callbacks to the config. The `initialize.before_init` callback is a place to implement one-time tokenizer customizations that are then saved with the model.
2025-12-22 17:43:13 +03:00 · 2021-01-12 11:29:31 +01:00 · 2021-01-12 11:29:31 +01:00 · a45d89f09a
commit a45d89f09a
parent ad43cbb042
6 changed files with 165 additions and 15 deletions
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1209,6 +1209,9 @@ class Language:
        config = self.config.interpolate()
        # These are the settings provided in the [initialize] block in the config
        I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
        init_vocab(
            self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
        )
@ -1240,6 +1243,9 @@ class Language:
            self._optimizer = sgd
        elif self._optimizer is None:
            self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
        return self._optimizer

    def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
    components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
    # fmt: on

    class Config:
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
    ran_before = False
    ran_after = False
    ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False

    @registry.callbacks(f"{name}_before")
    def make_before_creation():
@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():

        return after_pipeline_creation

+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
    config = {
        "nlp": {
            "pipeline": ["sentencizer"],
@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
            "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
        },
        "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
    }
    nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
    assert nlp.Defaults.foo == "bar"
    assert nlp.meta["foo"] == "bar"
    assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
    assert nlp.pipe_names == ["sentencizer"]
    assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])


 def test_language_from_config_before_after_init_invalid():
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:

 | Callback                      | Description                                                                                                                                                                                                                |
-| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer.          |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                |
+| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object.                                                                                |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components.                                                                                  |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option.                           |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification.                                                                                                              |

 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:

 > #### config.cfg
 >
@ -643,7 +645,7 @@ import spacy
@spacy.registry.callbacks("customize_language_data")
 def create_callback():
    def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
        return lang_cls

    return customize_language_data
@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```

 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy

@spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
    def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
        if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
        return lang_cls

    return customize_language_data
@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```

+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be included in the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline by `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be overwritten by the saved settings when the trained pipeline is loaded
+from disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}

 During training, the results of each step are passed to a logger function. By
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@ -930,6 +930,55 @@ treebank.

 </Project>

+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option so that spaCy can find the
+registered functions referenced in the `[initialize]` block. Because those
+callbacks are only needed during the initialization step, you don't need to
+include them in the final pipeline package. However, to make it easier for
+others to replicate your training setup, you can choose to package the
+initialization callbacks with the pipeline package or to publish them
+separately.
+
 #### Training via the Python API {#migrating-training-python}

 For most use cases, you **shouldn't** have to write your own training scripts