diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg
index c9f82caa0..0f7226083 100644
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null
diff --git a/spacy/language.py b/spacy/language.py
index f695ddc9e..91f4b99d4 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer
 
     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3ea611287..d041845f3 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on
 
     class Config:
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 4fbcaee9e..6ffeeadce 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False
 
     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@
 
         return after_pipeline_creation
 
+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
"baz" + assert "before_init" not in nlp.meta + assert "after_init" not in nlp.meta assert nlp.pipe_names == ["sentencizer"] assert nlp("text") + nlp.initialize() + assert nlp.meta["before_init"] == "before" + assert nlp.meta["after_init"] == "after" + assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init]) def test_language_from_config_before_after_init_invalid(): diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 3447e7b1a..ad5bec92a 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -611,14 +611,16 @@ subclass and language data from scratch – it's often enough to make a few smal modifications, like adjusting the [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or [language defaults](/api/language#defaults) like stop words. The config lets you -provide three optional **callback functions** that give you access to the +provide five optional **callback functions** that give you access to the language class and `nlp` object at different points of the lifecycle: -| Callback | Description | -| ------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). | -| `after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. | -| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. | +| Callback | Description | +| ----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp.before_creation` | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. | +| `nlp.after_creation` | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. | +| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. | +| `initialize.before_init` | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. | +| `initialize.after_init` | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. | The `@spacy.registry.callbacks` decorator lets you register your custom function in the `callbacks` [registry](/api/top-level#registry) under a given name. You @@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If a block contains a key starting with an `@`, it's interpreted as a reference to a function. 
 to a function. Because you've registered the function, spaCy knows how to create
 it when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:
 
 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls
 
     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```
 
 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls
 
     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}
 
 During training, the results of each step are passed to a logger function. By
@@ -1060,7 +1120,7 @@ In this example we assume a custom function `read_custom_data` which loads or
 generates texts with relevant text classification annotations. Then, small
 lexical variations of the input text are created before generating the final
 [`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
-you register the function creating the custom reader in the `readers` 
+you register the function creating the custom reader in the `readers`
 [registry](/api/top-level#registry) and assign it a string name, so it can be
 used in your config. All arguments on the registered function become available
 as **config settings** – in this case, `source`.
diff --git a/website/docs/usage/v3.md b/website/docs/usage/v3.md
index 47ddcf53a..9b911b960 100644
--- a/website/docs/usage/v3.md
+++ b/website/docs/usage/v3.md
@@ -930,6 +930,55 @@ treebank.
 
 </Infobox>
 
+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}
 
 For most use cases, you **shouldn't** have to write your own training scripts
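For reviewers who want to exercise the new hooks end to end, here's a minimal usage sketch modeled on the test added above. The registry name `demo_before_init` and the meta key `before_init_ran` are illustrative placeholders, not part of this diff:

```python
# Minimal sketch of the new [initialize] callbacks, modeled on the test in
# spacy/tests/test_language.py above. Assumes this branch is installed; the
# name "demo_before_init" is hypothetical.
from spacy.lang.en import English
from spacy.util import registry

@registry.callbacks("demo_before_init")
def make_before_init():
    def before_init(nlp):
        # Runs at the start of Language.initialize(), before init_vocab()
        # and before any pipeline components are initialized.
        nlp.meta["before_init_ran"] = True
        return nlp

    return before_init

config = {
    "nlp": {"pipeline": ["sentencizer"]},
    "components": {"sentencizer": {"factory": "sentencizer"}},
    "initialize": {"before_init": {"@callbacks": "demo_before_init"}},
}

nlp = English.from_config(config)
assert "before_init_ran" not in nlp.meta  # nothing fires at construction time
nlp.initialize()  # before_init (and after_init, if set) fire here
assert nlp.meta["before_init_ran"]
```

Since `before_init` runs before the vocab and components are initialized, it's the natural hook for tokenizer tweaks that should apply to all training data, while `after_init` sees the fully initialized pipeline.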