Add initialize.before_init and after_init callbacks

Add `initialize.before_init` and `initialize.after_init` callbacks to
the config. The `initialize.before_init` callback is a place to
implement one-time tokenizer customizations that are then saved with the
model.
Adriane Boyd 2021-01-12 11:29:31 +01:00
parent ad43cbb042
commit a45d89f09a
6 changed files with 165 additions and 15 deletions
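In config terms, the two new settings default to `null` (see the `default_config.cfg` hunk below) and can each point at a registered callback. A sketch of the resulting usage, borrowing the `customize_tokenizer` name from the docs changes further down:

```ini
[initialize]

[initialize.before_init]
@callbacks = "customize_tokenizer"
```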

spacy/default_config.cfg

@@ -124,3 +124,5 @@ lookups = null
 tokenizer = {}
 # Arguments for initialize methods of the components (keyed by component)
 components = {}
+before_init = null
+after_init = null

spacy/language.py

@@ -1209,6 +1209,9 @@ class Language:
         config = self.config.interpolate()
         # These are the settings provided in the [initialize] block in the config
         I = registry.resolve(config["initialize"], schema=ConfigSchemaInit)
+        before_init = I["before_init"]
+        if before_init is not None:
+            before_init(self)
         init_vocab(
             self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
         )
@@ -1240,6 +1243,9 @@ class Language:
             self._optimizer = sgd
         elif self._optimizer is None:
             self._optimizer = self.create_optimizer()
+        after_init = I["after_init"]
+        if after_init is not None:
+            after_init(self)
         return self._optimizer
 
     def resume_training(self, *, sgd: Optional[Optimizer] = None) -> Optimizer:
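Read together, the two hunks above bracket `Language.initialize`: `before_init` fires before `init_vocab` and component initialization, and `after_init` fires last, once the optimizer exists. A minimal end-to-end sketch of that behavior (the `demo_*` registry names are invented for illustration):

```python
import spacy
from spacy.util import registry

calls = []

@registry.callbacks("demo_before_init")
def make_before_init():
    def before_init(nlp):
        calls.append("before_init")  # runs before init_vocab and the components
        return nlp
    return before_init

@registry.callbacks("demo_after_init")
def make_after_init():
    def after_init(nlp):
        calls.append("after_init")  # runs after the optimizer is set up
        return nlp
    return after_init

nlp = spacy.blank("en")
nlp.config["initialize"]["before_init"] = {"@callbacks": "demo_before_init"}
nlp.config["initialize"]["after_init"] = {"@callbacks": "demo_after_init"}
nlp.initialize()
assert calls == ["before_init", "after_init"]
```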

spacy/schemas.py

@@ -369,6 +369,8 @@ class ConfigSchemaInit(BaseModel):
     init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
     tokenizer: Dict[StrictStr, Any] = Field(..., help="Arguments to be passed into Tokenizer.initialize")
     components: Dict[StrictStr, Dict[StrictStr, Any]] = Field(..., help="Arguments for TrainablePipe.initialize methods of pipeline components, keyed by component")
+    before_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object before initialization")
+    after_init: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after initialization")
     # fmt: on
 
     class Config:
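Per the schema above, a valid value for either field is any callable that takes the `nlp` object and returns it. A minimal sketch of a conforming callback:

```python
from spacy.language import Language

def before_init(nlp: Language) -> Language:
    # modify the nlp object in place, then hand it back
    nlp.meta["customized"] = True
    return nlp
```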

spacy/tests/test_language.py

@@ -166,6 +166,8 @@ def test_language_from_config_before_after_init():
     ran_before = False
     ran_after = False
     ran_after_pipeline = False
+    ran_before_init = False
+    ran_after_init = False
 
     @registry.callbacks(f"{name}_before")
     def make_before_creation():
@@ -205,6 +207,26 @@ def test_language_from_config_before_after_init():
 
         return after_pipeline_creation
 
+    @registry.callbacks(f"{name}_before_init")
+    def make_before_init():
+        def before_init(nlp):
+            nonlocal ran_before_init
+            ran_before_init = True
+            nlp.meta["before_init"] = "before"
+            return nlp
+
+        return before_init
+
+    @registry.callbacks(f"{name}_after_init")
+    def make_after_init():
+        def after_init(nlp):
+            nonlocal ran_after_init
+            ran_after_init = True
+            nlp.meta["after_init"] = "after"
+            return nlp
+
+        return after_init
+
     config = {
         "nlp": {
             "pipeline": ["sentencizer"],
@@ -213,14 +235,23 @@ def test_language_from_config_before_after_init():
             "after_pipeline_creation": {"@callbacks": f"{name}_after_pipeline"},
         },
         "components": {"sentencizer": {"factory": "sentencizer"}},
+        "initialize": {
+            "before_init": {"@callbacks": f"{name}_before_init"},
+            "after_init": {"@callbacks": f"{name}_after_init"},
+        },
     }
     nlp = English.from_config(config)
-    assert all([ran_before, ran_after, ran_after_pipeline])
     assert nlp.Defaults.foo == "bar"
     assert nlp.meta["foo"] == "bar"
     assert nlp.meta["bar"] == "baz"
+    assert "before_init" not in nlp.meta
+    assert "after_init" not in nlp.meta
     assert nlp.pipe_names == ["sentencizer"]
     assert nlp("text")
+    nlp.initialize()
+    assert nlp.meta["before_init"] == "before"
+    assert nlp.meta["after_init"] == "after"
+    assert all([ran_before, ran_after, ran_after_pipeline, ran_before_init, ran_after_init])
 
 
 def test_language_from_config_before_after_init_invalid():
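Assuming the test lives in `spacy/tests/test_language.py`, it can be run on its own with pytest's keyword filter:

```cli
$ python -m pytest spacy/tests/test_language.py -k before_after_init
```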

website/docs/usage/training.md

@@ -611,14 +611,16 @@ subclass and language data from scratch it's often enough to make a few small
 modifications, like adjusting the
 [tokenization rules](/usage/linguistic-features#native-tokenizer-additions) or
 [language defaults](/api/language#defaults) like stop words. The config lets you
-provide three optional **callback functions** that give you access to the
+provide five optional **callback functions** that give you access to the
 language class and `nlp` object at different points of the lifecycle:
 
-| Callback                  | Description |
-| ------------------------- | ----------- |
-| `before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults). |
-| `after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. Useful for modifying the tokenizer. |
-| `after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| Callback                      | Description |
+| ----------------------------- | ----------- |
+| `nlp.before_creation`         | Called before the `nlp` object is created and receives the language subclass like `English` (not the instance). Useful for writing to the [`Language.Defaults`](/api/language#defaults) aside from the tokenizer settings. |
+| `nlp.after_creation`          | Called right after the `nlp` object is created, but before the pipeline components are added to the pipeline and receives the `nlp` object. |
+| `nlp.after_pipeline_creation` | Called right after the pipeline components are created and added and receives the `nlp` object. Useful for modifying pipeline components. |
+| `initialize.before_init`      | Called before the pipeline components are initialized and receives the `nlp` object for in-place modification. Useful for modifying the tokenizer settings, similar to the v2 base model option. |
+| `initialize.after_init`       | Called after the pipeline components are initialized and receives the `nlp` object for in-place modification. |
 
 The `@spacy.registry.callbacks` decorator lets you register your custom function
 in the `callbacks` [registry](/api/top-level#registry) under a given name. You
@@ -626,8 +628,8 @@ can then reference the function in a config block using the `@callbacks` key. If
 a block contains a key starting with an `@`, it's interpreted as a reference to
 a function. Because you've registered the function, spaCy knows how to create it
 when you reference `"customize_language_data"` in your config. Here's an example
-of a callback that runs before the `nlp` object is created and adds a few custom
-tokenization rules to the defaults:
+of a callback that runs before the `nlp` object is created and adds a custom
+stop word to the defaults:
 
 > #### config.cfg
 >
@@ -643,7 +645,7 @@ import spacy
 @spacy.registry.callbacks("customize_language_data")
 def create_callback():
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
+        lang_cls.Defaults.stop_words.add("good")
         return lang_cls
 
     return customize_language_data
@@ -674,17 +676,16 @@ we're adding the arguments `extra_stop_words` (a list of strings) and `debug`
 > ```
 
 ```python
-### functions.py {highlight="5,8-10"}
+### functions.py {highlight="5,7-9"}
 from typing import List
 import spacy
 
 @spacy.registry.callbacks("customize_language_data")
 def create_callback(extra_stop_words: List[str] = [], debug: bool = False):
     def customize_language_data(lang_cls):
-        lang_cls.Defaults.suffixes = lang_cls.Defaults.suffixes + (r"-+$",)
-        lang_cls.Defaults.stop_words.add(extra_stop_words)
+        lang_cls.Defaults.stop_words.update(extra_stop_words)
         if debug:
-            print("Updated stop words and tokenizer suffixes")
+            print("Updated stop words")
         return lang_cls
 
     return customize_language_data
@@ -715,6 +716,65 @@ to your Python file. Before loading the config, spaCy will import the
 $ python -m spacy train config.cfg --output ./output --code ./functions.py
 ```
 
+#### Example: Modifying tokenizer settings {#custom-tokenizer}
+
+Use the `initialize.before_init` callback to modify the tokenizer settings when
+training a new pipeline. Write a registered callback that modifies the tokenizer
+settings and specify this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+Because this callback is only called in the one-time initialization step before
+training, the callback code does not need to be packaged with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
+<Infobox variant="warning" title="nlp.before_creation vs. initialize.before_init">
+
+- `nlp.before_creation` is the best place to modify language defaults other than
+  the tokenizer settings.
+- `initialize.before_init` is the best place to modify tokenizer settings when
+  training a new pipeline.
+
+Unlike the other language defaults, the tokenizer settings are saved with the
+pipeline with `nlp.to_disk()`, so modifications made in `nlp.before_creation`
+will be clobbered by the saved settings when the trained pipeline is loaded from
+disk.
+
+</Infobox>
+
 #### Example: Custom logging function {#custom-logging}
 
 During training, the results of each step are passed to a logger function. By
@@ -1060,7 +1120,7 @@ In this example we assume a custom function `read_custom_data` which loads or
 generates texts with relevant text classification annotations. Then, small
 lexical variations of the input text are created before generating the final
 [`Example`](/api/example) objects. The `@spacy.registry.readers` decorator lets
-you register the function creating the custom reader in the `readers`
+you register the function creating the custom reader in the `readers`
 [registry](/api/top-level#registry) and assign it a string name, so it can be
 used in your config. All arguments on the registered function become available
 as **config settings**, in this case `source`.
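As a hedged sketch of the reader this paragraph describes (the registry name and the `load_data` and `make_variants` helpers are placeholders, not spaCy APIs):

```python
from typing import Callable, Dict, Iterable, List, Tuple
import spacy
from spacy.language import Language
from spacy.training import Example

def load_data(source: str) -> List[Tuple[str, Dict[str, float]]]:
    # placeholder loader: real code would read annotated texts from `source`
    return [("This is great", {"POSITIVE": 1.0, "NEGATIVE": 0.0})]

def make_variants(text: str) -> Iterable[str]:
    # placeholder augmentation: real code would create small lexical variants
    yield text
    yield text.lower()

@spacy.registry.readers("read_custom_data.v1")
def create_reader(source: str) -> Callable[[Language], Iterable[Example]]:
    def reader(nlp: Language) -> Iterable[Example]:
        for text, cats in load_data(source):
            for variant in make_variants(text):
                doc = nlp.make_doc(variant)
                yield Example.from_dict(doc, {"cats": cats})
    return reader
```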

website/docs/usage/v3.md

@@ -930,6 +930,55 @@ treebank.
 
 </Project>
 
+#### Modifying tokenizer settings
+
+If you were using a base model with `spacy train` to customize the tokenizer
+settings in v2, your modifications can be provided in the
+`[initialize.before_init]` callback.
+
+Write a registered callback that modifies the tokenizer settings and specify
+this callback in your config:
+
+> #### config.cfg
+>
+> ```ini
+> [initialize]
+>
+> [initialize.before_init]
+> @callbacks = "customize_tokenizer"
+> ```
+
+```python
+### functions.py
+from spacy.util import registry, compile_suffix_regex
+
+@registry.callbacks("customize_tokenizer")
+def make_customize_tokenizer():
+    def customize_tokenizer(nlp):
+        # remove a suffix
+        suffixes = list(nlp.Defaults.suffixes)
+        suffixes.remove("\\[")
+        suffix_regex = compile_suffix_regex(suffixes)
+        nlp.tokenizer.suffix_search = suffix_regex.search
+        # add a special case
+        nlp.tokenizer.add_special_case("_SPECIAL_", [{"ORTH": "_SPECIAL_"}])
+
+    return customize_tokenizer
+```
+
+When training, provide the function above with the `--code` option:
+
+```cli
+$ python -m spacy train config.cfg --code ./functions.py
+```
+
+The train step requires the `--code` option with your registered functions from
+the `[initialize]` block, but since those callbacks are only required during the
+initialization step, you don't need to provide them with the final pipeline
+package. However, to make it easier for others to replicate your training setup,
+you can choose to package the initialization callbacks with the pipeline package
+or to publish them separately.
+
 #### Training via the Python API {#migrating-training-python}
 
 For most use cases, you **shouldn't** have to write your own training scripts