Add callback to copy vocab/tokenizer from model (#7750)

* Add callback to copy vocab/tokenizer from model

Add callback `spacy.copy_from_base_model.v1` to copy the tokenizer
settings and/or vocab (including vectors) from a base model.

* Move spacy.copy_from_base_model.v1 to spacy.training.callbacks

* Add documentation

* Modify to specify model as tokenizer and vocab params
Adriane Boyd 2021-04-22 12:36:50 +02:00 committed by GitHub
parent f68fc29130
commit bdb485cc80
4 changed files with 66 additions and 0 deletions

spacy/errors.py

@@ -501,6 +501,9 @@ class Errors:
    E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
    # New errors added in v3.x
    E872 = ("Unable to copy tokenizer from base model due to different "
            'tokenizer settings: current tokenizer config "{curr_config}" '
            'vs. base model "{base_config}"')
    E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
            "'{text}'. This is likely a bug in spaCy, so feel free to open an "
            "issue: https://github.com/explosion/spaCy/issues")

spacy/training/__init__.py

@@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
from .loggers import console_logger, wandb_logger # noqa: F401
from .callbacks import create_copy_from_base_model # noqa: F401

spacy/training/callbacks.py

@@ -0,0 +1,32 @@
from typing import Callable, Optional

from ..errors import Errors
from ..language import Language
from ..util import load_model, registry, logger


@registry.callbacks("spacy.copy_from_base_model.v1")
def create_copy_from_base_model(
    tokenizer: Optional[str] = None,
    vocab: Optional[str] = None,
) -> Callable[[Language], None]:
    def copy_from_base_model(nlp):
        if tokenizer:
            logger.info(f"Copying tokenizer from: {tokenizer}")
            base_nlp = load_model(tokenizer)
            if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
            else:
                raise ValueError(
                    Errors.E872.format(
                        curr_config=nlp.config["nlp"]["tokenizer"],
                        base_config=base_nlp.config["nlp"]["tokenizer"],
                    )
                )
        if vocab:
            logger.info(f"Copying vocab from: {vocab}")
            # only reload if the vocab is from a different model
            if tokenizer != vocab:
                base_nlp = load_model(vocab)
            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())
    return copy_from_base_model
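
A minimal usage sketch (not part of the commit): the factory can also be called directly and applied to an existing pipeline. The package name `en_core_web_md` is only illustrative; any installed pipeline whose tokenizer settings match can serve as the base model.

```python
import spacy
from spacy.training.callbacks import create_copy_from_base_model

# Build the callback and apply it to a blank English pipeline
nlp = spacy.blank("en")
copy_from_base = create_copy_from_base_model(
    tokenizer="en_core_web_md", vocab="en_core_web_md"
)
copy_from_base(nlp)  # nlp now uses the base model's tokenizer settings and vocab
```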

website/docs/api/top-level.md

@@ -8,6 +8,7 @@ menu:
  - ['Readers', 'readers']
  - ['Batchers', 'batchers']
  - ['Augmenters', 'augmenters']
  - ['Callbacks', 'callbacks']
  - ['Training & Alignment', 'gold']
  - ['Utility Functions', 'util']
---
@@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
| `level` | The percentage of texts that will be augmented. ~~float~~ |
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}

The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
several points in the lifecycle that can be used to modify the `nlp` object.

### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}

> #### Example config
>
> ```ini
> [initialize.before_init]
> @callbacks = "spacy.copy_from_base_model.v1"
> tokenizer = "en_core_sci_md"
> vocab = "en_core_sci_md"
> ```

Copy the tokenizer and/or vocab from the specified models. It's similar to the
v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
combination with
[sourced components](/usage/processing-pipelines#sourced-components) when
fine-tuning an existing pipeline. The vocab includes the lookups and the
vectors from the specified model. Intended for use in
`[initialize.before_init]`.

| Name        | Description                                                                                                               |
| ----------- | ------------------------------------------------------------------------------------------------------------------------- |
| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~                                             |
| `vocab`     | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~     |
| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~    |
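
As a rough sketch of the same setup driven from Python (assuming an installed
package such as `en_core_web_md` as the base model), the callback can be set on
the config and is then run by [`nlp.initialize`](/api/language#initialize)
before the pipeline components are initialized:

```python
import spacy

nlp = spacy.blank("en")
# Equivalent of the [initialize.before_init] block above, set in Python;
# "en_core_web_md" is only an example package name
nlp.config["initialize"]["before_init"] = {
    "@callbacks": "spacy.copy_from_base_model.v1",
    "tokenizer": "en_core_web_md",
    "vocab": "en_core_web_md",
}
nlp.initialize()  # copies the tokenizer settings and vocab before training
```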

## Training data and alignment {#gold source="spacy/training"}

### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}