mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Add callback to copy vocab/tokenizer from model (#7750)
* Add callback to copy vocab/tokenizer from model Add callback `spacy.copy_from_base_model.v1` to copy the tokenizer settings and/or vocab (including vectors) from a base model. * Move spacy.copy_from_base_model.v1 to spacy.training.callbacks * Add documentation * Modify to specify model as tokenizer and vocab params
This commit is contained in:
parent
f68fc29130
commit
bdb485cc80
|
@ -501,6 +501,9 @@ class Errors:
|
|||
# Error-message templates. Adjacent string literals are concatenated at
# compile time, so each entry below is a single message string.
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")

# New errors added in v3.x
E872 = ("Unable to copy tokenizer from base model due to different "
        'tokenizer settings: current tokenizer config "{curr_config}" '
        'vs. base model "{base_config}"')
E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
        "'{text}'. This is likely a bug in spaCy, so feel free to open an "
        "issue: https://github.com/explosion/spaCy/issues")
|
||||
|
|
|
@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
|||
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
||||
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
||||
from .loggers import console_logger, wandb_logger # noqa: F401
|
||||
from .callbacks import create_copy_from_base_model # noqa: F401
|
||||
|
|
32
spacy/training/callbacks.py
Normal file
32
spacy/training/callbacks.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
from typing import Callable, Optional

from ..errors import Errors
from ..language import Language
from ..util import load_model, registry, logger


@registry.callbacks("spacy.copy_from_base_model.v1")
def create_copy_from_base_model(
    tokenizer: Optional[str] = None,
    vocab: Optional[str] = None,
) -> Callable[["Language"], None]:
    """Create a callback that copies the tokenizer settings and/or the vocab
    (including lookups and vectors) from the specified pipelines into the
    current nlp object. Intended for use in [initialize.before_init].

    tokenizer (Optional[str]): The pipeline to copy the tokenizer from.
        Defaults to None (tokenizer is left untouched).
    vocab (Optional[str]): The pipeline to copy the vocab from. Defaults to
        None (vocab is left untouched).
    RETURNS (Callable[[Language], None]): The callback. It raises ValueError
        (Errors.E872) if the base model uses a different tokenizer config.
    """

    def copy_from_base_model(nlp: Language) -> None:
        if tokenizer:
            # Lazy %-style args: the message is only formatted if emitted.
            logger.info("Copying tokenizer from: %s", tokenizer)
            base_nlp = load_model(tokenizer)
            # Tokenizer bytes are only portable between pipelines that use
            # the same tokenizer factory/config.
            if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
            else:
                raise ValueError(
                    Errors.E872.format(
                        curr_config=nlp.config["nlp"]["tokenizer"],
                        base_config=base_nlp.config["nlp"]["tokenizer"],
                    )
                )
        if vocab:
            logger.info("Copying vocab from: %s", vocab)
            # Only reload if the vocab comes from a different model than the
            # one already loaded for the tokenizer above.
            if tokenizer != vocab:
                base_nlp = load_model(vocab)
            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())

    return copy_from_base_model
|
|
@ -8,6 +8,7 @@ menu:
|
|||
- ['Readers', 'readers']
|
||||
- ['Batchers', 'batchers']
|
||||
- ['Augmenters', 'augmenters']
|
||||
- ['Callbacks', 'callbacks']
|
||||
- ['Training & Alignment', 'gold']
|
||||
- ['Utility Functions', 'util']
|
||||
---
|
||||
|
@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
|
|||
| `level` | The percentage of texts that will be augmented. ~~float~~ |
|
||||
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
||||
|
||||
## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}
|
||||
|
||||
The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
several points in the lifecycle that can be used to modify the `nlp` object.
|
||||
|
||||
### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}
|
||||
|
||||
> #### Example config
|
||||
>
|
||||
> ```ini
|
||||
> [initialize.before_init]
|
||||
> @callbacks = "spacy.copy_from_base_model.v1"
|
||||
> tokenizer = "en_core_sci_md"
|
||||
> vocab = "en_core_sci_md"
|
||||
> ```
|
||||
|
||||
Copy the tokenizer and/or vocab from the specified models. It's similar to the
|
||||
v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
|
||||
combination with
|
||||
[sourced components](/usage/processing-pipelines#sourced-components) when
|
||||
fine-tuning an existing pipeline. The vocab includes the lookups and the vectors
|
||||
from the specified model. Intended for use in `[initialize.before_init]`.
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~ |
|
||||
| `vocab` | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~ |
|
||||
| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ |
|
||||
|
||||
## Training data and alignment {#gold source="spacy/training"}
|
||||
|
||||
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
||||
|
|
Loading…
Reference in New Issue
Block a user