mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Add callback to copy vocab/tokenizer from model (#7750)
* Add callback to copy vocab/tokenizer from model Add callback `spacy.copy_from_base_model.v1` to copy the tokenizer settings and/or vocab (including vectors) from a base model. * Move spacy.copy_from_base_model.v1 to spacy.training.callbacks * Add documentation * Modify to specify model as tokenizer and vocab params
This commit is contained in:
parent
f68fc29130
commit
bdb485cc80
|
@ -501,6 +501,9 @@ class Errors:
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E872 = ("Unable to copy tokenizer from base model due to different "
|
||||||
|
'tokenizer settings: current tokenizer config "{curr_config}" '
|
||||||
|
'vs. base model "{base_config}"')
|
||||||
E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
|
E873 = ("Unable to merge a span from doc.spans with key '{key}' and text "
|
||||||
"'{text}'. This is likely a bug in spaCy, so feel free to open an "
|
"'{text}'. This is likely a bug in spaCy, so feel free to open an "
|
||||||
"issue: https://github.com/explosion/spaCy/issues")
|
"issue: https://github.com/explosion/spaCy/issues")
|
||||||
|
|
|
@ -8,3 +8,4 @@ from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
||||||
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
from .gold_io import docs_to_json, read_json_file # noqa: F401
|
||||||
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
|
||||||
from .loggers import console_logger, wandb_logger # noqa: F401
|
from .loggers import console_logger, wandb_logger # noqa: F401
|
||||||
|
from .callbacks import create_copy_from_base_model # noqa: F401
|
||||||
|
|
32
spacy/training/callbacks.py
Normal file
32
spacy/training/callbacks.py
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
from typing import Callable, Optional
|
||||||
|
from ..errors import Errors
|
||||||
|
from ..language import Language
|
||||||
|
from ..util import load_model, registry, logger
|
||||||
|
|
||||||
|
|
||||||
|
@registry.callbacks("spacy.copy_from_base_model.v1")
def create_copy_from_base_model(
    tokenizer: Optional[str] = None,
    vocab: Optional[str] = None,
) -> Callable[[Language], None]:
    """Create a callback that copies the tokenizer and/or vocab from a base model.

    tokenizer (Optional[str]): Value handed to `load_model` to obtain the
        pipeline whose tokenizer is copied. Nothing is copied if falsy.
    vocab (Optional[str]): Value handed to `load_model` to obtain the
        pipeline whose vocab is copied. Nothing is copied if falsy.
    RETURNS (Callable[[Language], None]): A function that takes the current
        `nlp` object and modifies its tokenizer and/or vocab in place.
    """

    def copy_from_base_model(nlp: Language) -> None:
        """Overwrite the tokenizer and/or vocab of `nlp` from the base model(s).

        RAISES (ValueError): If the tokenizer config of `nlp` differs from the
            tokenizer config of the base model (error E872).
        """
        if tokenizer:
            logger.info(f"Copying tokenizer from: {tokenizer}")
            base_nlp = load_model(tokenizer)
            curr_config = nlp.config["nlp"]["tokenizer"]
            base_config = base_nlp.config["nlp"]["tokenizer"]
            # The serialized tokenizer is only loaded into the current one
            # when both pipelines declare the same tokenizer config.
            if curr_config != base_config:
                raise ValueError(
                    Errors.E872.format(
                        curr_config=curr_config,
                        base_config=base_config,
                    )
                )
            # The vocab is excluded here; it is handled separately below.
            nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
        if vocab:
            logger.info(f"Copying vocab from: {vocab}")
            # only reload if the vocab is from a different model
            if tokenizer != vocab:
                base_nlp = load_model(vocab)
            nlp.vocab.from_bytes(base_nlp.vocab.to_bytes())

    return copy_from_base_model
|
|
@ -8,6 +8,7 @@ menu:
|
||||||
- ['Readers', 'readers']
|
- ['Readers', 'readers']
|
||||||
- ['Batchers', 'batchers']
|
- ['Batchers', 'batchers']
|
||||||
- ['Augmenters', 'augmenters']
|
- ['Augmenters', 'augmenters']
|
||||||
|
- ['Callbacks', 'callbacks']
|
||||||
- ['Training & Alignment', 'gold']
|
- ['Training & Alignment', 'gold']
|
||||||
- ['Utility Functions', 'util']
|
- ['Utility Functions', 'util']
|
||||||
---
|
---
|
||||||
|
@ -785,6 +786,35 @@ useful for making the model less sensitive to capitalization.
|
||||||
| `level` | The percentage of texts that will be augmented. ~~float~~ |
|
| `level` | The percentage of texts that will be augmented. ~~float~~ |
|
||||||
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
| **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~ |
|
||||||
|
|
||||||
|
## Callbacks {#callbacks source="spacy/training/callbacks.py" new="3"}
|
||||||
|
|
||||||
|
The config supports [callbacks](/usage/training#custom-code-nlp-callbacks) at
|
||||||
|
several points in the lifecycle that can be used to modify the `nlp` object.
|
||||||
|
|
||||||
|
### spacy.copy_from_base_model.v1 {#copy_from_base_model tag="registered function"}
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [initialize.before_init]
|
||||||
|
> @callbacks = "spacy.copy_from_base_model.v1"
|
||||||
|
> tokenizer = "en_core_sci_md"
|
||||||
|
> vocab = "en_core_sci_md"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Copy the tokenizer and/or vocab from the specified models. It's similar to the
|
||||||
|
v2 [base model](https://v2.spacy.io/api/cli#train) option and useful in
|
||||||
|
combination with
|
||||||
|
[sourced components](/usage/processing-pipelines#sourced-components) when
|
||||||
|
fine-tuning an existing pipeline. The vocab includes the lookups and the vectors
|
||||||
|
from the specified model. Intended for use in `[initialize.before_init]`.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ----------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `tokenizer` | The pipeline to copy the tokenizer from. Defaults to `None`. ~~Optional[str]~~ |
|
||||||
|
| `vocab` | The pipeline to copy the vocab from. The vocab includes the lookups and vectors. Defaults to `None`. ~~Optional[str]~~ |
|
||||||
|
| **CREATES** | A function that takes the current `nlp` object and modifies its `tokenizer` and `vocab`. ~~Callable[[Language], None]~~ |
|
||||||
|
|
||||||
## Training data and alignment {#gold source="spacy/training"}
|
## Training data and alignment {#gold source="spacy/training"}
|
||||||
|
|
||||||
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user