mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-13 09:42:26 +03:00
Move merge to independent command
This commit is contained in:
parent
9d0ae2407b
commit
03a0c2badc
|
@ -29,8 +29,9 @@ from .project.push import project_push # noqa: F401
|
||||||
from .project.pull import project_pull # noqa: F401
|
from .project.pull import project_pull # noqa: F401
|
||||||
from .project.document import project_document # noqa: F401
|
from .project.document import project_document # noqa: F401
|
||||||
from .find_threshold import find_threshold # noqa: F401
|
from .find_threshold import find_threshold # noqa: F401
|
||||||
from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401
|
from .configure import use_tok2vec, use_transformer # noqa: F401
|
||||||
from .configure import configure_resume_cli # noqa: F401
|
from .configure import configure_resume_cli # noqa: F401
|
||||||
|
from .merge import merge_pipelines # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
|
||||||
|
|
|
@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config:
|
||||||
msg.good("Saved config", output_file)
|
msg.good("Saved config", output_file)
|
||||||
|
|
||||||
return nlp.config
|
return nlp.config
|
||||||
|
|
||||||
|
|
||||||
def _inner_merge(
|
|
||||||
nlp: Language, nlp2: Language, replace_listeners: bool = False
|
|
||||||
) -> Language:
|
|
||||||
"""Actually do the merge.
|
|
||||||
|
|
||||||
nlp: Base pipeline to add components to.
|
|
||||||
nlp2: Pipeline to add components from.
|
|
||||||
replace_listeners (bool): Whether to replace listeners. Usually only true
|
|
||||||
if there's one listener.
|
|
||||||
returns: assembled pipeline.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# we checked earlier, so there's definitely just one
|
|
||||||
tok2vec_name = _get_tok2vecs(nlp2.config)[0]
|
|
||||||
rename = _check_pipeline_names(nlp, nlp2)
|
|
||||||
|
|
||||||
if len(_get_listeners(nlp2)) > 1:
|
|
||||||
if replace_listeners:
|
|
||||||
msg.warn(
|
|
||||||
"""
|
|
||||||
Replacing listeners for multiple components. Note this can make
|
|
||||||
your pipeline large and slow. Consider chaining pipelines (like
|
|
||||||
nlp2(nlp(text))) instead.
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
# TODO provide a guide for what to do here
|
|
||||||
msg.warn(
|
|
||||||
"""
|
|
||||||
The result of this merge will have two feature sources
|
|
||||||
(tok2vecs) and multiple listeners. This will work for
|
|
||||||
inference, but will probably not work when training without
|
|
||||||
extra adjustment. If you continue to train the pipelines
|
|
||||||
separately this is not a problem.
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
|
|
||||||
for comp in nlp2.pipe_names:
|
|
||||||
if replace_listeners and comp == tok2vec_name:
|
|
||||||
# the tok2vec should not be copied over
|
|
||||||
continue
|
|
||||||
if replace_listeners and _has_listener(nlp2, comp):
|
|
||||||
nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
|
|
||||||
nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
|
|
||||||
if comp in rename:
|
|
||||||
msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
|
|
||||||
return nlp
|
|
||||||
|
|
||||||
|
|
||||||
@configure_cli.command("merge")
|
|
||||||
def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
|
|
||||||
"""Combine components from multiple pipelines."""
|
|
||||||
nlp = spacy.load(base_model)
|
|
||||||
nlp2 = spacy.load(added_model)
|
|
||||||
|
|
||||||
# to merge models:
|
|
||||||
# - lang must be the same
|
|
||||||
# - vectors must be the same
|
|
||||||
# - vocabs must be the same
|
|
||||||
# - tokenizer must be the same (only partially checkable)
|
|
||||||
if nlp.lang != nlp2.lang:
|
|
||||||
msg.fail("Can't merge - languages don't match", exits=1)
|
|
||||||
|
|
||||||
# check vector equality
|
|
||||||
if (
|
|
||||||
nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
|
|
||||||
or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
|
|
||||||
or nlp.vocab.vectors.to_bytes(exclude=["strings"])
|
|
||||||
!= nlp2.vocab.vectors.to_bytes(exclude=["strings"])
|
|
||||||
):
|
|
||||||
msg.fail("Can't merge - vectors don't match", exits=1)
|
|
||||||
|
|
||||||
if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
|
|
||||||
msg.fail("Can't merge - tokenizers don't match", exits=1)
|
|
||||||
|
|
||||||
# Check that each pipeline only has one feature source
|
|
||||||
_check_single_tok2vec(base_model, nlp.config)
|
|
||||||
_check_single_tok2vec(added_model, nlp2.config)
|
|
||||||
|
|
||||||
# Check how many listeners there are and replace based on that
|
|
||||||
# TODO: option to recognize frozen tok2vecs
|
|
||||||
# TODO: take list of pipe names to copy, ignore others
|
|
||||||
listeners = _get_listeners(nlp2)
|
|
||||||
replace_listeners = len(listeners) == 1
|
|
||||||
nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
|
|
||||||
|
|
||||||
# write the final pipeline
|
|
||||||
nlp.to_disk(output_file)
|
|
||||||
msg.info(f"Saved pipeline to: {output_file}")
|
|
||||||
|
|
||||||
return nlp
|
|
||||||
|
|
108
spacy/cli/merge.py
Normal file
108
spacy/cli/merge.py
Normal file
|
@ -0,0 +1,108 @@
|
||||||
|
from pathlib import Path
|
||||||
|
from wasabi import msg
|
||||||
|
|
||||||
|
import spacy
|
||||||
|
from spacy.language import Language
|
||||||
|
|
||||||
|
from ._util import app, Arg, Opt
|
||||||
|
from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs
|
||||||
|
from .configure import _check_pipeline_names, _has_listener
|
||||||
|
|
||||||
|
|
||||||
|
def _inner_merge(
    nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
    """Copy the components of one pipeline into another.

    Both arguments are modified: components are added to `nlp`, and when
    listeners are replaced `replace_listeners` is called on `nlp2` before the
    component is sourced from it.

    nlp (Language): Base pipeline to add components to.
    nlp2 (Language): Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
        if there's one listener. When true, the tok2vec of `nlp2` itself is
        not copied over.
    RETURNS (Language): The assembled pipeline (the same object as `nlp`).
    """
    # we checked earlier, so there's definitely just one
    # NOTE(review): this relies on the caller having validated both pipelines
    # (merge_pipelines runs _check_single_tok2vec first); with no tok2vec in
    # nlp2 the [0] lookup would presumably raise IndexError - confirm.
    tok2vec_name = _get_tok2vecs(nlp2.config)[0]
    # Maps names of nlp2 components to the replacement name used in nlp;
    # components missing from the mapping keep their name.
    rename = _check_pipeline_names(nlp, nlp2)

    if len(_get_listeners(nlp2)) > 1:
        # The messages are built from adjacent single-line literals so that no
        # leading/trailing newline or hard line break ends up in the output.
        if replace_listeners:
            msg.warn(
                "Replacing listeners for multiple components. Note this can "
                "make your pipeline large and slow. Consider chaining "
                "pipelines (like nlp2(nlp(text))) instead."
            )
        else:
            # TODO provide a guide for what to do here
            msg.warn(
                "The result of this merge will have two feature sources "
                "(tok2vecs) and multiple listeners. This will work for "
                "inference, but will probably not work when training without "
                "extra adjustment. If you continue to train the pipelines "
                "separately this is not a problem."
            )

    for comp in nlp2.pipe_names:
        if replace_listeners and comp == tok2vec_name:
            # the tok2vec should not be copied over
            continue
        if replace_listeners and _has_listener(nlp2, comp):
            # Must happen before add_pipe so the sourced component no longer
            # refers to the tok2vec that is being left behind.
            nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
        new_name = rename.get(comp, comp)
        nlp.add_pipe(comp, source=nlp2, name=new_name)
        if comp in rename:
            msg.info(f"Renaming {comp} to {new_name} to avoid collision...")
    return nlp
|
||||||
|
|
||||||
|
|
||||||
|
@app.command("merge")
def merge_pipelines(
    # fmt: off
    base_model: str = Arg(..., help="Name or path of base model"),
    added_model: str = Arg(..., help="Name or path of model to be merged"),
    output_file: Path = Arg(..., help="Path to save merged model")
    # fmt: on
) -> Language:
    """Combine components from multiple pipelines.

    Loads both pipelines, checks that they are compatible, adds the
    components of the added pipeline to the base pipeline and saves the
    result. Exits with status 1 if the language, vectors or tokenizer
    settings of the two pipelines differ.
    """
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    # NOTE(review): nothing below compares the vocabs beyond their vectors.
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality: cheap comparisons first, full byte dump last
    vectors = nlp.vocab.vectors
    vectors2 = nlp2.vocab.vectors
    if (
        vectors.shape != vectors2.shape
        or vectors.key2row != vectors2.key2row
        or vectors.to_bytes(exclude=["strings"])
        != vectors2.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    # Only the tokenizer config blocks are compared, not the tokenizers' state
    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline; use the object returned by the merge rather
    # than relying on `nlp` having been modified in place
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")

    return nlp_out
|
|
@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list
|
||||||
from spacy.cli._util import substitute_project_variables
|
from spacy.cli._util import substitute_project_variables
|
||||||
from spacy.cli._util import validate_project_commands
|
from spacy.cli._util import validate_project_commands
|
||||||
from spacy.cli._util import upload_file, download_file
|
from spacy.cli._util import upload_file, download_file
|
||||||
from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines
|
from spacy.cli.configure import configure_resume_cli, use_tok2vec
|
||||||
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
|
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
|
||||||
from spacy.cli.debug_data import _get_labels_from_spancat
|
from spacy.cli.debug_data import _get_labels_from_spancat
|
||||||
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
|
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
|
||||||
|
@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics
|
||||||
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
from spacy.cli.debug_data import _get_spans_length_freq_dist
|
||||||
from spacy.cli.download import get_compatibility, get_version
|
from spacy.cli.download import get_compatibility, get_version
|
||||||
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
|
||||||
|
from spacy.cli.merge import merge_pipelines
|
||||||
from spacy.cli.package import get_third_party_dependencies
|
from spacy.cli.package import get_third_party_dependencies
|
||||||
from spacy.cli.package import _is_permitted_package_name
|
from spacy.cli.package import _is_permitted_package_name
|
||||||
from spacy.cli.project.remote_storage import RemoteStorage
|
from spacy.cli.project.remote_storage import RemoteStorage
|
||||||
|
|
|
@ -8,6 +8,7 @@ menu:
|
||||||
- ['validate', 'validate']
|
- ['validate', 'validate']
|
||||||
- ['init', 'init']
|
- ['init', 'init']
|
||||||
- ['configure', 'configure']
|
- ['configure', 'configure']
|
||||||
|
- ['merge', 'merge']
|
||||||
- ['convert', 'convert']
|
- ['convert', 'convert']
|
||||||
- ['debug', 'debug']
|
- ['debug', 'debug']
|
||||||
- ['train', 'train']
|
- ['train', 'train']
|
||||||
|
@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file]
|
||||||
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
|
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
|
||||||
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||||
|
|
||||||
### configure merge {id="configure-merge", tag="command"}
|
## merge {id="merge", tag="command"}
|
||||||
|
|
||||||
Take two pipelines and create a new one with components from both of them,
|
Take two pipelines and create a new one with components from both of them,
|
||||||
handling the configuration of listeners. Note that unlike other commands, this
|
handling the configuration of listeners. The output is a serialized pipeline.
|
||||||
produces a whole pipeline, not just a config.
|
|
||||||
|
|
||||||
Components in the final pipeline are in the same order as in the original
|
Components in the final pipeline are in the same order as in the original
|
||||||
pipelines, with the base pipeline first and the added pipeline after. Because
|
pipelines, with the base pipeline first and the added pipeline after. Because
|
||||||
|
@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or
|
||||||
another pipeline.
|
another pipeline.
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy configure tok2vec [base_model] [added_model] [output_file]
|
$ python -m spacy merge [base_model] [added_model] [output_file]
|
||||||
```
|
```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user