diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 383548057..4041e01b6 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -29,8 +29,9 @@ from .project.push import project_push # noqa: F401 from .project.pull import project_pull # noqa: F401 from .project.document import project_document # noqa: F401 from .find_threshold import find_threshold # noqa: F401 -from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401 +from .configure import use_tok2vec, use_transformer # noqa: F401 from .configure import configure_resume_cli # noqa: F401 +from .merge import merge_pipelines # noqa: F401 @app.command("link", no_args_is_help=True, deprecated=True, hidden=True) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 884ba68fe..5f31ca0b4 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config: msg.good("Saved config", output_file) return nlp.config - - -def _inner_merge( - nlp: Language, nlp2: Language, replace_listeners: bool = False -) -> Language: - """Actually do the merge. - - nlp: Base pipeline to add components to. - nlp2: Pipeline to add components from. - replace_listeners (bool): Whether to replace listeners. Usually only true - if there's one listener. - returns: assembled pipeline. - """ - - # we checked earlier, so there's definitely just one - tok2vec_name = _get_tok2vecs(nlp2.config)[0] - rename = _check_pipeline_names(nlp, nlp2) - - if len(_get_listeners(nlp2)) > 1: - if replace_listeners: - msg.warn( - """ - Replacing listeners for multiple components. Note this can make - your pipeline large and slow. Consider chaining pipelines (like - nlp2(nlp(text))) instead. - """ - ) - else: - # TODO provide a guide for what to do here - msg.warn( - """ - The result of this merge will have two feature sources - (tok2vecs) and multiple listeners. This will work for - inference, but will probably not work when training without - extra adjustment. If you continue to train the pipelines - separately this is not a problem. - """ - ) - - for comp in nlp2.pipe_names: - if replace_listeners and comp == tok2vec_name: - # the tok2vec should not be copied over - continue - if replace_listeners and _has_listener(nlp2, comp): - nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) - nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) - if comp in rename: - msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...") - return nlp - - -@configure_cli.command("merge") -def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language: - """Combine components from multiple pipelines.""" - nlp = spacy.load(base_model) - nlp2 = spacy.load(added_model) - - # to merge models: - # - lang must be the same - # - vectors must be the same - # - vocabs must be the same - # - tokenizer must be the same (only partially checkable) - if nlp.lang != nlp2.lang: - msg.fail("Can't merge - languages don't match", exits=1) - - # check vector equality - if ( - nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape - or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row - or nlp.vocab.vectors.to_bytes(exclude=["strings"]) - != nlp2.vocab.vectors.to_bytes(exclude=["strings"]) - ): - msg.fail("Can't merge - vectors don't match", exits=1) - - if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]: - msg.fail("Can't merge - tokenizers don't match", exits=1) - - # Check that each pipeline only has one feature source - _check_single_tok2vec(base_model, nlp.config) - _check_single_tok2vec(added_model, nlp2.config) - - # Check how many listeners there are and replace based on that - # TODO: option to recognize frozen tok2vecs - # TODO: take list of pipe names to copy, ignore others - listeners = _get_listeners(nlp2) - replace_listeners = len(listeners) == 1 - nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) - - # write the final pipeline - nlp.to_disk(output_file) - msg.info(f"Saved pipeline to: {output_file}") - - return nlp diff --git a/spacy/cli/merge.py b/spacy/cli/merge.py new file mode 100644 index 000000000..6fd9e8153 --- /dev/null +++ b/spacy/cli/merge.py @@ -0,0 +1,108 @@ +from pathlib import Path +from wasabi import msg + +import spacy +from spacy.language import Language + +from ._util import app, Arg, Opt +from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs +from .configure import _check_pipeline_names, _has_listener + + +def _inner_merge( + nlp: Language, nlp2: Language, replace_listeners: bool = False +) -> Language: + """Actually do the merge. + + nlp: Base pipeline to add components to. + nlp2: Pipeline to add components from. + replace_listeners (bool): Whether to replace listeners. Usually only true + if there's one listener. + returns: assembled pipeline. + """ + + # we checked earlier, so there's definitely just one + tok2vec_name = _get_tok2vecs(nlp2.config)[0] + rename = _check_pipeline_names(nlp, nlp2) + + if len(_get_listeners(nlp2)) > 1: + if replace_listeners: + msg.warn( + """ + Replacing listeners for multiple components. Note this can make + your pipeline large and slow. Consider chaining pipelines (like + nlp2(nlp(text))) instead. + """ + ) + else: + # TODO provide a guide for what to do here + msg.warn( + """ + The result of this merge will have two feature sources + (tok2vecs) and multiple listeners. This will work for + inference, but will probably not work when training without + extra adjustment. If you continue to train the pipelines + separately this is not a problem. + """ + ) + + for comp in nlp2.pipe_names: + if replace_listeners and comp == tok2vec_name: + # the tok2vec should not be copied over + continue + if replace_listeners and _has_listener(nlp2, comp): + nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"]) + nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp)) + if comp in rename: + msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...") + return nlp + + +@app.command("merge") +def merge_pipelines( + # fmt: off + base_model: str = Arg(..., help="Name or path of base model"), + added_model: str = Arg(..., help="Name or path of model to be merged"), + output_file: Path = Arg(..., help="Path to save merged model") + # fmt: on +) -> Language: + """Combine components from multiple pipelines.""" + nlp = spacy.load(base_model) + nlp2 = spacy.load(added_model) + + # to merge models: + # - lang must be the same + # - vectors must be the same + # - vocabs must be the same + # - tokenizer must be the same (only partially checkable) + if nlp.lang != nlp2.lang: + msg.fail("Can't merge - languages don't match", exits=1) + + # check vector equality + if ( + nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape + or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row + or nlp.vocab.vectors.to_bytes(exclude=["strings"]) + != nlp2.vocab.vectors.to_bytes(exclude=["strings"]) + ): + msg.fail("Can't merge - vectors don't match", exits=1) + + if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]: + msg.fail("Can't merge - tokenizers don't match", exits=1) + + # Check that each pipeline only has one feature source + _check_single_tok2vec(base_model, nlp.config) + _check_single_tok2vec(added_model, nlp2.config) + + # Check how many listeners there are and replace based on that + # TODO: option to recognize frozen tok2vecs + # TODO: take list of pipe names to copy, ignore others + listeners = _get_listeners(nlp2) + replace_listeners = len(listeners) == 1 + nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) + + # write the final pipeline + nlp.to_disk(output_file) + msg.info(f"Saved pipeline to: {output_file}") + + return nlp diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 8b7352048..242a3a885 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands from spacy.cli._util import upload_file, download_file -from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines +from spacy.cli.configure import configure_resume_cli, use_tok2vec from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_distribution, _get_kl_divergence @@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.merge import merge_pipelines from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 0c982f389..19f1ae3f8 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -8,6 +8,7 @@ menu: - ['validate', 'validate'] - ['init', 'init'] - ['configure', 'configure'] + - ['merge', 'merge'] - ['convert', 'convert'] - ['debug', 'debug'] - ['train', 'train'] @@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file] | `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ | | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | -### configure merge {id="configure-merge", tag="command"} +## merge {id="merge", tag="command"} Take two pipelines and create a new one with components from both of them, -handling the configuration of listeners. Note that unlike other commands, this -produces a whole pipeline, not just a config. +handling the configuration of listeners. The output is a serialized pipeline. Components in the final pipeline are in the same order as in the original pipelines, with the base pipeline first and the added pipeline after. Because @@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or another pipeline. ```cli -$ python -m spacy configure tok2vec [base_model] [added_model] [output_file] +$ python -m spacy merge [base_model] [added_model] [output_file] ``` | Name | Description |