Move merge to independent command

2025-12-12 04:34:31 +03:00 · 2023-02-09 15:16:50 +09:00 · 2023-02-09 15:16:50 +09:00 · 03a0c2badc
commit 03a0c2badc
parent 9d0ae2407b
5 changed files with 116 additions and 99 deletions
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@ -29,8 +29,9 @@ from .project.push import project_push  # noqa: F401
 from .project.pull import project_pull  # noqa: F401
 from .project.document import project_document  # noqa: F401
 from .find_threshold import find_threshold  # noqa: F401
-from .configure import merge_pipelines, use_tok2vec, use_transformer  # noqa: F401
+from .configure import use_tok2vec, use_transformer  # noqa: F401
 from .configure import configure_resume_cli  # noqa: F401
 from .merge import merge_pipelines  # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)
--- a/spacy/cli/configure.py
+++ b/spacy/cli/configure.py
@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config:
        msg.good("Saved config", output_file)
    return nlp.config
 def _inner_merge(
    nlp: Language, nlp2: Language, replace_listeners: bool = False
 ) -> Language:
    """Add the components of `nlp2` to `nlp` and return `nlp`.

    The base pipeline is modified in place through
    `nlp.add_pipe(..., source=nlp2)`. When `replace_listeners` is set, `nlp2`
    is modified too, because `nlp2.replace_listeners` is called on it before
    the affected component is copied.

    nlp (Language): Base pipeline to add components to.
    nlp2 (Language): Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
      if there's one listener. When true, the tok2vec component of `nlp2` is
      not copied into `nlp`.
    RETURNS (Language): The assembled pipeline (the same object as `nlp`).
    """
    # we checked earlier, so there's definitely just one
    tok2vec_name = _get_tok2vecs(nlp2.config)[0]
    # Maps a component name in nlp2 to the name it gets in nlp. Presumably
    # only names that collide with nlp's components appear as keys; every
    # other component keeps its name (see `rename.get(comp, comp)` below).
    rename = _check_pipeline_names(nlp, nlp2)
    # Warnings are only emitted when nlp2 reports more than one listener.
    if len(_get_listeners(nlp2)) > 1:
        if replace_listeners:
            msg.warn(
                """
                Replacing listeners for multiple components. Note this can make
                your pipeline large and slow. Consider chaining pipelines (like
                nlp2(nlp(text))) instead.
                """
            )
        else:
            # TODO provide a guide for what to do here
            msg.warn(
                """
                The result of this merge will have two feature sources
                (tok2vecs) and multiple listeners. This will work for
                inference, but will probably not work when training without
                extra adjustment. If you continue to train the pipelines
                separately this is not a problem.
                """
            )
    for comp in nlp2.pipe_names:
        if replace_listeners and comp == tok2vec_name:
            # the tok2vec should not be copied over
            continue
        # Replace the listener on the source pipeline first, so the component
        # that gets copied no longer refers to nlp2's tok2vec by name.
        if replace_listeners and _has_listener(nlp2, comp):
            nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
        nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
        if comp in rename:
            msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
    return nlp
# NOTE(review): the docstring below is left as a single line on purpose; if
# `configure_cli` is a Typer app, it doubles as the command's --help text.
@configure_cli.command("merge")
 def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
    """Combine components from multiple pipelines."""
    # Both arguments are handed to spacy.load as given.
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)
    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    # NOTE(review): no explicit vocab comparison is performed below; only the
    # language code, the vectors and the tokenizer config are checked.
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)
    # check vector equality
    # Cheap comparisons (shape, key-to-row mapping) come first so that the
    # byte-level comparison only runs when those already match.
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)
    # Only the tokenizer block of the config is compared, not the tokenizer
    # objects themselves.
    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)
    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)
    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    # Listeners are only replaced when the added pipeline has exactly one.
    replace_listeners = len(listeners) == 1
    # NOTE(review): `nlp_out` is never read afterwards. `_inner_merge` ends
    # with `return nlp`, so it is the same object as `nlp` used below.
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
    # write the final pipeline
    nlp.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")
    return nlp
--- a/spacy/cli/merge.py
+++ b/spacy/cli/merge.py
@ -0,0 +1,108 @@
 from pathlib import Path
 from wasabi import msg
 import spacy
 from spacy.language import Language
 from ._util import app, Arg, Opt
 from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs
 from .configure import _check_pipeline_names, _has_listener
 def _inner_merge(
    nlp: Language,
    nlp2: Language,
    replace_listeners: bool = False,
 ) -> Language:
    """Copy the components of `nlp2` into `nlp`, in place.

    Each component is added with `nlp.add_pipe(..., source=nlp2)`, in the
    order given by `nlp2.pipe_names`. If `replace_listeners` is set, the
    tok2vec of `nlp2` is left out and `nlp2.replace_listeners` is called for
    every component that has a listener before that component is copied, so
    `nlp2` is modified as well.

    nlp (Language): Base pipeline that receives the components.
    nlp2 (Language): Pipeline the components are taken from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
      if there's one listener.
    RETURNS (Language): The assembled pipeline (the same object as `nlp`).
    """
    # The caller has already verified that nlp2 has exactly one tok2vec.
    source_tok2vec = _get_tok2vecs(nlp2.config)[0]
    # Component name in nlp2 -> name to use in nlp, for renamed components.
    new_names = _check_pipeline_names(nlp, nlp2)
    many_listeners = len(_get_listeners(nlp2)) > 1
    if many_listeners and replace_listeners:
        msg.warn(
            """
                Replacing listeners for multiple components. Note this can make
                your pipeline large and slow. Consider chaining pipelines (like
                nlp2(nlp(text))) instead.
                """
        )
    elif many_listeners:
        # TODO provide a guide for what to do here
        msg.warn(
            """
                The result of this merge will have two feature sources
                (tok2vecs) and multiple listeners. This will work for
                inference, but will probably not work when training without
                extra adjustment. If you continue to train the pipelines
                separately this is not a problem.
                """
        )
    for pipe_name in nlp2.pipe_names:
        if replace_listeners:
            if pipe_name == source_tok2vec:
                # the tok2vec should not be copied over
                continue
            if _has_listener(nlp2, pipe_name):
                nlp2.replace_listeners(source_tok2vec, pipe_name, ["model.tok2vec"])
        target_name = new_names.get(pipe_name, pipe_name)
        nlp.add_pipe(pipe_name, source=nlp2, name=target_name)
        if pipe_name in new_names:
            msg.info(
                f"Renaming {pipe_name} to {new_names[pipe_name]} to avoid collision..."
            )
    return nlp
@app.command("merge")
 def merge_pipelines(
    # fmt: off
    base_model: str = Arg(..., help="Name or path of base model"),
    added_model: str = Arg(..., help="Name or path of model to be merged"), 
    output_file: Path = Arg(..., help="Path to save merged model")
    # fmt: on
 ) -> Language:
    """Combine components from multiple pipelines."""
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)
    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)
    # check vector equality
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)
    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)
    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)
    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
    # write the final pipeline
    nlp.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")
    return nlp
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
 from spacy.cli._util import upload_file, download_file
-from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines
+from spacy.cli.configure import configure_resume_cli, use_tok2vec
 from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics
 from spacy.cli.debug_data import _get_spans_length_freq_dist
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
 from spacy.cli.merge import merge_pipelines
 from spacy.cli.package import get_third_party_dependencies
 from spacy.cli.package import _is_permitted_package_name
 from spacy.cli.project.remote_storage import RemoteStorage
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@ -8,6 +8,7 @@ menu:
  - ['validate', 'validate']
  - ['init', 'init']
  - ['configure', 'configure']
  - ['merge', 'merge']
  - ['convert', 'convert']
  - ['debug', 'debug']
  - ['train', 'train']
@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file]
 | `base_model`  | A trained pipeline to resume training (package name or path). ~~str (positional)~~                                                                                                                                                    |
 | `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
-### configure merge {id="configure-merge", tag="command"}
+## merge {id="merge", tag="command"}
 Take two pipelines and create a new one with components from both of them,
-handling the configuration of listeners. Note that unlike other commands, this
+handling the configuration of listeners. Unlike the `configure` commands, this
 produces a whole pipeline, not just a config.
 Components in the final pipeline are in the same order as in the original
 pipelines, with the base pipeline first and the added pipeline after. Because
@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or
 another pipeline.
 ```cli
-$ python -m spacy configure tok2vec [base_model] [added_model] [output_file]
+$ python -m spacy merge [base_model] [added_model] [output_file]
 ```
 | Name          | Description                                                                              |