Move merge to independent command

This commit is contained in:
Paul O'Leary McCann 2023-02-09 15:16:50 +09:00
parent 9d0ae2407b
commit 03a0c2badc
5 changed files with 116 additions and 99 deletions

View File

@ -29,8 +29,9 @@ from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .configure import merge_pipelines, use_tok2vec, use_transformer # noqa: F401
from .configure import use_tok2vec, use_transformer # noqa: F401
from .configure import configure_resume_cli # noqa: F401
from .merge import merge_pipelines # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)

View File

@ -247,96 +247,3 @@ def use_tok2vec(base_model: str, output_file: Path) -> Config:
msg.good("Saved config", output_file)
return nlp.config
def _inner_merge(
nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
"""Actually do the merge.
nlp: Base pipeline to add components to.
nlp2: Pipeline to add components from.
replace_listeners (bool): Whether to replace listeners. Usually only true
if there's one listener.
returns: assembled pipeline.
"""
# we checked earlier, so there's definitely just one
tok2vec_name = _get_tok2vecs(nlp2.config)[0]
# rename maps colliding component names in nlp2 to non-colliding ones
rename = _check_pipeline_names(nlp, nlp2)
# Warn about the two multi-listener situations; neither aborts the merge.
if len(_get_listeners(nlp2)) > 1:
if replace_listeners:
msg.warn(
"""
Replacing listeners for multiple components. Note this can make
your pipeline large and slow. Consider chaining pipelines (like
nlp2(nlp(text))) instead.
"""
)
else:
# TODO provide a guide for what to do here
msg.warn(
"""
The result of this merge will have two feature sources
(tok2vecs) and multiple listeners. This will work for
inference, but will probably not work when training without
extra adjustment. If you continue to train the pipelines
separately this is not a problem.
"""
)
# Copy each component of nlp2 into nlp, renaming on collision.
for comp in nlp2.pipe_names:
if replace_listeners and comp == tok2vec_name:
# the tok2vec should not be copied over
continue
if replace_listeners and _has_listener(nlp2, comp):
# inline the tok2vec into the listening component's own model
nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
if comp in rename:
msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
return nlp
@configure_cli.command("merge")
def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
    """Combine components from multiple pipelines.

    base_model: Name or path of the pipeline whose components come first.
    added_model: Name or path of the pipeline whose components are appended.
    output_file: Directory the merged pipeline is serialized to.
    returns: the merged pipeline.
    """
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality: shape, key-to-row mapping, and raw data must all agree
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1

    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
    # Fix: use the pipeline returned by _inner_merge. The original assigned
    # nlp_out but saved/returned `nlp`, leaving nlp_out unused — correct only
    # by accident because _inner_merge mutates and returns the same object.
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")

    return nlp_out

108
spacy/cli/merge.py Normal file
View File

@ -0,0 +1,108 @@
from pathlib import Path
from wasabi import msg
import spacy
from spacy.language import Language
from ._util import app, Arg, Opt
from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs
from .configure import _check_pipeline_names, _has_listener
def _inner_merge(
    nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
    """Actually do the merge.

    nlp: Base pipeline to add components to.
    nlp2: Pipeline to add components from.
    replace_listeners (bool): Whether to replace listeners. Usually only true
        if there's one listener.
    returns: assembled pipeline.
    """
    # Single-tok2vec precondition was validated by the caller, so indexing
    # the first (and only) entry is safe.
    source_tok2vec = _get_tok2vecs(nlp2.config)[0]
    # Mapping of nlp2 component names that collide with nlp to fresh names.
    renames = _check_pipeline_names(nlp, nlp2)

    # Warn (without aborting) when nlp2 has more than one listener.
    if len(_get_listeners(nlp2)) > 1:
        if not replace_listeners:
            # TODO provide a guide for what to do here
            msg.warn(
                """
            The result of this merge will have two feature sources
            (tok2vecs) and multiple listeners. This will work for
            inference, but will probably not work when training without
            extra adjustment. If you continue to train the pipelines
            separately this is not a problem.
            """
            )
        else:
            msg.warn(
                """
            Replacing listeners for multiple components. Note this can make
            your pipeline large and slow. Consider chaining pipelines (like
            nlp2(nlp(text))) instead.
            """
            )

    # Move each component of nlp2 over, renaming on collision.
    for pipe_name in nlp2.pipe_names:
        if replace_listeners:
            if pipe_name == source_tok2vec:
                # the tok2vec should not be copied over
                continue
            if _has_listener(nlp2, pipe_name):
                nlp2.replace_listeners(source_tok2vec, pipe_name, ["model.tok2vec"])
        nlp.add_pipe(pipe_name, source=nlp2, name=renames.get(pipe_name, pipe_name))
        if pipe_name in renames:
            msg.info(f"Renaming {pipe_name} to {renames[pipe_name]} to avoid collision...")
    return nlp
@app.command("merge")
def merge_pipelines(
    # fmt: off
    base_model: str = Arg(..., help="Name or path of base model"),
    added_model: str = Arg(..., help="Name or path of model to be merged"),
    output_file: Path = Arg(..., help="Path to save merged model")
    # fmt: on
) -> Language:
    """Combine components from multiple pipelines.

    Loads both pipelines, verifies they are mergeable (same language,
    vectors, and tokenizer config; one tok2vec each), merges the components
    of added_model into base_model, and serializes the result to output_file.
    returns: the merged pipeline.
    """
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)

    # to merge models:
    # - lang must be the same
    # - vectors must be the same
    # - vocabs must be the same
    # - tokenizer must be the same (only partially checkable)
    if nlp.lang != nlp2.lang:
        msg.fail("Can't merge - languages don't match", exits=1)

    # check vector equality: shape, key-to-row mapping, and raw data must all agree
    if (
        nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
        or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
        or nlp.vocab.vectors.to_bytes(exclude=["strings"])
        != nlp2.vocab.vectors.to_bytes(exclude=["strings"])
    ):
        msg.fail("Can't merge - vectors don't match", exits=1)

    if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
        msg.fail("Can't merge - tokenizers don't match", exits=1)

    # Check that each pipeline only has one feature source
    _check_single_tok2vec(base_model, nlp.config)
    _check_single_tok2vec(added_model, nlp2.config)

    # Check how many listeners there are and replace based on that
    # TODO: option to recognize frozen tok2vecs
    # TODO: take list of pipe names to copy, ignore others
    listeners = _get_listeners(nlp2)
    replace_listeners = len(listeners) == 1

    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
    # Fix: use the pipeline returned by _inner_merge. The original assigned
    # nlp_out but saved/returned `nlp`, leaving nlp_out unused — correct only
    # by accident because _inner_merge mutates and returns the same object.
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")

    return nlp_out

View File

@ -21,7 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli._util import upload_file, download_file
from spacy.cli.configure import configure_resume_cli, use_tok2vec, merge_pipelines
from spacy.cli.configure import configure_resume_cli, use_tok2vec
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@ -30,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics
from spacy.cli.debug_data import _get_spans_length_freq_dist
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.merge import merge_pipelines
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage

View File

@ -8,6 +8,7 @@ menu:
- ['validate', 'validate']
- ['init', 'init']
- ['configure', 'configure']
- ['merge', 'merge']
- ['convert', 'convert']
- ['debug', 'debug']
- ['train', 'train']
@ -306,11 +307,10 @@ $ python -m spacy configure tok2vec [base_model] [output_file]
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
### configure merge {id="configure-merge", tag="command"}
## merge {id="merge", tag="command"}
Take two pipelines and create a new one with components from both of them,
handling the configuration of listeners. Note that unlike other commands, this
produces a whole pipeline, not just a config.
handling the configuration of listeners. The output is a serialized pipeline.
Components in the final pipeline are in the same order as in the original
pipelines, with the base pipeline first and the added pipeline after. Because
@ -329,7 +329,7 @@ textcat, and want to provide with one of the official pretrained pipelines or
another pipeline.
```cli
$ python -m spacy configure tok2vec [base_model] [added_model] [output_file]
$ python -m spacy merge [base_model] [added_model] [output_file]
```
| Name | Description |