Paul O'Leary McCann, 2023-03-03 16:51:04 +01:00, committed by GitHub
commit 16145b2188
6 changed files with 539 additions and 0 deletions


@@ -30,6 +30,9 @@ from .project.push import project_push # noqa: F401
from .project.pull import project_pull # noqa: F401
from .project.document import project_document # noqa: F401
from .find_threshold import find_threshold # noqa: F401
from .configure import use_transformer, use_tok2vec # noqa: F401
from .configure import configure_resume_cli # noqa: F401
from .merge import merge_pipelines # noqa: F401
@app.command("link", no_args_is_help=True, deprecated=True, hidden=True)


@@ -48,6 +48,7 @@ and custom model implementations.
"""
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
CONFIGURE_HELP = """Commands for automatically modifying configs."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
# keep the names short, but not needed at the moment.
@@ -59,11 +60,13 @@ benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
configure_cli = typer.Typer(name="configure", help=CONFIGURE_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
app.add_typer(configure_cli)
def setup_cli() -> None:

spacy/cli/configure.py (new file, 214 lines)

@@ -0,0 +1,214 @@
from pathlib import Path
from wasabi import msg
import typer
from thinc.api import Config
from typing import Any, Dict, Iterable, List, Union
import spacy
from spacy.language import Language
from ._util import configure_cli, Arg, Opt
# These are the architectures that are recognized as tok2vec/feature sources.
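# Architecture strings in a config look like "spacy.Tok2Vec.v2"; matching is
# done on the (namespace, name) pair, ignoring the version suffix.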
TOK2VEC_ARCHS = [
("spacy", "Tok2Vec"),
("spacy", "HashEmbedCNN"),
("spacy-transformers", "TransformerModel"),
]
# These are the listeners.
LISTENER_ARCHS = [
("spacy", "Tok2VecListener"),
("spacy-transformers", "TransformerListener"),
]
def _deep_get(
obj: Union[Dict[str, Any], Config], key: Iterable[str], default: Any
) -> Any:
"""Given a multi-part key, try to get the key. If at any point this isn't
possible, return the default.
"""
    slot = obj
for notch in key:
if slot is None or notch not in slot:
return default
slot = slot[notch]
return slot
def _get_tok2vecs(config: Config) -> List[str]:
"""Given a pipeline config, return the names of components that are
tok2vecs (or Transformers).
"""
out = []
for name, comp in config["components"].items():
arch = _deep_get(comp, ("model", "@architectures"), False)
if not arch:
continue
ns, model, ver = arch.split(".")
if (ns, model) in TOK2VEC_ARCHS:
out.append(name)
return out
def _has_listener(nlp: Language, pipe_name: str) -> bool:
"""Given a pipeline and a component name, check if it has a listener."""
arch = _deep_get(
nlp.config,
("components", pipe_name, "model", "tok2vec", "@architectures"),
False,
)
if not arch:
return False
ns, model, ver = arch.split(".")
return (ns, model) in LISTENER_ARCHS
def _get_listeners(nlp: Language) -> List[str]:
"""Get the name of every component that contains a listener.
Does not check that they listen to the same thing; assumes a pipeline has
only one feature source.
"""
out = []
for name in nlp.pipe_names:
if _has_listener(nlp, name):
out.append(name)
return out
def _check_single_tok2vec(name: str, config: Config) -> None:
"""Check if there is just one tok2vec in a config.
A very simple check, but used in multiple functions.
"""
tok2vecs = _get_tok2vecs(config)
fail_msg = f"""
Can't handle pipelines with more than one feature source,
but {name} has {len(tok2vecs)}."""
if len(tok2vecs) > 1:
msg.fail(fail_msg, exits=1)
@configure_cli.command("resume")
def configure_resume_cli(
# fmt: off
base_model: Path = Arg(..., help="Path or name of base model to use for config"),
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
# fmt: on
) -> Config:
"""Create a config for resuming training.
A config for resuming training is the same as the input config, but with
all components sourced.
DOCS: https://spacy.io/api/cli#configure-resume
"""
nlp = spacy.load(base_model)
conf = nlp.config
# Paths are not JSON serializable
path_str = str(base_model)
for comp in nlp.pipe_names:
conf["components"][comp] = {"source": path_str}
if str(output_file) == "-":
print(conf.to_str())
else:
conf.to_disk(output_file)
msg.good("Saved config", output_file)
return conf
@configure_cli.command("transformer")
def use_transformer(
base_model: str, output_file: Path, transformer_name: str = "roberta-base"
) -> Config:
"""Replace pipeline tok2vec with transformer.
DOCS: https://spacy.io/api/cli#configure-transformer
"""
# 1. identify tok2vec
# 2. replace tok2vec
# 3. replace listeners
nlp = spacy.load(base_model)
_check_single_tok2vec(base_model, nlp.config)
tok2vecs = _get_tok2vecs(nlp.config)
assert len(tok2vecs) > 0, "Must have tok2vec to replace!"
nlp.remove_pipe(tok2vecs[0])
# the rest can be default values
trf_config = {
"model": {
"name": transformer_name,
}
}
try:
trf = nlp.add_pipe("transformer", config=trf_config, first=True)
except ValueError:
fail_msg = (
"Configuring a transformer requires spacy-transformers. "
"Install with: pip install spacy-transformers"
)
msg.fail(fail_msg, exits=1)
# now update the listeners
listeners = _get_listeners(nlp)
for listener in listeners:
listener_config = {
"@architectures": "spacy-transformers.TransformerListener.v1",
"grad_factor": 1.0,
"upstream": "transformer",
"pooling": {"@layers": "reduce_mean.v1"},
}
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
if str(output_file) == "-":
print(nlp.config.to_str())
else:
nlp.config.to_disk(output_file)
msg.good("Saved config", output_file)
return nlp.config
@configure_cli.command("tok2vec")
def use_tok2vec(base_model: str, output_file: Path) -> Config:
"""Replace pipeline tok2vec with CNN tok2vec.
DOCS: https://spacy.io/api/cli#configure-tok2vec
"""
nlp = spacy.load(base_model)
_check_single_tok2vec(base_model, nlp.config)
tok2vecs = _get_tok2vecs(nlp.config)
assert len(tok2vecs) > 0, "Must have tok2vec to replace!"
nlp.remove_pipe(tok2vecs[0])
tok2vec = nlp.add_pipe("tok2vec", first=True)
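    # The config variable reference keeps each listener's width in sync with
    # the new tok2vec encoder's width.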
width = "${components.tok2vec.model.encode:width}"
listeners = _get_listeners(nlp)
for listener in listeners:
listener_config = {
"@architectures": "spacy.Tok2VecListener.v1",
"width": width,
"upstream": "tok2vec",
}
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
if str(output_file) == "-":
print(nlp.config.to_str())
else:
nlp.config.to_disk(output_file)
msg.good("Saved config", output_file)
return nlp.config

spacy/cli/merge.py (new file, 160 lines)

@@ -0,0 +1,160 @@
from pathlib import Path
import re
from wasabi import msg
import spacy
from spacy.language import Language
from typing import Dict
from ._util import app, Arg, Opt
from .configure import _check_single_tok2vec, _get_listeners, _get_tok2vecs
from .configure import _has_listener
def _increment_suffix(name: str) -> str:
"""Given a name, return an incremented version.
If no numeric suffix is found, return the original with "2" appended.
This is used to avoid name collisions in pipelines.
"""
res = re.search(r"\d+$", name)
if res is None:
return f"{name}2"
else:
num = res.group()
prefix = name[0 : -len(num)]
return f"{prefix}{int(num) + 1}"
def _make_unique_pipe_names(nlp: Language, nlp2: Language) -> Dict[str, str]:
"""Given two pipelines, try to rename any collisions in component names.
If a simple increment of a numeric suffix doesn't work, will give up.
"""
fail_msg = """
Tried automatically renaming {name} to {new_name}, but still
had a collision, so bailing out. Please make your pipe names
unique.
"""
# map of components to be renamed
rename = {}
# check pipeline names
names = nlp.pipe_names
for name in nlp2.pipe_names:
if name in names:
inc = _increment_suffix(name)
if inc in names or inc in nlp2.pipe_names:
msg.fail(fail_msg.format(name=name, new_name=inc), exits=1)
rename[name] = inc
return rename
def _inner_merge(
nlp: Language, nlp2: Language, replace_listeners: bool = False
) -> Language:
"""Actually do the merge.
nlp (Language): Base pipeline to add components to.
nlp2 (Language): Pipeline to add components from.
replace_listeners (bool): Whether to replace listeners. Usually only true
if there's one listener.
returns: assembled pipeline.
"""
# The outer merge already verified there was exactly one tok2vec
tok2vec_name = _get_tok2vecs(nlp2.config)[0]
rename = _make_unique_pipe_names(nlp, nlp2)
if len(_get_listeners(nlp2)) > 1:
if replace_listeners:
msg.warn(
"""
Replacing listeners for multiple components. Note this can make
your pipeline large and slow. Consider chaining pipelines (like
nlp2(nlp(text))) instead.
"""
)
else:
# TODO provide a guide for what to do here
msg.warn(
"""
The result of this merge will have two feature sources
(tok2vecs) and multiple listeners. This will work for
inference, but will probably not work when training without
extra adjustment. If you continue to train the pipelines
separately this is not a problem.
"""
)
for comp in nlp2.pipe_names:
if replace_listeners and comp == tok2vec_name:
# the tok2vec should not be copied over
continue
if replace_listeners and _has_listener(nlp2, comp):
nlp2.replace_listeners(tok2vec_name, comp, ["model.tok2vec"])
nlp.add_pipe(comp, source=nlp2, name=rename.get(comp, comp))
if comp in rename:
msg.info(f"Renaming {comp} to {rename[comp]} to avoid collision...")
return nlp
@app.command("merge")
def merge_pipelines(
# fmt: off
base_model: str = Arg(..., help="Name or path of base model"),
added_model: str = Arg(..., help="Name or path of model to be merged"),
output_file: Path = Arg(..., help="Path to save merged model")
# fmt: on
) -> Language:
"""Combine components from multiple pipelines.
Given two pipelines, the components from them are merged into a single
pipeline. The exact way this works depends on whether the second pipeline
has one listener or more than one listener. In the single listener case
`replace_listeners` is used, otherwise components are simply appended to
the base pipeline.
DOCS: https://spacy.io/api/cli#merge
"""
nlp = spacy.load(base_model)
nlp2 = spacy.load(added_model)
# to merge models:
# - lang must be the same
# - vectors must be the same
# - vocabs must be the same
# - tokenizer must be the same (only partially checkable)
if nlp.lang != nlp2.lang:
msg.fail("Can't merge - languages don't match", exits=1)
# check vector equality
if (
nlp.vocab.vectors.shape != nlp2.vocab.vectors.shape
or nlp.vocab.vectors.key2row != nlp2.vocab.vectors.key2row
or nlp.vocab.vectors.to_bytes(exclude=["strings"])
!= nlp2.vocab.vectors.to_bytes(exclude=["strings"])
):
msg.fail("Can't merge - vectors don't match", exits=1)
if nlp.config["nlp"]["tokenizer"] != nlp2.config["nlp"]["tokenizer"]:
msg.fail("Can't merge - tokenizers don't match", exits=1)
# Check that each pipeline only has one feature source
_check_single_tok2vec(base_model, nlp.config)
_check_single_tok2vec(added_model, nlp2.config)
# Check how many listeners there are and replace based on that
# TODO: option to recognize frozen tok2vecs
# TODO: take list of pipe names to copy, ignore others
listeners = _get_listeners(nlp2)
replace_listeners = len(listeners) == 1
nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
# write the final pipeline
    nlp_out.to_disk(output_file)
    msg.info(f"Saved pipeline to: {output_file}")
    return nlp_out


@@ -21,6 +21,7 @@ from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli._util import upload_file, download_file
from spacy.cli.configure import configure_resume_cli, use_tok2vec
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.debug_data import _get_distribution, _get_kl_divergence
@@ -29,6 +30,7 @@ from spacy.cli.debug_data import _print_span_characteristics
from spacy.cli.debug_data import _get_spans_length_freq_dist
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
from spacy.cli.merge import merge_pipelines
from spacy.cli.package import get_third_party_dependencies
from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
@@ -1180,6 +1182,71 @@ def test_upload_download_local_file():
assert file_.read() == content
def test_configure_resume(tmp_path):
nlp = spacy.blank("en")
nlp.add_pipe("ner")
nlp.add_pipe("textcat")
base_path = tmp_path / "base"
nlp.to_disk(base_path)
out_path = tmp_path / "resume.cfg"
conf = configure_resume_cli(base_path, out_path)
assert out_path.exists(), "Didn't save config file"
for comp, val in conf["components"].items():
assert "source" in val, f"Non-sourced component: {comp}"
def test_use_tok2vec(tmp_path):
# Can't add a transformer here because spacy-transformers might not be present
nlp = spacy.blank("en")
nlp.add_pipe("tok2vec")
base_path = tmp_path / "tok2vec_sample_2"
nlp.to_disk(base_path)
out_path = tmp_path / "converted_to_tok2vec"
conf = use_tok2vec(base_path, out_path)
assert out_path.exists(), "No model saved"
assert "tok2vec" in conf["components"], "No tok2vec component"
def test_merge_pipelines(tmp_path):
# width is a placeholder, since we won't actually train this
listener_config = {
"model": {
"tok2vec": {"@architectures": "spacy.Tok2VecListener.v1", "width": "0"}
}
}
# base pipeline
base = spacy.blank("en")
base.add_pipe("tok2vec")
base.add_pipe("ner", config=listener_config)
base_path = tmp_path / "merge_base"
base.to_disk(base_path)
# added pipeline
added = spacy.blank("en")
added.add_pipe("tok2vec")
added.add_pipe("ner", config=listener_config)
added_path = tmp_path / "merge_added"
added.to_disk(added_path)
# these should combine and not have a name collision
out_path = tmp_path / "merge_result"
merged = merge_pipelines(base_path, added_path, out_path)
# will give a key error if not present
merged.get_pipe("ner")
merged.get_pipe("ner2")
ner2_conf = merged.config["components"]["ner2"]
arch = ner2_conf["model"]["tok2vec"]["@architectures"]
assert arch == "spacy.HashEmbedCNN.v2", "Wrong arch - listener not replaced?"
def test_walk_directory():
with make_tempdir() as d:
files = [


@@ -7,6 +7,8 @@ menu:
- ['info', 'info']
- ['validate', 'validate']
- ['init', 'init']
- ['configure', 'configure']
- ['merge', 'merge']
- ['convert', 'convert']
- ['debug', 'debug']
- ['train', 'train']
@@ -250,6 +252,96 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files. |
## configure {id="configure", version="TODO"}
Modify or combine existing configs. Example uses include swapping feature
sources or creating configs to resume training. This may simplify parts of the
development cycle. For example, it allows starting off with a config for a
faster, less accurate model (using the CNN tok2vec) and conveniently switching
to transformers later, without having to manually adjust the config.
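For example, a pipeline prototyped with the CNN tok2vec can later be switched
to a transformer config in one step (paths are illustrative):

```cli
$ python -m spacy configure transformer ./cnn_pipeline ./config_trf.cfg
```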
### configure resume {id="configure-resume", tag="command"}
Modify the input config for use in resuming training. When resuming training,
all components are sourced from the previously trained pipeline.
```cli
$ python -m spacy configure resume [base_model] [output_file]
```
| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
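Since `-` writes the config to stdout, the output can be piped straight into a
file or another command (file name illustrative):

```cli
$ python -m spacy configure resume ./trained_pipeline - > resume.cfg
```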
### configure transformer {id="configure-transformer", tag="command"}
Modify the base config to use a transformer component, optionally specifying the
base transformer to use. Useful for converting a CNN tok2vec pipeline to use
transformers.
During development of a model, you can use a CNN tok2vec for faster training
time and reduced hardware requirements, and then use this command to convert
your pipeline to use a transformer once you've verified a proof of concept. This
can also help isolate whether any training issues are transformer-related or
not.
```cli
$ python -m spacy configure transformer [base_model] [output_file] [--transformer_name]
```
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model`       | A trained pipeline to convert (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ |
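For example, to use a different base transformer than the default (model name
illustrative):

```cli
$ python -m spacy configure transformer ./cnn_pipeline ./config_trf.cfg --transformer_name bert-base-cased
```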
### configure tok2vec {id="configure-tok2vec", tag="command"}
Modify the base model config to use a CNN tok2vec component. Useful for
generating a config from a transformer-based model for faster training
iteration.
```cli
$ python -m spacy configure tok2vec [base_model] [output_file]
```
| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model`  | A trained pipeline to convert (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
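For example, to derive a faster CNN-based config from a transformer pipeline
(paths illustrative):

```cli
$ python -m spacy configure tok2vec ./trf_pipeline ./config_cnn.cfg
```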
## merge {id="merge", tag="command"}
Take two pipelines and create a new one with components from both of them,
handling the configuration of listeners. The output is a serialized pipeline.
Components in the final pipeline keep the order of the source pipelines, with
the base pipeline's components first and the added pipeline's after. Because
component names must be unique, any colliding components from the added
pipeline are automatically renamed (a second `ner` becomes `ner2`, for
example).
For components with listeners, the resulting pipeline structure depends on the
number of listeners. If the second pipeline has only one listener, then
[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be
used. If there is more than one listener, `replace_listeners` will not be used.
In the multi-listener case, the resulting pipeline may require more adjustment
for training to work.
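In the multi-listener case, if you only need inference, an alternative to
merging is to chain the pipelines at runtime, as the merge command's warning
suggests. A minimal sketch, with illustrative paths:

```python
import spacy

# Any two compatible pipelines work here; the paths are illustrative.
nlp = spacy.load("./base_model")
nlp2 = spacy.load("./added_model")

# In spaCy v3 a pipeline can be called on an existing Doc, so the second
# pipeline adds its annotations on top of the first one's output.
doc = nlp2(nlp("Berlin is the capital of Germany."))
```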
This is useful if you have trained a specialized component, such as NER or
textcat, and want to combine it with one of the official pretrained pipelines
or another pipeline.
```cli
$ python -m spacy merge [base_model] [added_model] [output_file]
```
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ |
| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ |
| `output_file` | Path to output pipeline. ~~Path (positional)~~ |
## convert {id="convert",tag="command"}
Convert files into spaCy's