Add docs for configure command

This also renames the `output_path` arg to `output_file` to match other commands.
2025-07-04 20:03:13 +03:00 · 2023-01-11 16:06:50 +09:00 · 2023-01-11 16:06:50 +09:00 · f2bbab4623
commit f2bbab4623
parent 2791f0b552
2 changed files with 104 additions and 15 deletions
--- a/spacy/cli/configure.py
+++ b/spacy/cli/configure.py
@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2):
 def configure_resume_cli(
    # fmt: off
    base_model: Path = Arg(..., help="Path or name of base model to use for config"),
-    output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
+    output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
    # fmt: on
 ):
    """Create a config for resuming training.
@ -155,18 +155,18 @@ def configure_resume_cli(
    for comp in nlp.pipe_names:
        conf["components"][comp] = {"source": path_str}

-    if str(output_path) == "-":
+    if str(output_file) == "-":
        print(conf.to_str())
    else:
-        conf.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        conf.to_disk(output_file)
+        msg.good("Saved config", output_file)

    return conf


@configure_cli.command("transformer")
 def use_transformer(
-    base_model: str, output_path: Path, transformer_name: str = "roberta-base"
+    base_model: str, output_file: Path, transformer_name: str = "roberta-base"
 ) -> Config:
    """Replace pipeline tok2vec with transformer."""

@ -208,17 +208,17 @@ def use_transformer(
        }
        nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

-    if str(output_path) == "-":
+    if str(output_file) == "-":
        print(nlp.config.to_str())
    else:
-        nlp.config.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        nlp.config.to_disk(output_file)
+        msg.good("Saved config", output_file)

    return nlp.config


@configure_cli.command("tok2vec")
-def use_tok2vec(base_model: str, output_path: Path) -> Config:
+def use_tok2vec(base_model: str, output_file: Path) -> Config:
    """Replace pipeline tok2vec with CNN tok2vec."""
    nlp = spacy.load(base_model)
    _check_single_tok2vec(base_model, nlp.config)
@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config:
        }
        nlp.config["components"][listener]["model"]["tok2vec"] = listener_config

-    if str(output_path) == "-":
+    if str(output_file) == "-":
        print(nlp.config.to_str())
    else:
-        nlp.config.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        nlp.config.to_disk(output_file)
+        msg.good("Saved config", output_file)

    return nlp.config

@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language:


@configure_cli.command("merge")
-def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language:
+def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
    """Combine components from multiple pipelines."""
    nlp = spacy.load(base_model)
    nlp2 = spacy.load(added_model)
@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan
    nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)

    # write the final pipeline
-    nlp.to_disk(output_path)
-    msg.info(f"Saved pipeline to: {output_path}")
+    nlp.to_disk(output_file)
+    msg.info(f"Saved pipeline to: {output_file}")

    return nlp
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -7,6 +7,7 @@ menu:
  - ['info', 'info']
  - ['validate', 'validate']
  - ['init', 'init']
+  - ['configure', 'configure']
  - ['convert', 'convert']
  - ['debug', 'debug']
  - ['train', 'train']
@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The label files.                                                                                                                                                                                                   |

+## configure {#configure new="TODO"}
+
+Modify or combine existing configs in high-level ways. Can be used to automate
+config changes made as part of the development cycle.
+
+### configure resume {#configure-resume tag="command"}
+
+Modify the input config for use in resuming training. When resuming training,
+all components are sourced from the previously trained pipeline.
+
+```cli
+$ python -m spacy configure resume [base_model] [output_file]
+```
+
+| Name          | Description                                                                                                                                                                                                                           |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline to resume training (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+
+### configure transformer {#configure-transformer tag="command"}
+
+Modify the base config to use a transformer component, optionally specifying the
+base transformer to use. Useful for converting a CNN tok2vec pipeline to use
+transformers.
+
+During development of a model, you can use a CNN tok2vec for faster training
+time and reduced hardware requirements, and then use this command to convert
+your pipeline to use a transformer once you've verified a proof of concept. This
+can also help isolate whether any training issues are transformer-related or
+not.
+
+```cli
+$ python -m spacy configure transformer [base_model] [output_file] [--transformer-name]
+```
+
+| Name               | Description                                                                                                                                                                                                                           |
+| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`       | A trained pipeline to use as the base (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file`      | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~                                                                                                                                           |
+
+### configure tok2vec {#configure-tok2vec tag="command"}
+
+Modify the base model config to use a CNN tok2vec component. Useful for
+generating a config from a transformer-based model for faster training
+iteration.
+
+```cli
+$ python -m spacy configure tok2vec [base_model] [output_file]
+```
+
+| Name          | Description                                                                                                                                                                                                                           |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline to use as the base (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+
+### configure merge {#configure-merge tag="command"}
+
+Take two pipelines and create a new one with components from both of them,
+handling the configuration of listeners. Note that unlike other commands, this
+produces a whole pipeline, not just a config.
+
+Components in the final pipeline are in the same order as in the original
+pipelines, with the base pipeline first and the added pipeline after. Because
+pipeline names must be unique, if there is a name collision in components, the
+later components will be automatically renamed.
+
+For components with listeners, the resulting pipeline structure depends on the
+number of listeners. If the second pipeline has only one listener, then
+[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be
+used. If there is more than one listener, `replace_listeners` will not be used.
+In the multi-listener case, the resulting pipeline may require more adjustment
+for training to work.
+
+This is useful if you have trained a specialized component, such as NER or
+textcat, and want to provide it together with one of the official pretrained
+pipelines or another pipeline.
+
+```cli
+$ python -m spacy configure merge [base_model] [added_model] [output_file]
+```
+
+| Name          | Description                                                                              |
+| ------------- | ---------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~         |
+| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ |
+| `output_file` | Path to output pipeline. ~~Path (positional)~~                                           |
+
 ## convert {#convert tag="command"}

 Convert files into spaCy's