From f2bbab46236ecb6fbdf38f44708a0313a7f02673 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann <polm@dampfkraft.com>
Date: Wed, 11 Jan 2023 16:06:50 +0900
Subject: [PATCH] Add docs for configure command

This also renames the `output_path` arg to `output_file` to match other commands.
---
 spacy/cli/configure.py  | 30 +++++++-------
 website/docs/api/cli.md | 89 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 15 deletions(-)

diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py
index 052851b62..75d115ab7 100644
--- a/spacy/cli/configure.py
+++ b/spacy/cli/configure.py
@@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2):
 def configure_resume_cli(
     # fmt: off
     base_model: Path = Arg(..., help="Path or name of base model to use for config"),
-    output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
+    output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
     # fmt: on
 ):
     """Create a config for resuming training.
@@ -155,18 +155,18 @@ def configure_resume_cli(
     for comp in nlp.pipe_names:
         conf["components"][comp] = {"source": path_str}
 
-    if str(output_path) == "-":
+    if str(output_file) == "-":
         print(conf.to_str())
     else:
-        conf.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        conf.to_disk(output_file)
+        msg.good("Saved config", output_file)
 
     return conf
 
 
 @configure_cli.command("transformer")
 def use_transformer(
-    base_model: str, output_path: Path, transformer_name: str = "roberta-base"
+    base_model: str, output_file: Path, transformer_name: str = "roberta-base"
 ) -> Config:
     """Replace pipeline tok2vec with transformer."""
 
@@ -208,17 +208,17 @@ def use_transformer(
         }
         nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
 
-    if str(output_path) == "-":
+    if str(output_file) == "-":
         print(nlp.config.to_str())
     else:
-        nlp.config.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        nlp.config.to_disk(output_file)
+        msg.good("Saved config", output_file)
 
     return nlp.config
 
 
 @configure_cli.command("tok2vec")
-def use_tok2vec(base_model: str, output_path: Path) -> Config:
+def use_tok2vec(base_model: str, output_file: Path) -> Config:
     """Replace pipeline tok2vec with CNN tok2vec."""
     nlp = spacy.load(base_model)
     _check_single_tok2vec(base_model, nlp.config)
@@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config:
         }
         nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
 
-    if str(output_path) == "-":
+    if str(output_file) == "-":
         print(nlp.config.to_str())
     else:
-        nlp.config.to_disk(output_path)
-        msg.good("Saved config", output_path)
+        nlp.config.to_disk(output_file)
+        msg.good("Saved config", output_file)
 
     return nlp.config
 
@@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language:
 
 
 @configure_cli.command("merge")
-def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language:
+def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
     """Combine components from multiple pipelines."""
     nlp = spacy.load(base_model)
     nlp2 = spacy.load(added_model)
@@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan
     nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
 
     # write the final pipeline
-    nlp.to_disk(output_path)
-    msg.info(f"Saved pipeline to: {output_path}")
+    nlp.to_disk(output_file)
+    msg.info(f"Saved pipeline to: {output_file}")
 
     return nlp
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 275e37ee0..c2ba9d933 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -7,6 +7,7 @@ menu:
   - ['info', 'info']
   - ['validate', 'validate']
   - ['init', 'init']
+  - ['configure', 'configure']
   - ['convert', 'convert']
   - ['debug', 'debug']
   - ['train', 'train']
@@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The label files.                                                                                                                                                                                                   |
 
+## configure {#configure new="TODO"}
+
+Modify or combine existing configs in high-level ways. Can be used to automate
+config changes made as part of the development cycle.
+
+### configure resume {#configure-resume tag="command"}
+
+Modify the input config for use in resuming training. When resuming training,
+all components are sourced from the previously trained pipeline.
+
+```cli
+$ python -m spacy configure resume [base_model] [output_file]
+```
+
+| Name          | Description                                                                                                                                                                                                                           |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline to resume training (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+
+### configure transformer {#configure-transformer tag="command"}
+
+Modify the base config to use a transformer component, optionally specifying the
+base transformer to use. Useful for converting a CNN tok2vec pipeline to use
+transformers.
+
+During development of a model, you can use a CNN tok2vec for faster training
+time and reduced hardware requirements, and then use this command to convert
+your pipeline to use a transformer once you've verified a proof of concept. This
+can also help isolate whether any training issues are transformer-related or
+not.
+
+```cli
+$ python -m spacy configure transformer [base_model] [output_file] [--transformer-name]
+```
+
+| Name                 | Description                                                                                                                                                                                                                           |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`         | A trained pipeline to use as the base (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file`        | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+| `--transformer-name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~                                                                                                                                           |
+
+### configure tok2vec {#configure-tok2vec tag="command"}
+
+Modify the base model config to use a CNN tok2vec component. Useful for
+generating a config from a transformer-based model for faster training
+iteration.
+
+```cli
+$ python -m spacy configure tok2vec [base_model] [output_file]
+```
+
+| Name          | Description                                                                                                                                                                                                                           |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline to use as the base (package name or path). ~~str (positional)~~                                                                                                                                                    |
+| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
+
+### configure merge {#configure-merge tag="command"}
+
+Take two pipelines and create a new one with components from both of them,
+handling the configuration of listeners. Note that unlike other commands, this
+produces a whole pipeline, not just a config.
+
+Components in the final pipeline are in the same order as in the original
+pipelines, with the base pipeline first and the added pipeline after. Because
+pipeline names must be unique, if there is a name collision in components, the
+later components will be automatically renamed.
+
+For components with listeners, the resulting pipeline structure depends on the
+number of listeners. If the second pipeline has only one listener, then
+[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be
+used. If there is more than one listener, `replace_listeners` will not be used.
+In the multi-listener case, the resulting pipeline may require more adjustment
+for training to work.
+
+This is useful if you have trained a specialized component, such as NER or
+textcat, and want to provide it with one of the official pretrained pipelines
+or another pipeline.
+
+```cli
+$ python -m spacy configure merge [base_model] [added_model] [output_file]
+```
+
+| Name          | Description                                                                              |
+| ------------- | ---------------------------------------------------------------------------------------- |
+| `base_model`  | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~         |
+| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ |
+| `output_file` | Path to output pipeline. ~~Path (positional)~~                                           |
+
 ## convert {#convert tag="command"}
 
 Convert files into spaCy's