From f2bbab46236ecb6fbdf38f44708a0313a7f02673 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 11 Jan 2023 16:06:50 +0900 Subject: [PATCH] Add docs for configure command This also renames the `output_path` arg to `output_file` to match other commands. --- spacy/cli/configure.py | 30 +++++++------- website/docs/api/cli.md | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/spacy/cli/configure.py b/spacy/cli/configure.py index 052851b62..75d115ab7 100644 --- a/spacy/cli/configure.py +++ b/spacy/cli/configure.py @@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2): def configure_resume_cli( # fmt: off base_model: Path = Arg(..., help="Path or name of base model to use for config"), - output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), + output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), # fmt: on ): """Create a config for resuming training. 
@@ -155,18 +155,18 @@ def configure_resume_cli( for comp in nlp.pipe_names: conf["components"][comp] = {"source": path_str} - if str(output_path) == "-": + if str(output_file) == "-": print(conf.to_str()) else: - conf.to_disk(output_path) - msg.good("Saved config", output_path) + conf.to_disk(output_file) + msg.good("Saved config", output_file) return conf @configure_cli.command("transformer") def use_transformer( - base_model: str, output_path: Path, transformer_name: str = "roberta-base" + base_model: str, output_file: Path, transformer_name: str = "roberta-base" ) -> Config: """Replace pipeline tok2vec with transformer.""" @@ -208,17 +208,17 @@ def use_transformer( } nlp.config["components"][listener]["model"]["tok2vec"] = listener_config - if str(output_path) == "-": + if str(output_file) == "-": print(nlp.config.to_str()) else: - nlp.config.to_disk(output_path) - msg.good("Saved config", output_path) + nlp.config.to_disk(output_file) + msg.good("Saved config", output_file) return nlp.config @configure_cli.command("tok2vec") -def use_tok2vec(base_model: str, output_path: Path) -> Config: +def use_tok2vec(base_model: str, output_file: Path) -> Config: """Replace pipeline tok2vec with CNN tok2vec.""" nlp = spacy.load(base_model) _check_single_tok2vec(base_model, nlp.config) @@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config: } nlp.config["components"][listener]["model"]["tok2vec"] = listener_config - if str(output_path) == "-": + if str(output_file) == "-": print(nlp.config.to_str()) else: - nlp.config.to_disk(output_path) - msg.good("Saved config", output_path) + nlp.config.to_disk(output_file) + msg.good("Saved config", output_file) return nlp.config @@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language: @configure_cli.command("merge") -def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language: +def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> 
Language: """Combine components from multiple pipelines.""" nlp = spacy.load(base_model) nlp2 = spacy.load(added_model) @@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) # write the final pipeline - nlp.to_disk(output_path) - msg.info(f"Saved pipeline to: {output_path}") + nlp.to_disk(output_file) + msg.info(f"Saved pipeline to: {output_file}") return nlp diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 275e37ee0..c2ba9d933 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -7,6 +7,7 @@ menu: - ['info', 'info'] - ['validate', 'validate'] - ['init', 'init'] + - ['configure', 'configure'] - ['convert', 'convert'] - ['debug', 'debug'] - ['train', 'train'] @@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [ | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | **CREATES** | The label files. | +## configure {#configure new="TODO"} + +Modify or combine existing configs in high-level ways. Can be used to automate +config changes made as part of the development cycle. + +### configure resume {#configure-resume tag="command"} + +Modify the input config for use in resuming training. When resuming training, +all components are sourced from the previously trained pipeline. + +```cli +$ python -m spacy configure resume [base_model] [output_file] +``` + +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline to resume training (package name or path). 
~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | + +### configure transformer {#configure-transformer tag="command"} + +Modify the base config to use a transformer component, optionally specifying the +base transformer to use. Useful for converting a CNN tok2vec pipeline to use +transformers. + +During development of a model, you can use a CNN tok2vec for faster training +time and reduced hardware requirements, and then use this command to convert +your pipeline to use a transformer once you've verified a proof of concept. This +can also help isolate whether any training issues are transformer-related or +not. + +```cli +$ python -m spacy configure transformer [base_model] [output_file] [--transformer_name] +``` + +| Name | Description | +| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | +| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ | + +### configure tok2vec {#configure-tok2vec tag="command"} + +Modify the base model config to use a CNN tok2vec component. Useful for +generating a config from a transformer-based model for faster training +iteration. 
+ +```cli +$ python -m spacy configure tok2vec [base_model] [output_file] +``` + +| Name | Description | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ | +| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ | + +### configure merge {#configure-merge tag="command"} + +Take two pipelines and create a new one with components from both of them, +handling the configuration of listeners. Note that unlike other commands, this +produces a whole pipeline, not just a config. + +Components in the final pipeline are in the same order as in the original +pipelines, with the base pipeline first and the added pipeline after. Because +component names must be unique, if there is a name collision in components, the +later components will be automatically renamed. + +For components with listeners, the resulting pipeline structure depends on the +number of listeners. If the second pipeline has only one listener, then +[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be +used. If there is more than one listener, `replace_listeners` will not be used. +In the multi-listener case, the resulting pipeline may require more adjustment +for training to work. + +This is useful if you have trained a specialized component, such as NER or +textcat, and want to combine it with one of the official pretrained pipelines or +another pipeline. 
+ +```cli +$ python -m spacy configure merge [base_model] [added_model] [output_file] +``` + +| Name | Description | +| ------------- | ---------------------------------------------------------------------------------------- | +| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ | +| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ | +| `output_file` | Path to output pipeline. ~~Path (positional)~~ | + ## convert {#convert tag="command"} Convert files into spaCy's