Add docs for configure command

This also changes the `output_file` arg to match other commands.
This commit is contained in:
Paul O'Leary McCann 2023-01-11 16:06:50 +09:00
parent 2791f0b552
commit f2bbab4623
2 changed files with 104 additions and 15 deletions

View File

@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2):
def configure_resume_cli( def configure_resume_cli(
# fmt: off # fmt: off
base_model: Path = Arg(..., help="Path or name of base model to use for config"), base_model: Path = Arg(..., help="Path or name of base model to use for config"),
output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True), output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
# fmt: on # fmt: on
): ):
"""Create a config for resuming training. """Create a config for resuming training.
@ -155,18 +155,18 @@ def configure_resume_cli(
for comp in nlp.pipe_names: for comp in nlp.pipe_names:
conf["components"][comp] = {"source": path_str} conf["components"][comp] = {"source": path_str}
if str(output_path) == "-": if str(output_file) == "-":
print(conf.to_str()) print(conf.to_str())
else: else:
conf.to_disk(output_path) conf.to_disk(output_file)
msg.good("Saved config", output_path) msg.good("Saved config", output_file)
return conf return conf
@configure_cli.command("transformer") @configure_cli.command("transformer")
def use_transformer( def use_transformer(
base_model: str, output_path: Path, transformer_name: str = "roberta-base" base_model: str, output_file: Path, transformer_name: str = "roberta-base"
) -> Config: ) -> Config:
"""Replace pipeline tok2vec with transformer.""" """Replace pipeline tok2vec with transformer."""
@ -208,17 +208,17 @@ def use_transformer(
} }
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
if str(output_path) == "-": if str(output_file) == "-":
print(nlp.config.to_str()) print(nlp.config.to_str())
else: else:
nlp.config.to_disk(output_path) nlp.config.to_disk(output_file)
msg.good("Saved config", output_path) msg.good("Saved config", output_file)
return nlp.config return nlp.config
@configure_cli.command("tok2vec") @configure_cli.command("tok2vec")
def use_tok2vec(base_model: str, output_path: Path) -> Config: def use_tok2vec(base_model: str, output_file: Path) -> Config:
"""Replace pipeline tok2vec with CNN tok2vec.""" """Replace pipeline tok2vec with CNN tok2vec."""
nlp = spacy.load(base_model) nlp = spacy.load(base_model)
_check_single_tok2vec(base_model, nlp.config) _check_single_tok2vec(base_model, nlp.config)
@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config:
} }
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
if str(output_path) == "-": if str(output_file) == "-":
print(nlp.config.to_str()) print(nlp.config.to_str())
else: else:
nlp.config.to_disk(output_path) nlp.config.to_disk(output_file)
msg.good("Saved config", output_path) msg.good("Saved config", output_file)
return nlp.config return nlp.config
@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language:
@configure_cli.command("merge") @configure_cli.command("merge")
def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language: def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
"""Combine components from multiple pipelines.""" """Combine components from multiple pipelines."""
nlp = spacy.load(base_model) nlp = spacy.load(base_model)
nlp2 = spacy.load(added_model) nlp2 = spacy.load(added_model)
@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan
nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners) nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
# write the final pipeline # write the final pipeline
nlp.to_disk(output_path) nlp.to_disk(output_file)
msg.info(f"Saved pipeline to: {output_path}") msg.info(f"Saved pipeline to: {output_file}")
return nlp return nlp

View File

@ -7,6 +7,7 @@ menu:
- ['info', 'info'] - ['info', 'info']
- ['validate', 'validate'] - ['validate', 'validate']
- ['init', 'init'] - ['init', 'init']
- ['configure', 'configure']
- ['convert', 'convert'] - ['convert', 'convert']
- ['debug', 'debug'] - ['debug', 'debug']
- ['train', 'train'] - ['train', 'train']
@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | | overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The label files. | | **CREATES** | The label files. |
## configure {#configure new="TODO"}
Modify or combine existing configs in high-level ways. Can be used to automate
config changes made as part of the development cycle.
### configure resume {#configure-resume tag="command"}
Modify the input config for use in resuming training. When resuming training,
all components are sourced from the previously trained pipeline.
```cli
$ python -m spacy configure resume [base_model] [output_file]
```
| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline to resume training (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
### configure transformer {#configure-transformer tag="command"}
Modify the base config to use a transformer component, optionally specifying the
base transformer to use. Useful for converting a CNN tok2vec pipeline to use
transformers.
During development of a model, you can use a CNN tok2vec for faster training
time and reduced hardware requirements, and then use this command to convert
your pipeline to use a transformer once you've verified a proof of concept. This
can also help isolate whether any training issues are transformer-related or
not.
```cli
$ python -m spacy configure transformer [base_model] [output_file] [--transformer-name]
```
| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline whose config is used as the base (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ |
### configure tok2vec {#configure-tok2vec tag="command"}
Modify the base model config to use a CNN tok2vec component. Useful for
generating a config from a transformer-based model for faster training
iteration.
```cli
$ python -m spacy configure tok2vec [base_model] [output_file]
```
| Name | Description |
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline whose config is used as the base (package name or path). ~~str (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
### configure merge {#configure-merge tag="command"}
Take two pipelines and create a new one with components from both of them,
handling the configuration of listeners. Note that unlike other commands, this
produces a whole pipeline, not just a config.
Components in the final pipeline are in the same order as in the original
pipelines, with the base pipeline first and the added pipeline after. Because
pipeline names must be unique, if there is a name collision in components, the
later components will be automatically renamed.
For components with listeners, the resulting pipeline structure depends on the
number of listeners. If the second pipeline has only one listener, then
[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be
used. If there is more than one listener, `replace_listeners` will not be used.
In the multi-listener case, the resulting pipeline may require more adjustment
for training to work.
This is useful if you have trained a specialized component, such as NER or
textcat, and want to provide it together with one of the official pretrained
pipelines or another pipeline.
```cli
$ python -m spacy configure merge [base_model] [added_model] [output_file]
```
| Name | Description |
| ------------- | ---------------------------------------------------------------------------------------- |
| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ |
| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ |
| `output_file` | Path to output pipeline. ~~Path (positional)~~ |
## convert {#convert tag="command"} ## convert {#convert tag="command"}
Convert files into spaCy's Convert files into spaCy's