mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-04 20:03:13 +03:00
Add docs for configure command
This also changes the `output_path` arg to `output_file` to match other commands.
This commit is contained in:
parent
2791f0b552
commit
f2bbab4623
|
@ -137,7 +137,7 @@ def _check_pipeline_names(nlp, nlp2):
|
||||||
def configure_resume_cli(
|
def configure_resume_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
base_model: Path = Arg(..., help="Path or name of base model to use for config"),
|
base_model: Path = Arg(..., help="Path or name of base model to use for config"),
|
||||||
output_path: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""Create a config for resuming training.
|
"""Create a config for resuming training.
|
||||||
|
@ -155,18 +155,18 @@ def configure_resume_cli(
|
||||||
for comp in nlp.pipe_names:
|
for comp in nlp.pipe_names:
|
||||||
conf["components"][comp] = {"source": path_str}
|
conf["components"][comp] = {"source": path_str}
|
||||||
|
|
||||||
if str(output_path) == "-":
|
if str(output_file) == "-":
|
||||||
print(conf.to_str())
|
print(conf.to_str())
|
||||||
else:
|
else:
|
||||||
conf.to_disk(output_path)
|
conf.to_disk(output_file)
|
||||||
msg.good("Saved config", output_path)
|
msg.good("Saved config", output_file)
|
||||||
|
|
||||||
return conf
|
return conf
|
||||||
|
|
||||||
|
|
||||||
@configure_cli.command("transformer")
|
@configure_cli.command("transformer")
|
||||||
def use_transformer(
|
def use_transformer(
|
||||||
base_model: str, output_path: Path, transformer_name: str = "roberta-base"
|
base_model: str, output_file: Path, transformer_name: str = "roberta-base"
|
||||||
) -> Config:
|
) -> Config:
|
||||||
"""Replace pipeline tok2vec with transformer."""
|
"""Replace pipeline tok2vec with transformer."""
|
||||||
|
|
||||||
|
@ -208,17 +208,17 @@ def use_transformer(
|
||||||
}
|
}
|
||||||
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
|
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
|
||||||
|
|
||||||
if str(output_path) == "-":
|
if str(output_file) == "-":
|
||||||
print(nlp.config.to_str())
|
print(nlp.config.to_str())
|
||||||
else:
|
else:
|
||||||
nlp.config.to_disk(output_path)
|
nlp.config.to_disk(output_file)
|
||||||
msg.good("Saved config", output_path)
|
msg.good("Saved config", output_file)
|
||||||
|
|
||||||
return nlp.config
|
return nlp.config
|
||||||
|
|
||||||
|
|
||||||
@configure_cli.command("tok2vec")
|
@configure_cli.command("tok2vec")
|
||||||
def use_tok2vec(base_model: str, output_path: Path) -> Config:
|
def use_tok2vec(base_model: str, output_file: Path) -> Config:
|
||||||
"""Replace pipeline tok2vec with CNN tok2vec."""
|
"""Replace pipeline tok2vec with CNN tok2vec."""
|
||||||
nlp = spacy.load(base_model)
|
nlp = spacy.load(base_model)
|
||||||
_check_single_tok2vec(base_model, nlp.config)
|
_check_single_tok2vec(base_model, nlp.config)
|
||||||
|
@ -240,11 +240,11 @@ def use_tok2vec(base_model: str, output_path: Path) -> Config:
|
||||||
}
|
}
|
||||||
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
|
nlp.config["components"][listener]["model"]["tok2vec"] = listener_config
|
||||||
|
|
||||||
if str(output_path) == "-":
|
if str(output_file) == "-":
|
||||||
print(nlp.config.to_str())
|
print(nlp.config.to_str())
|
||||||
else:
|
else:
|
||||||
nlp.config.to_disk(output_path)
|
nlp.config.to_disk(output_file)
|
||||||
msg.good("Saved config", output_path)
|
msg.good("Saved config", output_file)
|
||||||
|
|
||||||
return nlp.config
|
return nlp.config
|
||||||
|
|
||||||
|
@ -298,7 +298,7 @@ def _inner_merge(nlp, nlp2, replace_listeners=False) -> Language:
|
||||||
|
|
||||||
|
|
||||||
@configure_cli.command("merge")
|
@configure_cli.command("merge")
|
||||||
def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Language:
|
def merge_pipelines(base_model: str, added_model: str, output_file: Path) -> Language:
|
||||||
"""Combine components from multiple pipelines."""
|
"""Combine components from multiple pipelines."""
|
||||||
nlp = spacy.load(base_model)
|
nlp = spacy.load(base_model)
|
||||||
nlp2 = spacy.load(added_model)
|
nlp2 = spacy.load(added_model)
|
||||||
|
@ -336,7 +336,7 @@ def merge_pipelines(base_model: str, added_model: str, output_path: Path) -> Lan
|
||||||
nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
|
nlp_out = _inner_merge(nlp, nlp2, replace_listeners=replace_listeners)
|
||||||
|
|
||||||
# write the final pipeline
|
# write the final pipeline
|
||||||
nlp.to_disk(output_path)
|
nlp.to_disk(output_file)
|
||||||
msg.info(f"Saved pipeline to: {output_path}")
|
msg.info(f"Saved pipeline to: {output_file}")
|
||||||
|
|
||||||
return nlp
|
return nlp
|
||||||
|
|
|
@ -7,6 +7,7 @@ menu:
|
||||||
- ['info', 'info']
|
- ['info', 'info']
|
||||||
- ['validate', 'validate']
|
- ['validate', 'validate']
|
||||||
- ['init', 'init']
|
- ['init', 'init']
|
||||||
|
- ['configure', 'configure']
|
||||||
- ['convert', 'convert']
|
- ['convert', 'convert']
|
||||||
- ['debug', 'debug']
|
- ['debug', 'debug']
|
||||||
- ['train', 'train']
|
- ['train', 'train']
|
||||||
|
@ -249,6 +250,94 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
|
||||||
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
|
||||||
| **CREATES** | The label files. |
|
| **CREATES** | The label files. |
|
||||||
|
|
||||||
|
## configure {#configure new="TODO"}
|
||||||
|
|
||||||
|
Modify or combine existing configs in high-level ways. Can be used to automate
|
||||||
|
config changes made as part of the development cycle.
|
||||||
|
|
||||||
|
### configure resume {#configure-resume tag="command"}
|
||||||
|
|
||||||
|
Modify the input config for use in resuming training. When resuming training,
|
||||||
|
all components are sourced from the previously trained pipeline.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy configure resume [base_model] [output_file]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `base_model` | A trained pipeline to resume training from (package name or path). ~~str (positional)~~ |
|
||||||
|
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||||
|
|
||||||
|
### configure transformer {#configure-transformer tag="command"}
|
||||||
|
|
||||||
|
Modify the base config to use a transformer component, optionally specifying the
|
||||||
|
base transformer to use. Useful for converting a CNN tok2vec pipeline to use
|
||||||
|
transformers.
|
||||||
|
|
||||||
|
During development of a model, you can use a CNN tok2vec for faster training
|
||||||
|
time and reduced hardware requirements, and then use this command to convert
|
||||||
|
your pipeline to use a transformer once you've verified a proof of concept. This
|
||||||
|
can also help isolate whether any training issues are transformer-related or
|
||||||
|
not.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy configure transformer [base_model] [output_file] [--transformer-name]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `base_model` | A trained pipeline whose config will be converted to use a transformer (package name or path). ~~str (positional)~~ |
|
||||||
|
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||||
|
| `transformer_name` | The name of the base HuggingFace model to use. Defaults to `roberta-base`. ~~str (option)~~ |
|
||||||
|
|
||||||
|
### configure tok2vec {#configure-tok2vec tag="command"}
|
||||||
|
|
||||||
|
Modify the base model config to use a CNN tok2vec component. Useful for
|
||||||
|
generating a config from a transformer-based model for faster training
|
||||||
|
iteration.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy configure tok2vec [base_model] [output_file]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `base_model` | A trained pipeline whose config will be converted to use a CNN tok2vec (package name or path). ~~str (positional)~~ |
|
||||||
|
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
|
||||||
|
|
||||||
|
### configure merge {#configure-merge tag="command"}
|
||||||
|
|
||||||
|
Take two pipelines and create a new one with components from both of them,
|
||||||
|
handling the configuration of listeners. Note that unlike other commands, this
|
||||||
|
produces a whole pipeline, not just a config.
|
||||||
|
|
||||||
|
Components in the final pipeline are in the same order as in the original
|
||||||
|
pipelines, with the base pipeline first and the added pipeline after. Because
|
||||||
|
pipeline names must be unique, if there is a name collision in components, the
|
||||||
|
later components will be automatically renamed.
|
||||||
|
|
||||||
|
For components with listeners, the resulting pipeline structure depends on the
|
||||||
|
number of listeners. If the second pipeline has only one listener, then
|
||||||
|
[`replace_listeners`](https://spacy.io/api/language/#replace_listeners) will be
|
||||||
|
used. If there is more than one listener, `replace_listeners` will not be used.
|
||||||
|
In the multi-listener case, the resulting pipeline may require more adjustment
|
||||||
|
for training to work.
|
||||||
|
|
||||||
|
This is useful if you have trained a specialized component, such as NER or
|
||||||
|
textcat, and want to combine it with one of the official pretrained pipelines or
|
||||||
|
another pipeline.
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy configure merge [base_model] [added_model] [output_file]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ------------- | ---------------------------------------------------------------------------------------- |
|
||||||
|
| `base_model` | A trained pipeline (package name or path) to use as a base. ~~str (positional)~~ |
|
||||||
|
| `added_model` | A trained pipeline (package name or path) to combine with the base. ~~str (positional)~~ |
|
||||||
|
| `output_file` | Path to the directory where the merged pipeline is saved. ~~Path (positional)~~ |
|
||||||
|
|
||||||
## convert {#convert tag="command"}
|
## convert {#convert tag="command"}
|
||||||
|
|
||||||
Convert files into spaCy's
|
Convert files into spaCy's
|
||||||
|
|
Loading…
Reference in New Issue
Block a user