Update argument handling and documentation

This commit is contained in:
Ines Montani 2020-12-08 20:41:18 +11:00
parent d25b1606d6
commit 94a5a9814f
9 changed files with 156 additions and 118 deletions

View File

@ -272,7 +272,11 @@ def show_validation_error(
msg.fail(title)
print(err.text.strip())
if hint_fill and "value_error.missing" in err.error_types:
config_path = file_path if file_path is not None else "config.cfg"
path = (
file_path
if file_path is not None and str(file_path) != "-"
else "config.cfg"
)
msg.text(
"If your config contains missing values, you can run the 'init "
"fill-config' command to fill in all the defaults, if possible:",

View File

@ -19,7 +19,7 @@ from .. import util
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")

View File

@ -37,7 +37,7 @@ BLANK_MODEL_THRESHOLD = 2000
def debug_data_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
ignore_warnings: bool = Opt(False, "--ignore-warnings", "-IW", help="Ignore warnings, only show stats and errors"),
verbose: bool = Opt(False, "--verbose", "-V", help="Print additional information and explanations"),

View File

@ -22,7 +22,7 @@ from .. import util
def debug_model_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
component: str = Arg(..., help="Name of the pipeline component of which the model should be analysed"),
layers: str = Opt("", "--layers", "-l", help="Comma-separated names of layer IDs to print"),
dimensions: bool = Opt(False, "--dimensions", "-DIM", help="Show dimensions"),

View File

@ -62,7 +62,7 @@ def update_lexemes(nlp: Language, jsonl_loc: Path) -> None:
def init_pipeline_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the prepared data"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
@ -88,7 +88,7 @@ def init_pipeline_cli(
def init_labels_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
output_path: Path = Arg(..., help="Output directory for the labels"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),

View File

@ -17,7 +17,7 @@ from ..util import load_config
def pretrain_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False),
config_path: Path = Arg(..., help="Path to config file", exists=True, dir_okay=False, allow_dash=True),
output_dir: Path = Arg(..., help="Directory to write weights to on each epoch"),
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
resume_path: Optional[Path] = Opt(None, "--resume-path", "-r", help="Path to pretrained weights from which to resume pretraining"),
@ -79,7 +79,7 @@ def pretrain_cli(
def verify_cli_args(config_path, output_dir, resume_path, epoch_resume):
if not config_path or not config_path.exists():
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)
if output_dir.exists() and [p for p in output_dir.iterdir()]:
if resume_path:

View File

@ -126,7 +126,7 @@ $ python -m spacy init config [output_file] [--lang] [--pipeline] [--optimize] [
| Name | Description |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `output_file` | Path to output `.cfg` file or `-` to write the config to stdout (so you can pipe it forward to a file or to the `train` command). Note that if you're writing to stdout, no additional logging info is printed. ~~Path (positional)~~ |
| `--lang`, `-l` | Optional code of the [language](/usage/models#languages) to use. Defaults to `"en"`. ~~str (option)~~ |
| `--pipeline`, `-p` | Comma-separated list of trainable [pipeline components](/usage/processing-pipelines#built-in) to include. Defaults to `"tagger,parser,ner"`. ~~str (option)~~ |
| `--optimize`, `-o` | `"efficiency"` or `"accuracy"`. Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters. Defaults to `"efficiency"`. ~~str (option)~~ |
@ -224,8 +224,8 @@ $ python -m spacy init labels [config_path] [output_path] [--code] [--verbose] [
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_path` | Output directory for the label files. Will create one JSON file per component. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
@ -428,7 +428,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
| Name | Description |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
@ -601,8 +601,8 @@ will not be available.
</Accordion>
| Name | Description |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--ignore-warnings`, `-IW` | Ignore warnings, only show stats and errors. ~~bool (flag)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
@ -743,8 +743,8 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
</Accordion>
| Name | Description |
| ----------------------- | --------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `component` | Name of the pipeline component of which the model should be analyzed. ~~str (positional)~~ |
| `--layers`, `-l` | Comma-separated names of layer IDs to print. ~~str (option)~~ |
| `--dimensions`, `-DIM` | Show dimensions of each layer. ~~bool (flag)~~ |
@ -788,8 +788,8 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
```
| Name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `--output`, `-o` | Directory to store trained pipeline in. Will be created if it doesn't exist. ~~Optional[Path] \(positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--verbose`, `-V` | Show more detailed messages during training. ~~bool (flag)~~ |
@ -828,8 +828,8 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
```
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. If `-`, the data will be [read from stdin](/usage/training#config-stdin). ~~Union[Path, str] \(positional)~~ |
| `output_dir` | Directory to save binary weights to on each epoch. ~~Path (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--resume-path`, `-r` | Path to pretrained weights from which to resume pretraining. ~~Optional[Path] \(option)~~ |

View File

@ -264,6 +264,26 @@ defined in the config file.
$ SPACY_CONFIG_OVERRIDES="--system.gpu_allocator pytorch --training.batch_size 128" ./your_script.sh
```
### Reading from standard input {#config-stdin}
Setting the config path to `-` on the command line lets you read the config from
standard input and pipe it forward from a different process, like
[`init config`](/api/cli#init-config) or your own custom script. This is
especially useful for quick experiments, as it lets you generate a config on the
fly without having to save to and load from disk.
> #### 💡 Tip: Writing to stdout
>
> When you run `init config`, you can set the output path to `-` to write to
> stdout. In a custom script, you can print the string config, e.g.
> `print(nlp.config.to_str())`.
```cli
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
```
<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
@ -378,7 +398,8 @@ weights and [resume training](/api/language#resume_training).
If you don't want a component to be updated, you can **freeze** it by adding it
to the `frozen_components` list in the `[training]` block. Frozen components are
**not updated** during training and are included in the final trained pipeline
as-is. They are also excluded when calling [`nlp.initialize`](/api/language#initialize).
as-is. They are also excluded when calling
[`nlp.initialize`](/api/language#initialize).
> #### Note on frozen components
>
@ -551,8 +572,8 @@ or TensorFlow, make **custom modifications** to the `nlp` object, create custom
optimizers or schedules, or **stream in data** and preprocesses it on the fly
while training.
Each custom function can have any number of arguments that are passed in via
the [config](#config), just the built-in functions. If your function defines
Each custom function can have any number of arguments that are passed in via the
[config](#config), just the built-in functions. If your function defines
**default argument values**, spaCy is able to auto-fill your config when you run
[`init fill-config`](/api/cli#init-fill-config). If you want to make sure that a
given parameter is always explicitly set in the config, avoid setting a default
@ -958,10 +979,10 @@ data assets, track changes and share your end-to-end processes with your team.
</Infobox>
The binary `.spacy` format is a serialized [`DocBin`](/api/docbin) containing
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in
storage**, especially when packing multiple documents together. You can also
create `Doc` objects manually, so you can write your own custom logic to convert
and store existing annotations for use in spaCy.
one or more [`Doc`](/api/doc) objects. It's extremely **efficient in storage**,
especially when packing multiple documents together. You can also create `Doc`
objects manually, so you can write your own custom logic to convert and store
existing annotations for use in spaCy.
```python
### Training data from Doc objects {highlight="6-9"}
@ -1300,10 +1321,10 @@ mapping so they know which worker owns which parameter.
As training proceeds, every worker will be computing gradients for **all** of
the model parameters. When they compute gradients for parameters they don't own,
they'll **send them to the worker** that does own that parameter, along with a
version identifier so that the owner can decide whether to discard the
gradient. Workers use the gradients they receive and the ones they compute
locally to update the parameters they own, and then broadcast the updated array
and a new version ID to the other workers.
version identifier so that the owner can decide whether to discard the gradient.
Workers use the gradients they receive and the ones they compute locally to
update the parameters they own, and then broadcast the updated array and a new
version ID to the other workers.
This training procedure is **asynchronous** and **non-blocking**. Workers always
push their gradient increments and parameter updates, they do not have to pull

View File

@ -120,13 +120,9 @@ function parseArgs(raw) {
return result
}
function formatCode(html, lang, prompt) {
if (lang === 'cli') {
function convertLine(line, i) {
console.log(line, i)
const cliRegex = /^(\$ )?python -m spacy/
const lines = html
.trim()
.split('\n')
.map((line, i) => {
if (cliRegex.test(line)) {
const text = line.replace(cliRegex, '')
const args = parseArgs(text)
@ -135,8 +131,8 @@ function formatCode(html, lang, prompt) {
return value === null || value === true || i === 0 ? key : `${key} ${value}`
})
return (
<Fragment key={i}>
<span data-prompt="$" className={classes.cliArgSubtle}>
<Fragment key={line}>
<span data-prompt={i === 0 ? '$' : null} className={classes.cliArgSubtle}>
python -m
</span>{' '}
<span>spacy</span>{' '}
@ -165,7 +161,24 @@ function formatCode(html, lang, prompt) {
}
const htmlLine = replacePrompt(highlightCode('bash', line), '$')
return htmlToReact(htmlLine)
})
}
function formatCode(html, lang, prompt) {
if (lang === 'cli') {
const lines = html
.trim()
.split('\n')
.map(line =>
line
.split(' | ')
.map((l, i) => convertLine(l, i))
.map((l, j) => (
<Fragment>
{j !== 0 && <span> | </span>}
{l}
</Fragment>
))
)
return lines.map((line, i) => (
<Fragment key={i}>
{i !== 0 && <br />}