Merge branch 'develop' into fix/language-config-interpolate-disk-bytes

This commit is contained in:
Ines Montani 2020-08-28 13:21:17 +02:00
commit a51b4f3a19
7 changed files with 233 additions and 42 deletions

View File

@ -7,7 +7,7 @@ ml_datasets>=0.1.1
murmurhash>=0.28.0,<1.1.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
# Third party dependencies

View File

@ -44,7 +44,7 @@ install_requires =
blis>=0.4.0,<0.5.0
wasabi>=0.8.0,<1.1.0
srsly>=2.1.0,<3.0.0
catalogue>=0.0.7,<1.1.0
catalogue>=2.0.1,<2.1.0
typer>=0.3.0,<0.4.0
pathy
# Third-party dependencies

View File

@ -11,6 +11,7 @@ from .profile import profile # noqa: F401
from .train import train_cli # noqa: F401
from .pretrain import pretrain # noqa: F401
from .debug_data import debug_data # noqa: F401
from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .evaluate import evaluate # noqa: F401
from .convert import convert # noqa: F401

93
spacy/cli/debug_config.py Normal file
View File

@ -0,0 +1,93 @@
from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
from ._util import import_code, debug_cli
from .. import util
@debug_cli.command(
"config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
once and some issues are only shown once previous errors have been fixed.
Similar as with the 'train' command, you can override settings from the config
as command line options. For instance, --training.batch_size 128 overrides
the value of "batch_size" in the block "[training]".
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
debug_config(
config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
)
def debug_config(
config_path: Path,
*,
overrides: Dict[str, Any] = {},
show_funcs: bool = False,
show_vars: bool = False,
):
msg.divider("Config validation")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp, _ = util.load_model_from_config(config)
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
msg.divider(f"Variables ({len(variables)})")
head = ("Variable", "Value")
msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
if show_funcs:
funcs = get_registered_funcs(config)
msg.divider(f"Registered functions ({len(funcs)})")
for func in funcs:
func_data = {
"Registry": f"@{func['registry']}",
"Name": func["name"],
"Module": func["module"],
"File": f"{func['file']} (line {func['line_no']})",
}
msg.info(f"[{func['path']}]")
print(table(func_data).strip())
def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:
result = []
for key, value in util.walk_dict(config):
if not key[-1].startswith("@"):
continue
# We have a reference to a registered function
reg_name = key[-1][1:]
registry = getattr(util.registry, reg_name)
path = ".".join(key[:-1])
info = registry.find(value)
result.append({"name": value, "registry": reg_name, "path": path, **info})
return result
def get_variables(config: Config) -> Dict[str, Any]:
result = {}
for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))):
path = variable[2:-1].replace(":", ".")
value = util.dot_to_object(config, path)
result[variable] = repr(value)
return result

View File

@ -23,34 +23,6 @@ BLANK_MODEL_MIN_THRESHOLD = 100
BLANK_MODEL_THRESHOLD = 2000
@debug_cli.command(
"config",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_config_cli(
# fmt: off
ctx: typer.Context, # This is only used to read additional arguments
config_path: Path = Arg(..., help="Path to config file", exists=True),
code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
# fmt: on
):
"""Debug a config.cfg file and show validation errors. The command will
create all objects in the tree and validate them. Note that some config
validation errors are blocking and will prevent the rest of the config from
being resolved. This means that you may not see all validation errors at
once and some issues are only shown once previous errors have been fixed.
Similar as with the 'train' command, you can override settings from the config
as command line options. For instance, --training.batch_size 128 overrides
the value of "batch_size" in the block "[training]".
"""
overrides = parse_config_overrides(ctx.args)
import_code(code_path)
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp, _ = util.load_model_from_config(config)
msg.good("Original config is valid")
@debug_cli.command(
"data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)

View File

@ -1,4 +1,6 @@
import pytest
from click import NoSuchOption
from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English
@ -372,9 +374,18 @@ def test_parse_config_overrides(args, expected):
@pytest.mark.parametrize(
"args",
[["--foo"], ["--x.foo", "bar", "--baz"], ["--x.foo", "bar", "baz"], ["x.foo"]],
[["--foo"], ["--x.foo", "bar", "--baz"]],
)
def test_parse_config_overrides_invalid(args):
with pytest.raises(NoSuchOption):
parse_config_overrides(args)
@pytest.mark.parametrize(
"args",
[["--x.foo", "bar", "baz"], ["x.foo"]],
)
def test_parse_config_overrides_invalid_2(args):
with pytest.raises(SystemExit):
parse_config_overrides(args)

View File

@ -246,19 +246,19 @@ some config validation errors are blocking and will prevent the rest of the
config from being resolved. This means that you may not see all validation
errors at once and some issues are only shown once previous errors have been
fixed. To auto-fill a partial config and save the result, you can use the
[`init fillconfig`](/api/cli#init-fill-config) command.
[`init fill-config`](/api/cli#init-fill-config) command.
```cli
$ python -m spacy debug config [config_path] [--code_path] [overrides]
$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides]
```
> #### Example
>
> ```cli
> $ python -m spacy debug config ./config.cfg
> $ python -m spacy debug config config.cfg
> ```
<Accordion title="Example output" spaced>
<Accordion title="Example output (validation error)">
```
✘ Config validation error
@ -277,13 +277,127 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start
</Accordion>
| Name | Description |
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Config validation errors, if available. |
<Accordion title="Example output (valid config and all options)" spaced>
```cli
$ python -m spacy debug config ./config.cfg --show-functions --show-variables
```
```
============================= Config validation =============================
✔ Config is valid
=============================== Variables (6) ===============================
Variable Value
----------------------------------------- ----------------------------------
${components.tok2vec.model.encode.width} 96
${paths.dev} 'hello'
${paths.init_tok2vec} None
${paths.raw} None
${paths.train} ''
${system.seed} 0
========================= Registered functions (17) =========================
[nlp.tokenizer]
Registry @tokenizers
Name spacy.Tokenizer.v1
Module spacy.language
File /path/to/spacy/language.py (line 64)
[components.ner.model]
Registry @architectures
Name spacy.TransitionBasedParser.v1
Module spacy.ml.models.parser
File /path/to/spacy/ml/models/parser.py (line 11)
[components.ner.model.tok2vec]
Registry @architectures
Name spacy.Tok2VecListener.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 16)
[components.parser.model]
Registry @architectures
Name spacy.TransitionBasedParser.v1
Module spacy.ml.models.parser
File /path/to/spacy/ml/models/parser.py (line 11)
[components.parser.model.tok2vec]
Registry @architectures
Name spacy.Tok2VecListener.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 16)
[components.tagger.model]
Registry @architectures
Name spacy.Tagger.v1
Module spacy.ml.models.tagger
File /path/to/spacy/ml/models/tagger.py (line 9)
[components.tagger.model.tok2vec]
Registry @architectures
Name spacy.Tok2VecListener.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 16)
[components.tok2vec.model]
Registry @architectures
Name spacy.Tok2Vec.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 72)
[components.tok2vec.model.embed]
Registry @architectures
Name spacy.MultiHashEmbed.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 93)
[components.tok2vec.model.encode]
Registry @architectures
Name spacy.MaxoutWindowEncoder.v1
Module spacy.ml.models.tok2vec
File /path/to/spacy/ml/models/tok2vec.py (line 207)
[training.logger]
Registry @loggers
Name spacy.ConsoleLogger.v1
Module spacy.gold.loggers
File /path/to/spacy/gold/loggers.py (line 8)
[training.batcher]
Registry @batchers
Name batch_by_words.v1
Module spacy.gold.batchers
File /path/to/spacy/gold/batchers.py (line 49)
[training.batcher.size]
Registry @schedules
Name compounding.v1
Module thinc.schedules
File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 43)
[training.dev_corpus]
Registry @readers
Name spacy.Corpus.v1
Module spacy.gold.corpus
File /path/to/spacy/gold/corpus.py (line 18)
[training.optimizer]
Registry @optimizers
Name Adam.v1
Module thinc.optimizers
File /Users/ines/Repos/explosion/thinc/thinc/optimizers.py (line 58)
[training.optimizer.learn_rate]
Registry @schedules
Name warmup_linear.v1
Module thinc.schedules
File /Users/ines/Repos/explosion/thinc/thinc/schedules.py (line 91)
[training.train_corpus]
Registry @readers
Name spacy.Corpus.v1
Module spacy.gold.corpus
File /path/to/spacy/gold/corpus.py (line 18)
```
</Accordion>
| Name | Description |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ |
| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ |
| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **PRINTS** | Config validation errors, if available. |
### debug data {#debug-data tag="command"}