diff --git a/requirements.txt b/requirements.txt index b68fc89ae..9b108de8d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,7 +7,7 @@ ml_datasets>=0.1.1 murmurhash>=0.28.0,<1.1.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 -catalogue>=0.0.7,<1.1.0 +catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy # Third party dependencies diff --git a/setup.cfg b/setup.cfg index 90c314cc5..fc33abedb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -44,7 +44,7 @@ install_requires = blis>=0.4.0,<0.5.0 wasabi>=0.8.0,<1.1.0 srsly>=2.1.0,<3.0.0 - catalogue>=0.0.7,<1.1.0 + catalogue>=2.0.1,<2.1.0 typer>=0.3.0,<0.4.0 pathy # Third-party dependencies diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 1503acd24..b47c1c16b 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -11,6 +11,7 @@ from .profile import profile # noqa: F401 from .train import train_cli # noqa: F401 from .pretrain import pretrain # noqa: F401 from .debug_data import debug_data # noqa: F401 +from .debug_config import debug_config # noqa: F401 from .debug_model import debug_model # noqa: F401 from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py new file mode 100644 index 000000000..2944cd364 --- /dev/null +++ b/spacy/cli/debug_config.py @@ -0,0 +1,93 @@ +from typing import Optional, Dict, Any, Union, List +from pathlib import Path +from wasabi import msg, table +from thinc.api import Config +from thinc.config import VARIABLE_RE +import typer + +from ._util import Arg, Opt, show_validation_error, parse_config_overrides +from ._util import import_code, debug_cli +from .. 
import util + + +@debug_cli.command( + "config", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def debug_config_cli( + # fmt: off + ctx: typer.Context, # This is only used to read additional arguments + config_path: Path = Arg(..., help="Path to config file", exists=True), + code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), + show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"), + show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.") + # fmt: on +): + """Debug a config.cfg file and show validation errors. The command will + create all objects in the tree and validate them. Note that some config + validation errors are blocking and will prevent the rest of the config from + being resolved. This means that you may not see all validation errors at + once and some issues are only shown once previous errors have been fixed. + Similar as with the 'train' command, you can override settings from the config + as command line options. For instance, --training.batch_size 128 overrides + the value of "batch_size" in the block "[training]". 
+ """ + overrides = parse_config_overrides(ctx.args) + import_code(code_path) + debug_config( + config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars + ) + + +def debug_config( + config_path: Path, + *, + overrides: Dict[str, Any] = {}, + show_funcs: bool = False, + show_vars: bool = False, +): + msg.divider("Config validation") + with show_validation_error(config_path): + config = util.load_config(config_path, overrides=overrides) + nlp, _ = util.load_model_from_config(config) + msg.good("Config is valid") + if show_vars: + variables = get_variables(config) + msg.divider(f"Variables ({len(variables)})") + head = ("Variable", "Value") + msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2) + if show_funcs: + funcs = get_registered_funcs(config) + msg.divider(f"Registered functions ({len(funcs)})") + for func in funcs: + func_data = { + "Registry": f"@{func['registry']}", + "Name": func["name"], + "Module": func["module"], + "File": f"{func['file']} (line {func['line_no']})", + } + msg.info(f"[{func['path']}]") + print(table(func_data).strip()) + + +def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]: + result = [] + for key, value in util.walk_dict(config): + if not key[-1].startswith("@"): + continue + # We have a reference to a registered function + reg_name = key[-1][1:] + registry = getattr(util.registry, reg_name) + path = ".".join(key[:-1]) + info = registry.find(value) + result.append({"name": value, "registry": reg_name, "path": path, **info}) + return result + + +def get_variables(config: Config) -> Dict[str, Any]: + result = {} + for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))): + path = variable[2:-1].replace(":", ".") + value = util.dot_to_object(config, path) + result[variable] = repr(value) + return result diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index b23705311..2f48a29cd 100644 --- a/spacy/cli/debug_data.py +++ 
b/spacy/cli/debug_data.py @@ -23,34 +23,6 @@ BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 -@debug_cli.command( - "config", - context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, -) -def debug_config_cli( - # fmt: off - ctx: typer.Context, # This is only used to read additional arguments - config_path: Path = Arg(..., help="Path to config file", exists=True), - code_path: Optional[Path] = Opt(None, "--code-path", "-c", help="Path to Python file with additional code (registered functions) to be imported"), - # fmt: on -): - """Debug a config.cfg file and show validation errors. The command will - create all objects in the tree and validate them. Note that some config - validation errors are blocking and will prevent the rest of the config from - being resolved. This means that you may not see all validation errors at - once and some issues are only shown once previous errors have been fixed. - Similar as with the 'train' command, you can override settings from the config - as command line options. For instance, --training.batch_size 128 overrides - the value of "batch_size" in the block "[training]". 
- """ - overrides = parse_config_overrides(ctx.args) - import_code(code_path) - with show_validation_error(config_path): - config = util.load_config(config_path, overrides=overrides) - nlp, _ = util.load_model_from_config(config) - msg.good("Original config is valid") - - @debug_cli.command( "data", context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, ) diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 448aaf202..240683f7d 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,4 +1,6 @@ import pytest +from click import NoSuchOption + from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.lang.en import English @@ -372,9 +374,18 @@ def test_parse_config_overrides(args, expected): @pytest.mark.parametrize( "args", - [["--foo"], ["--x.foo", "bar", "--baz"], ["--x.foo", "bar", "baz"], ["x.foo"]], + [["--foo"], ["--x.foo", "bar", "--baz"]], ) def test_parse_config_overrides_invalid(args): + with pytest.raises(NoSuchOption): + parse_config_overrides(args) + + +@pytest.mark.parametrize( + "args", + [["--x.foo", "bar", "baz"], ["x.foo"]], +) +def test_parse_config_overrides_invalid_2(args): with pytest.raises(SystemExit): parse_config_overrides(args) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 028036f87..9070855fa 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -246,19 +246,19 @@ some config validation errors are blocking and will prevent the rest of the config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. To auto-fill a partial config and save the result, you can use the -[`init fillconfig`](/api/cli#init-fill-config) command. +[`init fill-config`](/api/cli#init-fill-config) command. 
```cli -$ python -m spacy debug config [config_path] [--code_path] [overrides] +$ python -m spacy debug config [config_path] [--code-path] [--show-functions] [--show-variables] [overrides] ``` > #### Example > > ```cli -> $ python -m spacy debug config ./config.cfg +> $ python -m spacy debug config config.cfg > ``` - + ``` ✘ Config validation error @@ -277,13 +277,127 @@ python -m spacy init fill-config tmp/starter-config_invalid.cfg --base tmp/start -| Name | Description | -| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | -| `--code_path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | -| **PRINTS** | Config validation errors, if available. 
| + + +```cli +$ python -m spacy debug config ./config.cfg --show-functions --show-variables +``` + +``` +============================= Config validation ============================= +✔ Config is valid + +=============================== Variables (6) =============================== + +Variable Value +----------------------------------------- ---------------------------------- +${components.tok2vec.model.encode.width} 96 +${paths.dev} 'hello' +${paths.init_tok2vec} None +${paths.raw} None +${paths.train} '' +${system.seed} 0 + + +========================= Registered functions (17) ========================= +ℹ [nlp.tokenizer] +Registry @tokenizers +Name spacy.Tokenizer.v1 +Module spacy.language +File /path/to/spacy/language.py (line 64) +ℹ [components.ner.model] +Registry @architectures +Name spacy.TransitionBasedParser.v1 +Module spacy.ml.models.parser +File /path/to/spacy/ml/models/parser.py (line 11) +ℹ [components.ner.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.parser.model] +Registry @architectures +Name spacy.TransitionBasedParser.v1 +Module spacy.ml.models.parser +File /path/to/spacy/ml/models/parser.py (line 11) +ℹ [components.parser.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.tagger.model] +Registry @architectures +Name spacy.Tagger.v1 +Module spacy.ml.models.tagger +File /path/to/spacy/ml/models/tagger.py (line 9) +ℹ [components.tagger.model.tok2vec] +Registry @architectures +Name spacy.Tok2VecListener.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 16) +ℹ [components.tok2vec.model] +Registry @architectures +Name spacy.Tok2Vec.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 72) +ℹ [components.tok2vec.model.embed] +Registry @architectures 
+Name spacy.MultiHashEmbed.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 93) +ℹ [components.tok2vec.model.encode] +Registry @architectures +Name spacy.MaxoutWindowEncoder.v1 +Module spacy.ml.models.tok2vec +File /path/to/spacy/ml/models/tok2vec.py (line 207) +ℹ [training.logger] +Registry @loggers +Name spacy.ConsoleLogger.v1 +Module spacy.gold.loggers +File /path/to/spacy/gold/loggers.py (line 8) +ℹ [training.batcher] +Registry @batchers +Name batch_by_words.v1 +Module spacy.gold.batchers +File /path/to/spacy/gold/batchers.py (line 49) +ℹ [training.batcher.size] +Registry @schedules +Name compounding.v1 +Module thinc.schedules +File /path/to/thinc/schedules.py (line 43) +ℹ [training.dev_corpus] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.gold.corpus +File /path/to/spacy/gold/corpus.py (line 18) +ℹ [training.optimizer] +Registry @optimizers +Name Adam.v1 +Module thinc.optimizers +File /path/to/thinc/optimizers.py (line 58) +ℹ [training.optimizer.learn_rate] +Registry @schedules +Name warmup_linear.v1 +Module thinc.schedules +File /path/to/thinc/schedules.py (line 91) +ℹ [training.train_corpus] +Registry @readers +Name spacy.Corpus.v1 +Module spacy.gold.corpus +File /path/to/spacy/gold/corpus.py (line 18) +``` + + + +| Name | Description | +| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `config_path` | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. ~~Path (positional)~~ | +| `--code-path`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. 
~~Optional[Path] \(option)~~ | +| `--show-functions`, `-F` | Show an overview of all registered function blocks used in the config and where those functions come from, including the module name, Python file and line number. ~~bool (flag)~~ | +| `--show-variables`, `-V` | Show an overview of all variables referenced in the config, e.g. `${paths.train}` and their values that will be used. This also reflects any config overrides provided on the CLI, e.g. `--paths.train /path`. ~~bool (flag)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ | +| **PRINTS** | Config validation errors, if available. | ### debug data {#debug-data tag="command"}