spaCy/spacy/cli/debug_config.py

from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import typer
from thinc.api import Config
from thinc.config import VARIABLE_RE
from wasabi import msg, table

from .. import util
from ..schemas import ConfigSchemaInit, ConfigSchemaTraining
from ..util import registry
from ._util import (
    Arg,
    Opt,
    debug_cli,
    import_code_paths,
    parse_config_overrides,
    show_validation_error,
)


# Register this function as the "config" sub-command of the debug CLI group.
# The context settings allow extra/unknown arguments through so that they end
# up in ctx.args, where they are parsed as config overrides below.
@debug_cli.command(
    "config",
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def debug_config_cli(
    # fmt: off
    ctx: typer.Context,  # This is only used to read additional arguments
    config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),
    code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),
    show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),
    show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")
    # fmt: on
):
    """Debug a config file and show validation errors. The command will
    create all objects in the tree and validate them. Note that some config
    validation errors are blocking and will prevent the rest of the config from
    being resolved. This means that you may not see all validation errors at
    once and some issues are only shown once previous errors have been fixed.
    Similar as with the 'train' command, you can override settings from the config
    as command line options. For instance, --training.batch_size 128 overrides
    the value of "batch_size" in the block "[training]".

    DOCS: https://spacy.io/api/cli#debug-config
    """
    # NOTE(review): the docstring above is presumably surfaced as the command's
    # help text, so treat it as user-facing output when editing it.
    # Turn the extra command-line arguments (e.g. --training.batch_size 128)
    # into a dict of config overrides.
    overrides = parse_config_overrides(ctx.args)
    # Import the user-supplied code files before the config is loaded, so any
    # functions they register are available when the config is validated.
    import_code_paths(code_path)
    # Hand over to the plain-Python implementation, which does the actual work.
    debug_config(
        config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars
    )


def debug_config(
    config_path: Path,
    *,
    overrides: Optional[Dict[str, Any]] = None,
    show_funcs: bool = False,
    show_vars: bool = False,
):
    """Load a config file, validate it and print a report to the console.

    The config is validated in three stages, each announced with its own
    divider and wrapped in ``show_validation_error(config_path)``: the config
    as a whole (by loading it and building the pipeline from it), the
    ``[initialize]`` block and the ``[training]`` block. If all stages pass,
    a success message is printed, optionally followed by overviews of the
    variables and registered functions referenced in the config.

    config_path (Path): Path to the config file.
    overrides (Optional[Dict[str, Any]]): Config overrides passed on to
        ``util.load_config``. ``None`` (the default) means no overrides.
    show_funcs (bool): Print an overview of the registered functions
        referenced in the config.
    show_vars (bool): Print a table of the variables referenced in the
        config and their values.
    """
    # Use None as the default instead of a dict literal so no mutable object
    # is shared between calls.
    if overrides is None:
        overrides = {}
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp = util.load_model_from_config(config)
        # From here on, work with the interpolated config of the loaded
        # pipeline rather than the raw config read from disk.
        config = nlp.config.interpolate()
    msg.divider("Config validation for [initialize]")
    with show_validation_error(config_path):
        # Resolved purely for validation; the result is not needed.
        registry.resolve(config["initialize"], schema=ConfigSchemaInit)
    msg.divider("Config validation for [training]")
    with show_validation_error(config_path):
        training = registry.resolve(config["training"], schema=ConfigSchemaTraining)
        # The corpus settings are dot names pointing elsewhere in the config,
        # so make sure those can be resolved as well.
        dot_names = [training["train_corpus"], training["dev_corpus"]]
        util.resolve_dot_names(config, dot_names)
    msg.good("Config is valid")
    if show_vars:
        # NOTE(review): `config` is the interpolated copy at this point;
        # confirm that its string form still contains the variable references
        # that get_variables() searches for.
        variables = get_variables(config)
        msg.divider(f"Variables ({len(variables)})")
        head = ("Variable", "Value")
        msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)
    if show_funcs:
        funcs = get_registered_funcs(config)
        msg.divider(f"Registered functions ({len(funcs)})")
        for func in funcs:
            func_data = {
                "Registry": f"@{func['registry']}",
                "Name": func["name"],
                "Module": func["module"],
                "File": f"{func['file']} (line {func['line_no']})",
            }
            # Section heading: the config block that references the function.
            msg.info(f"[{func['path']}]")
            print(table(func_data).strip())


def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:
    """Collect details about every registered function referenced in a config.

    Walks the config and picks out the entries whose final key starts with
    "@". The rest of that key names the registry and the value names the
    function within it.

    config (Config): The config to inspect.
    RETURNS (List[Dict[str, Optional[Union[str, int]]]]): One dict per
        reference, holding the function "name", the "registry" name, the
        dotted "path" of the enclosing config block and whatever the
        registry's find() method reports for the function.
    """
    found = []
    for key_path, func_name in util.walk_dict(config):
        leaf = key_path[-1]
        if leaf.startswith("@"):
            # Strip the "@" marker to get the registry's attribute name.
            reg_name = leaf[1:]
            func_registry = getattr(util.registry, reg_name)
            entry = {
                "name": func_name,
                "registry": reg_name,
                "path": ".".join(key_path[:-1]),
            }
            # Merge in the registry's own details last, so they take
            # precedence over the keys set above.
            entry.update(func_registry.find(func_name))
            found.append(entry)
    return found


def get_variables(config: Config) -> Dict[str, Any]:
    """Map every variable reference found in a config to its current value.

    The config is serialized to a string and searched with VARIABLE_RE. Each
    distinct match is stripped of its first two and last characters
    (presumably the "${" and "}" delimiters), ":" separators are converted to
    ".", and the resulting dotted path is looked up in the config.

    config (Config): The config to inspect.
    RETURNS (Dict[str, Any]): The variable references, in sorted order,
        mapped to the repr() of the value they point to.
    """
    config_text = config.to_str()
    # De-duplicate before sorting so each variable is reported once.
    references = sorted(set(VARIABLE_RE.findall(config_text)))
    return {
        ref: repr(util.dot_to_object(config, ref[2:-1].replace(":", ".")))
        for ref in references
    }
Add more info to debug config 2020-08-27 19:17:58 +03:00			`from pathlib import Path`
isort all the things 2023-06-26 12:41:03 +03:00			`from typing import Any, Dict, List, Optional, Union`

			`import typer`
Refactor CLI 2020-09-28 16:09:59 +03:00			`from thinc.api import Config`
Improve CLI config validation with latest Thinc 2020-09-26 14:13:57 +03:00			`from thinc.config import VARIABLE_RE`
isort all the things 2023-06-26 12:41:03 +03:00			`from wasabi import msg, table`
Add more info to debug config 2020-08-27 19:17:58 +03:00
isort all the things 2023-06-26 12:41:03 +03:00			`from .. import util`
Expand initialize/training config validation Validate both `[initialize]` and `[training]` in `debug data` and `nlp.initialize()` with separate config validation error blocks that indicate which block of the config is being validated. 2021-01-12 19:17:00 +03:00			`from ..schemas import ConfigSchemaInit, ConfigSchemaTraining`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`from ..util import registry`
isort all the things 2023-06-26 12:41:03 +03:00			`from ._util import (`
			`Arg,`
			`Opt,`
			`debug_cli,`
Accept multiple code files in all CLI commands (#12101) * Add support for multiple code files to all relevant commands Prior to this, only the package command supported multiple code files. * Update docs * Add debug data test, plus generic fixtures One tricky thing here: it's tempting to create the config by creating a pipeline in code, but that requires declaring the custom components here. However the CliRunner appears to be run in the same process or otherwise have access to our registry, so it works even without any code arguments. So it's necessary to avoid declaring the components in the tests. * Add debug config test and restructure The code argument imports the provided file. If it adds item to the registry, that affects global state, which CliRunner doesn't isolate. Since there's no standard way to remove things from the registry, this instead uses subprocess.run to run commands. * Use a more generic, parametrized test * Add output arg for assemble and pretrain Assemble and pretrain require an output argument. This commit adds assemble testing, but not pretrain, as that requires an actual trainable component, which is not currently in the test config. * Add evaluate test and some cleanup * Mark tests as slow * Revert argument name change * Apply suggestions from code review Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Format API CLI docs * isort * Fix imports in tests * isort * Undo changes to package CLI help * Fix python executable and lang code in test * Fix executable in another test --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com> 2023-08-01 16:24:02 +03:00			`import_code_paths,`
isort all the things 2023-06-26 12:41:03 +03:00			`parse_config_overrides,`
			`show_validation_error,`
			`)`
Add more info to debug config 2020-08-27 19:17:58 +03:00

			`@debug_cli.command(`
			`"config",`
			`context_settings={"allow_extra_args": True, "ignore_unknown_options": True},`
			`)`
			`def debug_config_cli(`
			`# fmt: off`
			`ctx: typer.Context, # This is only used to read additional arguments`
Update argument handling and documentation 2020-12-08 12:41:18 +03:00			`config_path: Path = Arg(..., help="Path to config file", exists=True, allow_dash=True),`
Accept multiple code files in all CLI commands (#12101) * Add support for multiple code files to all relevant commands Prior to this, only the package command supported multiple code files. * Update docs * Add debug data test, plus generic fixtures One tricky thing here: it's tempting to create the config by creating a pipeline in code, but that requires declaring the custom components here. However the CliRunner appears to be run in the same process or otherwise have access to our registry, so it works even without any code arguments. So it's necessary to avoid declaring the components in the tests. * Add debug config test and restructure The code argument imports the provided file. If it adds item to the registry, that affects global state, which CliRunner doesn't isolate. Since there's no standard way to remove things from the registry, this instead uses subprocess.run to run commands. * Use a more generic, parametrized test * Add output arg for assemble and pretrain Assemble and pretrain require an output argument. This commit adds assemble testing, but not pretrain, as that requires an actual trainable component, which is not currently in the test config. * Add evaluate test and some cleanup * Mark tests as slow * Revert argument name change * Apply suggestions from code review Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Format API CLI docs * isort * Fix imports in tests * isort * Undo changes to package CLI help * Fix python executable and lang code in test * Fix executable in another test --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com> 2023-08-01 16:24:02 +03:00			`code_path: str = Opt("", "--code", "-c", help="Comma-separated paths to Python files with additional code (registered functions) to be imported"),`
Add more info to debug config 2020-08-27 19:17:58 +03:00			`show_funcs: bool = Opt(False, "--show-functions", "-F", help="Show an overview of all registered functions used in the config and where they come from (modules, files etc.)"),`
			`show_vars: bool = Opt(False, "--show-variables", "-V", help="Show an overview of all variables referenced in the config and their values. This will also reflect variables overwritten on the CLI.")`
			`# fmt: on`
			`):`
Fix references to config file in the docs & UX (#9961) * doc fixes around config file * fix typo * clarify default 2022-01-04 16:31:26 +03:00			`"""Debug a config file and show validation errors. The command will`
Add more info to debug config 2020-08-27 19:17:58 +03:00			`create all objects in the tree and validate them. Note that some config`
			`validation errors are blocking and will prevent the rest of the config from`
			`being resolved. This means that you may not see all validation errors at`
			`once and some issues are only shown once previous errors have been fixed.`
			`Similar as with the 'train' command, you can override settings from the config`
			`as command line options. For instance, --training.batch_size 128 overrides`
			`the value of "batch_size" in the block "[training]".`
Update docs links in codebase 2020-09-04 13:58:50 +03:00
Replace links to nightly docs [ci skip] 2021-01-30 12:09:38 +03:00			`DOCS: https://spacy.io/api/cli#debug-config`
Add more info to debug config 2020-08-27 19:17:58 +03:00			`"""`
			`overrides = parse_config_overrides(ctx.args)`
Accept multiple code files in all CLI commands (#12101) * Add support for multiple code files to all relevant commands Prior to this, only the package command supported multiple code files. * Update docs * Add debug data test, plus generic fixtures One tricky thing here: it's tempting to create the config by creating a pipeline in code, but that requires declaring the custom components here. However the CliRunner appears to be run in the same process or otherwise have access to our registry, so it works even without any code arguments. So it's necessary to avoid declaring the components in the tests. * Add debug config test and restructure The code argument imports the provided file. If it adds item to the registry, that affects global state, which CliRunner doesn't isolate. Since there's no standard way to remove things from the registry, this instead uses subprocess.run to run commands. * Use a more generic, parametrized test * Add output arg for assemble and pretrain Assemble and pretrain require an output argument. This commit adds assemble testing, but not pretrain, as that requires an actual trainable component, which is not currently in the test config. * Add evaluate test and some cleanup * Mark tests as slow * Revert argument name change * Apply suggestions from code review Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> * Format API CLI docs * isort * Fix imports in tests * isort * Undo changes to package CLI help * Fix python executable and lang code in test * Fix executable in another test --------- Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com> Co-authored-by: Raphael Mitsch <r.mitsch@outlook.com> 2023-08-01 16:24:02 +03:00			`import_code_paths(code_path)`
Add more info to debug config 2020-08-27 19:17:58 +03:00			`debug_config(`
			`config_path, overrides=overrides, show_funcs=show_funcs, show_vars=show_vars`
			`)`


			`def debug_config(`
			`config_path: Path,`
			`*,`
			`overrides: Dict[str, Any] = {},`
			`show_funcs: bool = False,`
			`show_vars: bool = False,`
			`):`
			`msg.divider("Config validation")`
			`with show_validation_error(config_path):`
			`config = util.load_config(config_path, overrides=overrides)`
Update config resolution to use new Thinc 2020-09-27 23:21:31 +03:00			`nlp = util.load_model_from_config(config)`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`config = nlp.config.interpolate()`
Expand initialize/training config validation Validate both `[initialize]` and `[training]` in `debug data` and `nlp.initialize()` with separate config validation error blocks that indicate which block of the config is being validated. 2021-01-12 19:17:00 +03:00			`msg.divider("Config validation for [initialize]")`
			`with show_validation_error(config_path):`
			`T = registry.resolve(config["initialize"], schema=ConfigSchemaInit)`
			`msg.divider("Config validation for [training]")`
			`with show_validation_error(config_path):`
Fix small issues, resolve_dot_names and debug model 2020-09-29 21:38:35 +03:00			`T = registry.resolve(config["training"], schema=ConfigSchemaTraining)`
			`dot_names = [T["train_corpus"], T["dev_corpus"]]`
			`util.resolve_dot_names(config, dot_names)`
Add more info to debug config 2020-08-27 19:17:58 +03:00			`msg.good("Config is valid")`
			`if show_vars:`
			`variables = get_variables(config)`
			`msg.divider(f"Variables ({len(variables)})")`
			`head = ("Variable", "Value")`
			`msg.table(variables, header=head, divider=True, widths=(41, 34), spacing=2)`
			`if show_funcs:`
			`funcs = get_registered_funcs(config)`
			`msg.divider(f"Registered functions ({len(funcs)})")`
			`for func in funcs:`
			`func_data = {`
			`"Registry": f"@{func['registry']}",`
			`"Name": func["name"],`
			`"Module": func["module"],`
			`"File": f"{func['file']} (line {func['line_no']})",`
			`}`
			`msg.info(f"[{func['path']}]")`
			`print(table(func_data).strip())`


			`def get_registered_funcs(config: Config) -> List[Dict[str, Optional[Union[str, int]]]]:`
			`result = []`
			`for key, value in util.walk_dict(config):`
			`if not key[-1].startswith("@"):`
			`continue`
			`# We have a reference to a registered function`
			`reg_name = key[-1][1:]`
			`registry = getattr(util.registry, reg_name)`
			`path = ".".join(key[:-1])`
			`info = registry.find(value)`
			`result.append({"name": value, "registry": reg_name, "path": path, **info})`
			`return result`


			`def get_variables(config: Config) -> Dict[str, Any]:`
			`result = {}`
			`for variable in sorted(set(VARIABLE_RE.findall(config.to_str()))):`
			`path = variable[2:-1].replace(":", ".")`
			`value = util.dot_to_object(config, path)`
			`result[variable] = repr(value)`
			`return result`