diff --git a/spacy/cli/debug_config.py b/spacy/cli/debug_config.py index d1dcc45b9..a6c7345f0 100644 --- a/spacy/cli/debug_config.py +++ b/spacy/cli/debug_config.py @@ -7,6 +7,8 @@ import typer from ._util import Arg, Opt, show_validation_error, parse_config_overrides from ._util import import_code, debug_cli +from ..schemas import ConfigSchemaTraining +from ..util import registry from .. import util @@ -52,8 +54,10 @@ def debug_config( with show_validation_error(config_path): config = util.load_config(config_path, overrides=overrides) nlp = util.load_model_from_config(config) - dot_names = ["training.dev_corpus", "training.train_corpus"] - util.resolve_dot_names(nlp.config, dot_names) + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) + dot_names = [T["train_corpus"], T["dev_corpus"]] + util.resolve_dot_names(config, dot_names) msg.good("Config is valid") if show_vars: variables = get_variables(config) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index eca85dc04..3b8ba7dae 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -2,7 +2,7 @@ from typing import Dict, Any, Optional, Iterable from pathlib import Path from spacy.training import Example -from spacy.util import dot_to_object +from spacy.util import resolve_dot_names from wasabi import msg from thinc.api import fix_random_seed, set_dropout_rate, Adam from thinc.api import Model, data_validation, set_gpu_allocator @@ -15,7 +15,10 @@ from ..util import registry from .. import util -@debug_cli.command("model") +@debug_cli.command( + "model", + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) def debug_model_cli( # fmt: off ctx: typer.Context, # This is only used to read additional arguments @@ -57,15 +60,14 @@ def debug_model_cli( raw_config = util.load_config( config_path, overrides=config_overrides, interpolate=False ) - config = raw_config.iterpolate() + config = raw_config.interpolate() allocator = config["training"]["gpu_allocator"] if use_gpu >= 0 and allocator: set_gpu_allocator(allocator) with show_validation_error(config_path): nlp = util.load_model_from_config(raw_config) - T = registry.resolve( - nlp.config.interpolate()["training"], schema=ConfigSchemaTraining - ) + config = nlp.config.interpolate() + T = registry.resolve(config["training"], schema=ConfigSchemaTraining) seed = T["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") @@ -77,11 +79,16 @@ def debug_model_cli( exits=1, ) model = pipe.model - debug_model(T, nlp, model, print_settings=print_settings) + debug_model(config, T, nlp, model, print_settings=print_settings) def debug_model( - config, nlp, model: Model, *, print_settings: Optional[Dict[str, Any]] = None + config, + resolved_train_config, + nlp, + model: Model, + *, + print_settings: Optional[Dict[str, Any]] = None, ): if not isinstance(model, Model): msg.fail( @@ -102,13 +109,16 @@ def debug_model( # The output vector might differ from the official type of the output layer with data_validation(False): try: - train_corpus = dot_to_object(config, config["training"]["train_corpus"]) - nlp.initialize(lambda: train_corpus(nlp)) + dot_names = [resolved_train_config["train_corpus"]] + with show_validation_error(): + (train_corpus,) = resolve_dot_names(config, dot_names) + nlp.initialize(lambda: train_corpus(nlp)) msg.info("Initialized the model with the training corpus.") except ValueError: try: _set_output_dim(nO=7, model=model) - nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) + with show_validation_error(): + nlp.initialize(lambda: [Example.from_dict(x, {}) for x in X]) msg.info("Initialized the model with dummy data.") except Exception: msg.fail( diff --git a/spacy/schemas.py b/spacy/schemas.py index 555a505d7..d9a31c742 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -389,14 +389,12 @@ class ConfigSchema(BaseModel): arbitrary_types_allowed = True -class TrainingSchema(BaseModel): - training: ConfigSchemaTraining - pretraining: Union[ConfigSchemaPretrain, ConfigSchemaPretrainEmpty] = {} - corpora: Dict[str, Reader] - - class Config: - extra = "allow" - arbitrary_types_allowed = True +CONFIG_SCHEMAS = { + "nlp": ConfigSchemaNlp, + "training": ConfigSchemaTraining, + "pretraining": ConfigSchemaPretrain, + "initialize": ConfigSchemaInit, +} # Project config Schema diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index f48cfba00..f710a38eb 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -128,10 +128,10 @@ def test_resolve_dot_names(): "training": {"optimizer": {"@optimizers": "Adam.v1"}}, "foo": {"bar": "training.optimizer", "baz": "training.xyz"}, } - result = util.resolve_dot_names(config, ["foo.bar"]) + result = util.resolve_dot_names(config, ["training.optimizer"]) assert isinstance(result[0], Optimizer) with pytest.raises(ConfigValidationError) as e: - util.resolve_dot_names(config, ["foo.baz", "foo.bar"]) + util.resolve_dot_names(config, ["training.xyz", "training.optimizer"]) errors = e.value.errors assert len(errors) == 1 assert errors[0]["loc"] == ["training", "xyz"] diff --git a/spacy/tests/training/test_readers.py b/spacy/tests/training/test_readers.py index ea39e8b90..9d82ca50a 100644 --- a/spacy/tests/training/test_readers.py +++ b/spacy/tests/training/test_readers.py @@ -39,12 +39,12 @@ def test_readers(): config = Config().from_str(config_string) nlp = load_model_from_config(config, auto_fill=True) - dot_names = ["training.train_corpus", "training.dev_corpus"] - train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) - assert isinstance(train_corpus, Callable) T = registry.resolve( nlp.config.interpolate()["training"], schema=ConfigSchemaTraining ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) + assert isinstance(train_corpus, Callable) optimizer = T["optimizer"] # simulate a training loop nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) @@ -92,11 +92,11 @@ def test_cat_readers(reader, additional_config): config["corpora"]["@readers"] = reader config["corpora"].update(additional_config) nlp = load_model_from_config(config, auto_fill=True) - dot_names = ["training.train_corpus", "training.dev_corpus"] - train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) T = registry.resolve( nlp.config["training"].interpolate(), schema=ConfigSchemaTraining ) + dot_names = [T["train_corpus"], T["dev_corpus"]] + train_corpus, dev_corpus = resolve_dot_names(nlp.config, dot_names) optimizer = T["optimizer"] # simulate a training loop nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer) diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py index aa5edde5d..09ac2b0ac 100644 --- a/spacy/training/initialize.py +++ b/spacy/training/initialize.py @@ -130,12 +130,12 @@ def init_tok2vec( init_tok2vec = ensure_path(I["init_tok2vec"]) if init_tok2vec is not None: if P["objective"].get("type") == "vectors" and not I["vectors"]: - err = 'need initialize.vocab.vectors if pretraining.objective.type is "vectors"' - errors = [{"loc": ["initialize", "vocab"], "msg": err}] + err = 'need initialize.vectors if pretraining.objective.type is "vectors"' + errors = [{"loc": ["initialize"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) if not init_tok2vec.exists(): err = f"can't find pretrained tok2vec: {init_tok2vec}" - errors = [{"loc": ["initialize", "vocab", "init_tok2vec"], "msg": err}] + errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}] raise ConfigValidationError(config=nlp.config, errors=errors) with init_tok2vec.open("rb") as file_: weights_data = file_.read() diff --git a/spacy/training/loop.py b/spacy/training/loop.py index 41e6464e0..e20cddd3e 100644 --- a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -29,9 +29,7 @@ def train( output_path (Path): Optional output path to save trained model to. use_gpu (int): Whether to train on GPU. Make sure to call require_gpu before calling this function. - logger (Callable[[Any], Any]): Optional logger exposing the methods info, - error, debug and warn. Defaults to regular spaCy logger but can be - swapped for CLI logger. + silent (bool): Whether to pretty-print outputs. RETURNS (Path / None): The path to the final exported model. """ msg = Printer(no_print=silent) diff --git a/spacy/util.py b/spacy/util.py index 98c2a4083..2dfd00e2f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -392,7 +392,6 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A we could find the lowest part of the tree. """ # TODO: include schema? - # TODO: clean this up and avoid duplication resolved = {} output = [] errors = [] @@ -403,34 +402,20 @@ def resolve_dot_names(config: Config, dot_names: List[Optional[str]]) -> Tuple[A section = name.split(".")[0] # We want to avoid resolving the same thing twice if section not in resolved: - resolved[section] = registry.resolve(config[section]) + if registry.is_promise(config[section]): + # Otherwise we can't resolve [corpus] if it's a promise + result = registry.resolve({"config": config[section]})["config"] + else: + result = registry.resolve(config[section]) + resolved[section] = result try: output.append(dot_to_object(resolved, name)) except KeyError: msg = f"not a valid section reference: {name}" errors.append({"loc": name.split("."), "msg": msg}) - objects = [] - for ref in output: - if not isinstance(ref, str): - objects.append(ref) - continue - section = ref.split(".")[0] - # We want to avoid resolving the same thing twice - if section not in resolved: - if registry.is_promise(config[section]): - # Otherwise we can't resolve [corpus] if it's a promise - result = registry.resolve({"config": config[section]})["config"] - else: - result = registry.resolve(config[section]) - resolved[section] = result - try: - objects.append(dot_to_object(resolved, ref)) - except KeyError: - msg = f"not a valid section reference: {name}" - errors.append({"loc": ref.split("."), "msg": msg}) if errors: raise ConfigValidationError(config=config, errors=errors) - return tuple(objects) + return tuple(output) def load_model_from_init_py(