diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index cb65b8c8b..901b382bf 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -20,6 +20,7 @@ import spacy from spacy import util from spacy.util import minibatch, compounding from spacy.gold import Example +from thinc.api import Config @plac.annotations( @@ -42,8 +43,8 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non output_dir.mkdir() print(f"Loading nlp model from {config_path}") - nlp_config = util.load_config(config_path, create_objects=False)["nlp"] - nlp = util.load_model_from_config(nlp_config) + nlp_config = Config().from_disk(config_path) + nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True) # ensure the nlp object was defined with a textcat component if "textcat" not in nlp.pipe_names: diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg index 7c0f36b57..a1f4e91ce 100644 --- a/examples/training/train_textcat_config.cfg +++ b/examples/training/train_textcat_config.cfg @@ -1,19 +1,14 @@ [nlp] lang = "en" +pipeline = ["textcat"] -[nlp.pipeline.textcat] +[components] + +[components.textcat] factory = "textcat" -[nlp.pipeline.textcat.model] -@architectures = "spacy.TextCatCNN.v1" -exclusive_classes = false - -[nlp.pipeline.textcat.model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 96 -depth = 4 -embed_size = 2000 -window_size = 1 -maxout_pieces = 3 -subword_features = true +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 936a7492e..88e060238 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -8,6 +8,7 @@ import typer from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides from .. import util from ..lang.en import English +from ..util import dot_to_object @debug_cli.command("model") @@ -60,16 +61,7 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) - component = config - parts = section.split(".") - for item in parts: - try: - component = component[item] - except KeyError: - msg.fail( - f"The section '{section}' is not a valid section in the provided config.", - exits=1, - ) + component = dot_to_object(config, section) if hasattr(component, "model"): model = component.model else: diff --git a/spacy/errors.py b/spacy/errors.py index 5d26407ad..204a3a2c9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -483,7 +483,8 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master - E952 = ("Invalid requirement specified by component '{name}': {req}") + E951 = ("Invalid requirement specified by component '{name}': {req}") + E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.") @@ -593,7 +594,7 @@ class Errors: "for the `nlp` pipeline with components {names}.") E993 = ("The config for 'nlp' needs to include a key 'lang' specifying " "the code of the language to initialize it with (for example " - "'en' for English).\n\n{config}") + "'en' for English) - this can't be 'None'.\n\n{config}") E996 = ("Could not parse {file}: {msg}") E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " diff --git a/spacy/language.py b/spacy/language.py index 26ac85a5d..8fd2d6521 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -213,6 +213,9 @@ class Language: self._meta["labels"] = self.pipe_labels reqs = {p: self.get_pipe_meta(p).package_requirements for p in self.pipe_names} self._meta["requirements"].extend(util.merge_pipe_requirements(reqs)) + # TODO: Adding this back to prevent breaking people's code etc., but + # we should consider removing it + self._meta["pipeline"] = self.pipe_names return self._meta @meta.setter diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 65c33c54a..3a6c0fd95 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -2,7 +2,13 @@ import pytest from .util import get_random_doc -from spacy.util import minibatch_by_words +from spacy import util +from spacy.util import minibatch_by_words, dot_to_object +from thinc.api import Config, Optimizer + +from ..lang.en import English +from ..lang.nl import Dutch +from ..language import DEFAULT_CONFIG_PATH @pytest.mark.parametrize( @@ -56,3 +62,49 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) ) assert [len(batch) for batch in batches] == expected_batches + + +def test_util_dot_section(): + cfg_string = """ + [nlp] + lang = "en" + pipeline = ["textcat"] + load_vocab_data = false + + [components] + + [components.textcat] + factory = "textcat" + + [components.textcat.model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = true + ngram_size = 1 + no_output_layer = false + """ + nlp_config = Config().from_str(cfg_string) + en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True) + + default_config = Config().from_disk(DEFAULT_CONFIG_PATH) + default_config["nlp"]["lang"] = "nl" + nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True) + + # Test that creation went OK + assert isinstance(en_nlp, English) + assert isinstance(nl_nlp, Dutch) + assert nl_nlp.pipe_names == [] + assert en_nlp.pipe_names == ["textcat"] + assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes + + # Test that default values got overwritten + assert not en_config["nlp"]["load_vocab_data"] + assert nl_config["nlp"]["load_vocab_data"] # default value True + + # Test proper functioning of 'dot_to_object' + with pytest.raises(KeyError): + obj = dot_to_object(en_config, "nlp.pipeline.tagger") + with pytest.raises(KeyError): + obj = dot_to_object(en_config, "nlp.unknownattribute") + assert not dot_to_object(en_config, "nlp.load_vocab_data") + assert dot_to_object(nl_config, "nlp.load_vocab_data") + assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) diff --git a/spacy/util.py b/spacy/util.py index ccd6d3869..d4a2ca90a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -259,7 +259,7 @@ def load_model_from_config( if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) nlp_config = config["nlp"] - if "lang" not in nlp_config: + if "lang" not in nlp_config or nlp_config["lang"] is None: raise ValueError(Errors.E993.format(config=nlp_config)) # This will automatically handle all codes registered via the languages # registry, including custom subclasses provided via entry points @@ -1136,6 +1136,25 @@ def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]: return {".".join(key): value for key, value in walk_dict(obj)} +def dot_to_object(config: Config, section: str): + """Convert dot notation of a "section" to a specific part of the Config. + e.g. "training.optimizer" would return the Optimizer object. + Throws an error if the section is not defined in this config. + + config (Config): The config. + section (str): The dot notation of the section in the config. + RETURNS: The object denoted by the section + """ + component = config + parts = section.split(".") + for item in parts: + try: + component = component[item] + except (KeyError, TypeError): + raise KeyError(Errors.E952.format(name=section)) + return component + + def walk_dict( node: Dict[str, Any], parent: List[str] = [] ) -> Iterator[Tuple[List[str], Any]]: