From 674c39bff9483ffe584d2c95d52d37131c69f279 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 Jul 2020 16:48:21 +0200 Subject: [PATCH 1/6] fix train_textcat script --- examples/training/train_textcat.py | 6 ++++-- examples/training/train_textcat_config.cfg | 23 +++++++++------------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index cb65b8c8b..d94fbfd4a 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -20,6 +20,7 @@ import spacy from spacy import util from spacy.util import minibatch, compounding from spacy.gold import Example +from thinc.api import Config @plac.annotations( @@ -42,8 +43,9 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non output_dir.mkdir() print(f"Loading nlp model from {config_path}") - nlp_config = util.load_config(config_path, create_objects=False)["nlp"] - nlp = util.load_model_from_config(nlp_config) + nlp_config = Config().from_disk(config_path) + print(f"config: {nlp_config}") + nlp, _ = util.load_model_from_config(nlp_config) # ensure the nlp object was defined with a textcat component if "textcat" not in nlp.pipe_names: diff --git a/examples/training/train_textcat_config.cfg b/examples/training/train_textcat_config.cfg index 7c0f36b57..a1f4e91ce 100644 --- a/examples/training/train_textcat_config.cfg +++ b/examples/training/train_textcat_config.cfg @@ -1,19 +1,14 @@ [nlp] lang = "en" +pipeline = ["textcat"] -[nlp.pipeline.textcat] +[components] + +[components.textcat] factory = "textcat" -[nlp.pipeline.textcat.model] -@architectures = "spacy.TextCatCNN.v1" -exclusive_classes = false - -[nlp.pipeline.textcat.model.tok2vec] -@architectures = "spacy.HashEmbedCNN.v1" -pretrained_vectors = null -width = 96 -depth = 4 -embed_size = 2000 -window_size = 1 -maxout_pieces = 3 -subword_features = true +[components.textcat.model] +@architectures = "spacy.TextCatBOW.v1" +exclusive_classes = true +ngram_size = 1 +no_output_layer = false From 61068e0fb1b3851a06f82cc95289299154799cab Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 Jul 2020 17:50:12 +0200 Subject: [PATCH 2/6] util function dot_to_object and corresponding unit test --- examples/training/train_textcat.py | 2 +- spacy/cli/debug_model.py | 12 ++----- spacy/errors.py | 2 +- spacy/tests/test_util.py | 55 +++++++++++++++++++++++++++++- spacy/util.py | 22 +++++++++++- 5 files changed, 79 insertions(+), 14 deletions(-) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index d94fbfd4a..92cdba054 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -45,7 +45,7 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non print(f"Loading nlp model from {config_path}") nlp_config = Config().from_disk(config_path) print(f"config: {nlp_config}") - nlp, _ = util.load_model_from_config(nlp_config) + nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True) # ensure the nlp object was defined with a textcat component if "textcat" not in nlp.pipe_names: diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 936a7492e..88e060238 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -8,6 +8,7 @@ import typer from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides from .. import util from ..lang.en import English +from ..util import dot_to_object @debug_cli.command("model") @@ -60,16 +61,7 @@ def debug_model_cli( msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) - component = config - parts = section.split(".") - for item in parts: - try: - component = component[item] - except KeyError: - msg.fail( - f"The section '{section}' is not a valid section in the provided config.", - exits=1, - ) + component = dot_to_object(config, section) if hasattr(component, "model"): model = component.model else: diff --git a/spacy/errors.py b/spacy/errors.py index df6f82757..862cc71d8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -592,7 +592,7 @@ class Errors: "for the `nlp` pipeline with components {names}.") E993 = ("The config for 'nlp' needs to include a key 'lang' specifying " "the code of the language to initialize it with (for example " - "'en' for English).\n\n{config}") + "'en' for English) - this can't be 'None'.\n\n{config}") E996 = ("Could not parse {file}: {msg}") E997 = ("Tokenizer special cases are not allowed to modify the text. " "This would map '{chunk}' to '{orth}' given token attributes " diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 65c33c54a..2577ec675 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -2,7 +2,13 @@ import pytest from .util import get_random_doc -from spacy.util import minibatch_by_words +from spacy import util +from spacy.util import minibatch_by_words, dot_to_object +from thinc.api import Config, Optimizer + +from ..lang.en import English +from ..lang.nl import Dutch +from ..language import DEFAULT_CONFIG_PATH @pytest.mark.parametrize( @@ -56,3 +62,50 @@ def test_util_minibatch_oversize(doc_sizes, expected_batches): minibatch_by_words(docs, size=batch_size, tolerance=tol, discard_oversize=False) ) assert [len(batch) for batch in batches] == expected_batches + + +def test_util_dot_section(): + cfg_string = """ + [nlp] + lang = "en" + pipeline = ["textcat"] + load_vocab_data = false + + [components] + + [components.textcat] + factory = "textcat" + + [components.textcat.model] + @architectures = "spacy.TextCatBOW.v1" + exclusive_classes = true + ngram_size = 1 + no_output_layer = false + """ + nlp_config = Config().from_str(cfg_string) + en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True) + print(en_config) + + default_config = Config().from_disk(DEFAULT_CONFIG_PATH) + default_config["nlp"]["lang"] = "nl" + nl_nlp, nl_config = util.load_model_from_config(default_config, auto_fill=True) + + # Test that creation went OK + assert isinstance(en_nlp, English) + assert isinstance(nl_nlp, Dutch) + assert nl_nlp.pipe_names == [] + assert en_nlp.pipe_names == ["textcat"] + assert en_nlp.get_pipe("textcat").model.attrs["multi_label"] == False # not exclusive_classes + + # Test that default values got overwritten + assert not en_config["nlp"]["load_vocab_data"] + assert nl_config["nlp"]["load_vocab_data"] # default value True + + # Test proper functioning of 'dot_to_object' + with pytest.raises(KeyError): + obj = dot_to_object(en_config, "nlp.pipeline.tagger") + with pytest.raises(KeyError): + obj = dot_to_object(en_config, "nlp.unknownattribute") + assert not dot_to_object(en_config, "nlp.load_vocab_data") + assert dot_to_object(nl_config, "nlp.load_vocab_data") + assert isinstance(dot_to_object(nl_config, "training.optimizer"), Optimizer) diff --git a/spacy/util.py b/spacy/util.py index c98ce2354..5d8b70961 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -258,7 +258,7 @@ def load_model_from_config( if "nlp" not in config: raise ValueError(Errors.E985.format(config=config)) nlp_config = config["nlp"] - if "lang" not in nlp_config: + if "lang" not in nlp_config or nlp_config["lang"] is None: raise ValueError(Errors.E993.format(config=nlp_config)) # This will automatically handle all codes registered via the languages # registry, including custom subclasses provided via entry points @@ -1107,6 +1107,26 @@ def dict_to_dot(obj: Dict[str, dict]) -> Dict[str, Any]: return {".".join(key): value for key, value in walk_dict(obj)} +def dot_to_object(config: Config, section: str): + """Convert dot notation of a "section" to a specific part of the Config. + e.g. "training.optimizer" would return the Optimizer object. + Throws an error if the section is not defined in this config. + + config (Config): The config. + section (str): The dot notation of the section in the config. + RETURNS: The object denoted by the section + """ + component = config + parts = section.split(".") + for item in parts: + try: + component = component[item] + except (KeyError, TypeError) as e: + msg = f"The section '{section}' is not a valid section in the provided config." + raise KeyError(msg) + return component + + def walk_dict( node: Dict[str, Any], parent: List[str] = [] ) -> Iterator[Tuple[List[str], Any]]: From 8353ca5a516ba7e1bfb4d8d0207f2854de8d73f4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 Jul 2020 17:53:36 +0200 Subject: [PATCH 3/6] remove printing of config --- examples/training/train_textcat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/training/train_textcat.py b/examples/training/train_textcat.py index 92cdba054..901b382bf 100644 --- a/examples/training/train_textcat.py +++ b/examples/training/train_textcat.py @@ -44,7 +44,6 @@ def main(config_path, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=Non print(f"Loading nlp model from {config_path}") nlp_config = Config().from_disk(config_path) - print(f"config: {nlp_config}") nlp, _ = util.load_model_from_config(nlp_config, auto_fill=True) # ensure the nlp object was defined with a textcat component From 85b2dcfd677c4888958cc7be840d748263528ec4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 Jul 2020 17:54:44 +0200 Subject: [PATCH 4/6] cleanup --- spacy/tests/test_util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 2577ec675..3a6c0fd95 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -84,7 +84,6 @@ def test_util_dot_section(): """ nlp_config = Config().from_str(cfg_string) en_nlp, en_config = util.load_model_from_config(nlp_config, auto_fill=True) - print(en_config) default_config = Config().from_disk(DEFAULT_CONFIG_PATH) default_config["nlp"]["lang"] = "nl" From 274824921725b24bd271fc689eff9b8c20c8d01d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 Jul 2020 16:14:23 +0200 Subject: [PATCH 5/6] Re-add meta["pipeline"] for now --- spacy/language.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index cade90b24..79fceec95 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -210,6 +210,9 @@ class Language: "name": self.vocab.vectors.name, } self._meta["labels"] = self.pipe_labels + # TODO: Adding this back to prevent breaking people's code etc., but + # we should consider removing it + self._meta["pipeline"] = self.pipe_names return self._meta @meta.setter From ba22111ff407c3e6ea462626323c9048a3d9a4e4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 28 Jul 2020 16:24:14 +0200 Subject: [PATCH 6/6] Move error to Errors --- spacy/errors.py | 1 + spacy/util.py | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 862cc71d8..a10e5d9bd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -483,6 +483,7 @@ class Errors: E199 = ("Unable to merge 0-length span at doc[{start}:{end}].") # TODO: fix numbering after merging develop into master + E952 = ("The section '{name}' is not a valid section in the provided config.") E953 = ("Mismatched IDs received by the Tok2Vec listener: {id1} vs. {id2}") E954 = ("The Tok2Vec listener did not receive a valid input.") E955 = ("Can't find table '{table}' for language '{lang}' in spacy-lookups-data.") diff --git a/spacy/util.py b/spacy/util.py index eba60795a..d1951145f 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1121,9 +1121,8 @@ def dot_to_object(config: Config, section: str): for item in parts: try: component = component[item] - except (KeyError, TypeError) as e: - msg = f"The section '{section}' is not a valid section in the provided config." - raise KeyError(msg) + except (KeyError, TypeError): + raise KeyError(Errors.E952.format(name=section)) return component