From 1aeef3bfbbf71210d467b616787e37eef5f6e258 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 22:33:46 +0200 Subject: [PATCH 1/2] Make corpus paths default to None and improve errors --- spacy/cli/templates/quickstart_training.jinja | 4 ++-- spacy/cli/train.py | 2 +- spacy/default_config.cfg | 4 ++-- spacy/errors.py | 2 ++ spacy/tests/serialize/test_serialize_config.py | 10 +++++----- spacy/training/corpus.py | 6 ++++-- 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index efe19d315..0e498ee20 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements. {%- set use_transformer = (transformer_data and hardware != "cpu") -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} [paths] -train = "" -dev = "" +train = null +dev = null [system] {% if use_transformer -%} diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 0b3e2580e..e8a422926 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -66,7 +66,7 @@ def init_pipeline( nlp.to_disk(init_path) msg.good(f"Saved initialized pipeline to {init_path}") else: - nlp = util.load_model(init_path) + nlp = util.load_model(init_path).from_config(config) if must_reinitialize(config, nlp.config): msg.warn("Config has changed: need to re-initialize pipeline") nlp = init_nlp(config, **init_kwargs) diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 222ef7d38..272dc7848 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,6 +1,6 @@ [paths] -train = "" -dev = "" +train = null +dev = null vectors = null vocab_data = null init_tok2vec = null diff --git a/spacy/errors.py b/spacy/errors.py index 09b722a7b..233ff29bd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -477,6 +477,8 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " + "config.cfg or override it on the CLI?") E914 = ("Executing {name} callback failed. Expected the function to " "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 663e76550..da048f3d6 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -14,8 +14,8 @@ from ..util import make_tempdir nlp_config_string = """ [paths] -train = "" -dev = "" +train = null +dev = null [corpora] @@ -309,7 +309,7 @@ def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["corpora"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] is None nlp = English.from_config(config) assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config @@ -317,10 +317,10 @@ def test_config_interpolation(): assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["corpora"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] is None assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["corpora"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] is None assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 90eb62474..e85b50cd2 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -7,7 +7,7 @@ import srsly from .. import util from .augment import dont_augment from .example import Example -from ..errors import Warnings +from ..errors import Warnings, Errors from ..tokens import DocBin, Doc from ..vocab import Vocab @@ -20,12 +20,14 @@ FILE_TYPE = ".spacy" @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( - path: Path, + path: Optional[Path], gold_preproc: bool, max_length: int = 0, limit: int = 0, augmenter: Optional[Callable] = None, ) -> Callable[["Language"], Iterable[Example]]: + if path is None: + raise ValueError(Errors.E913) return Corpus( path, gold_preproc=gold_preproc, From c334a7d45f5a895950a139f40ac7fb6ff24af5a0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Sep 2020 22:38:39 +0200 Subject: [PATCH 2/2] Remove --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index e8a422926..0b3e2580e 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -66,7 +66,7 @@ def init_pipeline( nlp.to_disk(init_path) msg.good(f"Saved initialized pipeline to {init_path}") else: - nlp = util.load_model(init_path).from_config(config) + nlp = util.load_model(init_path) if must_reinitialize(config, nlp.config): msg.warn("Config has changed: need to re-initialize pipeline") nlp = init_nlp(config, **init_kwargs)