Make corpus paths default to None and improve errors

2025-08-08 14:14:57 +03:00 · 2020-09-29 22:33:46 +02:00 · 2020-09-29 22:33:46 +02:00 · 1aeef3bfbb
commit 1aeef3bfbb
parent 0250bcf6a3
6 changed files with 16 additions and 12 deletions
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
 {%- set use_transformer = (transformer_data and hardware != "cpu") -%}
 {%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [system]
 {% if use_transformer -%}
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -66,7 +66,7 @@ def init_pipeline(
            nlp.to_disk(init_path)
            msg.good(f"Saved initialized pipeline to {init_path}")
        else:
-            nlp = util.load_model(init_path)
+            nlp = util.load_model(init_path).from_config(config)
            if must_reinitialize(config, nlp.config):
                msg.warn("Config has changed: need to re-initialize pipeline")
                nlp = init_nlp(config, **init_kwargs)
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -1,6 +1,6 @@
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null
 vectors = null
 vocab_data = null
 init_tok2vec = null
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -477,6 +477,8 @@ class Errors:
    E201 = ("Span index out of range.")

    # TODO: fix numbering after merging develop into master
+    E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
+            "config.cfg or override it on the CLI?")
    E914 = ("Executing {name} callback failed. Expected the function to "
            "return the nlp object but got: {value}. Maybe you forgot to return "
            "the modified object in your function?")
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -14,8 +14,8 @@ from ..util import make_tempdir

 nlp_config_string = """
 [paths]
-train = ""
-dev = ""
+train = null
+dev = null

 [corpora]

@@ -309,7 +309,7 @@ def test_config_interpolation():
    config = Config().from_str(nlp_config_string, interpolate=False)
    assert config["corpora"]["train"]["path"] == "${paths.train}"
    interpolated = config.interpolate()
-    assert interpolated["corpora"]["train"]["path"] == ""
+    assert interpolated["corpora"]["train"]["path"] is None
    nlp = English.from_config(config)
    assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
    # Ensure that variables are preserved in nlp config
@@ -317,10 +317,10 @@ def test_config_interpolation():
    assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
    assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
    interpolated2 = nlp.config.interpolate()
-    assert interpolated2["corpora"]["train"]["path"] == ""
+    assert interpolated2["corpora"]["train"]["path"] is None
    assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
    nlp2 = English.from_config(interpolated)
-    assert nlp2.config["corpora"]["train"]["path"] == ""
+    assert nlp2.config["corpora"]["train"]["path"] is None
    assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342


--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -7,7 +7,7 @@ import srsly
 from .. import util
 from .augment import dont_augment
 from .example import Example
-from ..errors import Warnings
+from ..errors import Warnings, Errors
 from ..tokens import DocBin, Doc
 from ..vocab import Vocab

@@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"

 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path,
+    path: Optional[Path],
    gold_preproc: bool,
    max_length: int = 0,
    limit: int = 0,
    augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
+    if path is None:
+        raise ValueError(Errors.E913)
    return Corpus(
        path,
        gold_preproc=gold_preproc,