diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index adad72995..69dac0aa1 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements. {%- set use_transformer = (transformer_data and hardware != "cpu") -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} [paths] -train = "" -dev = "" +train = null +dev = null [system] {% if use_transformer -%} diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index a55fad097..ea4f26255 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -1,6 +1,6 @@ [paths] -train = "" -dev = "" +train = null +dev = null vectors = null vocab_data = null init_tok2vec = null diff --git a/spacy/errors.py b/spacy/errors.py index 09b722a7b..233ff29bd 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -477,6 +477,8 @@ class Errors: E201 = ("Span index out of range.") # TODO: fix numbering after merging develop into master + E913 = ("Corpus path can't be None. Maybe you forgot to define it in your " + "config.cfg or override it on the CLI?") E914 = ("Executing {name} callback failed. Expected the function to " "return the nlp object but got: {value}. Maybe you forgot to return " "the modified object in your function?") diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index 663e76550..da048f3d6 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -14,8 +14,8 @@ from ..util import make_tempdir nlp_config_string = """ [paths] -train = "" -dev = "" +train = null +dev = null [corpora] @@ -309,7 +309,7 @@ def test_config_interpolation(): config = Config().from_str(nlp_config_string, interpolate=False) assert config["corpora"]["train"]["path"] == "${paths.train}" interpolated = config.interpolate() - assert interpolated["corpora"]["train"]["path"] == "" + assert interpolated["corpora"]["train"]["path"] is None nlp = English.from_config(config) assert nlp.config["corpora"]["train"]["path"] == "${paths.train}" # Ensure that variables are preserved in nlp config @@ -317,10 +317,10 @@ def test_config_interpolation(): assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width interpolated2 = nlp.config.interpolate() - assert interpolated2["corpora"]["train"]["path"] == "" + assert interpolated2["corpora"]["train"]["path"] is None assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 nlp2 = English.from_config(interpolated) - assert nlp2.config["corpora"]["train"]["path"] == "" + assert nlp2.config["corpora"]["train"]["path"] is None assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342 diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 85079f41c..9331ecefb 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -7,7 +7,7 @@ import srsly from .. import util from .augment import dont_augment from .example import Example -from ..errors import Warnings +from ..errors import Warnings, Errors from ..tokens import DocBin, Doc from ..vocab import Vocab @@ -20,12 +20,14 @@ FILE_TYPE = ".spacy" @util.registry.readers("spacy.Corpus.v1") def create_docbin_reader( - path: Path, + path: Optional[Path], gold_preproc: bool, max_length: int = 0, limit: int = 0, augmenter: Optional[Callable] = None, ) -> Callable[["Language"], Iterable[Example]]: + if path is None: + raise ValueError(Errors.E913) util.logger.debug(f"Loading corpus from path: {path}") return Corpus( path,