Make corpus paths default to None and improve errors

This commit is contained in:
Ines Montani 2020-09-29 22:33:46 +02:00
parent 0250bcf6a3
commit 1aeef3bfbb
6 changed files with 16 additions and 12 deletions

View File

@@ -4,8 +4,8 @@ can help generate the best possible configuration, given a user's requirements.
{%- set use_transformer = (transformer_data and hardware != "cpu") -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
[paths]
train = ""
dev = ""
train = null
dev = null
[system]
{% if use_transformer -%}

View File

@@ -66,7 +66,7 @@ def init_pipeline(
nlp.to_disk(init_path)
msg.good(f"Saved initialized pipeline to {init_path}")
else:
nlp = util.load_model(init_path)
nlp = util.load_model(init_path).from_config(config)
if must_reinitialize(config, nlp.config):
msg.warn("Config has changed: need to re-initialize pipeline")
nlp = init_nlp(config, **init_kwargs)

View File

@@ -1,6 +1,6 @@
[paths]
train = ""
dev = ""
train = null
dev = null
vectors = null
vocab_data = null
init_tok2vec = null

View File

@@ -477,6 +477,8 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E913 = ("Corpus path can't be None. Maybe you forgot to define it in your "
"config.cfg or override it on the CLI?")
E914 = ("Executing {name} callback failed. Expected the function to "
"return the nlp object but got: {value}. Maybe you forgot to return "
"the modified object in your function?")

View File

@@ -14,8 +14,8 @@ from ..util import make_tempdir
nlp_config_string = """
[paths]
train = ""
dev = ""
train = null
dev = null
[corpora]
@@ -309,7 +309,7 @@ def test_config_interpolation():
config = Config().from_str(nlp_config_string, interpolate=False)
assert config["corpora"]["train"]["path"] == "${paths.train}"
interpolated = config.interpolate()
assert interpolated["corpora"]["train"]["path"] == ""
assert interpolated["corpora"]["train"]["path"] is None
nlp = English.from_config(config)
assert nlp.config["corpora"]["train"]["path"] == "${paths.train}"
# Ensure that variables are preserved in nlp config
@@ -317,10 +317,10 @@ def test_config_interpolation():
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
interpolated2 = nlp.config.interpolate()
assert interpolated2["corpora"]["train"]["path"] == ""
assert interpolated2["corpora"]["train"]["path"] is None
assert interpolated2["components"]["tagger"]["model"]["tok2vec"]["width"] == 342
nlp2 = English.from_config(interpolated)
assert nlp2.config["corpora"]["train"]["path"] == ""
assert nlp2.config["corpora"]["train"]["path"] is None
assert nlp2.config["components"]["tagger"]["model"]["tok2vec"]["width"] == 342

View File

@@ -7,7 +7,7 @@ import srsly
from .. import util
from .augment import dont_augment
from .example import Example
from ..errors import Warnings
from ..errors import Warnings, Errors
from ..tokens import DocBin, Doc
from ..vocab import Vocab
@@ -20,12 +20,14 @@ FILE_TYPE = ".spacy"
@util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader(
path: Path,
path: Optional[Path],
gold_preproc: bool,
max_length: int = 0,
limit: int = 0,
augmenter: Optional[Callable] = None,
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
return Corpus(
path,
gold_preproc=gold_preproc,