Raise an error if the examples are empty (#6052)

* raise an error if no valid Example objects are found during initialization

* fix max_length parameter

* remove commit from other branch

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
This commit is contained in:
Sofie Van Landeghem 2020-09-12 21:01:53 +02:00 committed by GitHub
parent 24e138b8ac
commit e92e850c72
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 11 additions and 5 deletions

View File

@ -1166,14 +1166,20 @@ class Language:
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="Language", obj=type(get_examples))
raise ValueError(err)
valid_examples = False
for example in get_examples():
if not isinstance(example, Example):
err = Errors.E978.format(
name="Language.begin_training", types=type(example)
)
raise ValueError(err)
else:
valid_examples = True
for word in [t.text for t in example.reference]:
_ = self.vocab[word] # noqa: F841
if not valid_examples:
err = Errors.E930.format(name="Language", obj="empty list")
raise ValueError(err)
if device >= 0: # TODO: do we need this here?
require_gpu(device)
if self.vocab.vectors.data.shape[1] >= 1:

View File

@ -47,7 +47,7 @@ class Corpus:
*,
limit: int = 0,
gold_preproc: bool = False,
max_length: bool = False,
max_length: int = 0,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
@ -89,7 +89,7 @@ class Corpus:
if self.gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs, self.max_length)
examples = self.make_examples(nlp, ref_docs)
yield from examples
def _make_example(
@ -108,18 +108,18 @@ class Corpus:
return Example(nlp.make_doc(reference.text), reference)
def make_examples(
self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0
self, nlp: "Language", reference_docs: Iterable[Doc]
) -> Iterator[Example]:
for reference in reference_docs:
if len(reference) == 0:
continue
elif max_length == 0 or len(reference) < max_length:
elif self.max_length == 0 or len(reference) < self.max_length:
yield self._make_example(nlp, reference, False)
elif reference.is_sentenced:
for ref_sent in reference.sents:
if len(ref_sent) == 0:
continue
elif max_length == 0 or len(ref_sent) < max_length:
elif self.max_length == 0 or len(ref_sent) < self.max_length:
yield self._make_example(nlp, ref_sent.as_doc(), False)
def make_examples_gold_preproc(