diff --git a/spacy/language.py b/spacy/language.py index 70dad59f3..c72467ae7 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1166,14 +1166,20 @@ class Language: if not hasattr(get_examples, "__call__"): err = Errors.E930.format(name="Language", obj=type(get_examples)) raise ValueError(err) + valid_examples = False for example in get_examples(): if not isinstance(example, Example): err = Errors.E978.format( name="Language.begin_training", types=type(example) ) raise ValueError(err) + else: + valid_examples = True for word in [t.text for t in example.reference]: _ = self.vocab[word] # noqa: F841 + if not valid_examples: + err = Errors.E930.format(name="Language", obj="empty list") + raise ValueError(err) if device >= 0: # TODO: do we need this here? require_gpu(device) if self.vocab.vectors.data.shape[1] >= 1: diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py index 545f01eaa..8f26b0ed2 100644 --- a/spacy/training/corpus.py +++ b/spacy/training/corpus.py @@ -47,7 +47,7 @@ class Corpus: *, limit: int = 0, gold_preproc: bool = False, - max_length: bool = False, + max_length: int = 0, ) -> None: self.path = util.ensure_path(path) self.gold_preproc = gold_preproc @@ -89,7 +89,7 @@ class Corpus: if self.gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: - examples = self.make_examples(nlp, ref_docs, self.max_length) + examples = self.make_examples(nlp, ref_docs) yield from examples def _make_example( @@ -108,18 +108,18 @@ class Corpus: return Example(nlp.make_doc(reference.text), reference) def make_examples( - self, nlp: "Language", reference_docs: Iterable[Doc], max_length: int = 0 + self, nlp: "Language", reference_docs: Iterable[Doc] ) -> Iterator[Example]: for reference in reference_docs: if len(reference) == 0: continue - elif max_length == 0 or len(reference) < max_length: + elif self.max_length == 0 or len(reference) < self.max_length: yield self._make_example(nlp, reference, False) elif reference.is_sentenced: for ref_sent in reference.sents: if len(ref_sent) == 0: continue - elif max_length == 0 or len(ref_sent) < max_length: + elif self.max_length == 0 or len(ref_sent) < self.max_length: yield self._make_example(nlp, ref_sent.as_doc(), False) def make_examples_gold_preproc(