From f5532757a34ea55142fcb0134de51a1b11bf1f40 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Jul 2020 15:02:37 +0200 Subject: [PATCH] Filter out 0-length examples in Corpus --- spacy/gold/corpus.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 42637ce5c..602edc59a 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -48,15 +48,19 @@ class Corpus: if len(reference) >= max_length >= 1: if reference.is_sentenced: for ref_sent in reference.sents: - yield Example( + eg = Example( nlp.make_doc(ref_sent.text), ref_sent.as_doc() ) + if len(eg.x): + yield eg else: - yield Example( + eg = Example( nlp.make_doc(reference.text), reference ) + if len(eg.x): + yield eg def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: @@ -65,7 +69,7 @@ class Corpus: else: ref_sents = [reference] for ref_sent in ref_sents: - yield Example( + eg = Example( Doc( nlp.vocab, words=[w.text for w in ref_sent], @@ -73,6 +77,8 @@ class Corpus: ), ref_sent ) + if len(eg.x): + yield eg def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """