diff --git a/spacy/about.py b/spacy/about.py index a441508f6..6bbb615b1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev13" +__version__ = "3.0.0.dev14" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6fbb44d65..0853bfb56 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -332,13 +332,14 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): ) n_words = sum(len(ex.predicted) for ex in dev_examples) + batch_size = cfg.get("evaluation_batch_size", 128) start_time = timer() if optimizer.averages: with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(dev_examples, batch_size=32) + scorer = nlp.evaluate(dev_examples, batch_size=batch_size) else: - scorer = nlp.evaluate(dev_examples, batch_size=32) + scorer = nlp.evaluate(dev_examples, batch_size=batch_size) end_time = timer() wps = n_words / (end_time - start_time) scores = scorer.scores diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 42637ce5c..9a688987c 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -45,18 +45,22 @@ class Corpus: def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: - if len(reference) >= max_length >= 1: - if reference.is_sentenced: - for ref_sent in reference.sents: - yield Example( - nlp.make_doc(ref_sent.text), - ref_sent.as_doc() - ) - else: + if len(reference) == 0: + continue + elif max_length == 0 or len(reference) < max_length: yield Example( nlp.make_doc(reference.text), reference ) + elif reference.is_sentenced: + for ref_sent in reference.sents: + if len(ref_sent) == 0: + continue + elif max_length == 0 or len(ref_sent) < max_length: + yield Example( + nlp.make_doc(ref_sent.text), + ref_sent.as_doc() + ) def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: @@ -65,7 +69,7 @@ class Corpus: else: ref_sents = [reference] for ref_sent in ref_sents: - yield Example( + eg = Example( Doc( nlp.vocab, words=[w.text for w in ref_sent], @@ -73,6 +77,8 @@ class Corpus: ), ref_sent ) + if len(eg.x): + yield eg def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ceaea3c9c..743b4ca1d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -449,7 +449,7 @@ cdef class Parser: if component is self: break if hasattr(component, "pipe"): - doc_sample = list(component.pipe(doc_sample)) + doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] if doc_sample: