Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit 4f42bcdd13

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev13"
+__version__ = "3.0.0.dev14"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -332,13 +332,14 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg):
         )

         n_words = sum(len(ex.predicted) for ex in dev_examples)
+        batch_size = cfg.get("evaluation_batch_size", 128)
         start_time = timer()

         if optimizer.averages:
             with nlp.use_params(optimizer.averages):
-                scorer = nlp.evaluate(dev_examples, batch_size=32)
+                scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
         else:
-            scorer = nlp.evaluate(dev_examples, batch_size=32)
+            scorer = nlp.evaluate(dev_examples, batch_size=batch_size)
         end_time = timer()
         wps = n_words / (end_time - start_time)
         scores = scorer.scores

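The evaluation batch size now comes from the training config with a fallback of 128 instead of a hard-coded 32. A minimal sketch of that fallback behaviour, using a plain dict as a stand-in for the real config object:

# Plain dict standing in for the training config; only the key name
# "evaluation_batch_size" and the 128 default are taken from the diff.
cfg = {}
assert cfg.get("evaluation_batch_size", 128) == 128   # no setting: use the default

cfg["evaluation_batch_size"] = 256                     # explicit user override
assert cfg.get("evaluation_batch_size", 128) == 256    # the configured value wins
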
@@ -45,18 +45,22 @@ class Corpus:

     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
-            if len(reference) >= max_length >= 1:
-                if reference.is_sentenced:
-                    for ref_sent in reference.sents:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
-            else:
+            if len(reference) == 0:
+                continue
+            elif max_length == 0 or len(reference) < max_length:
                 yield Example(
                     nlp.make_doc(reference.text),
                     reference
                 )
+            elif reference.is_sentenced:
+                for ref_sent in reference.sents:
+                    if len(ref_sent) == 0:
+                        continue
+                    elif max_length == 0 or len(ref_sent) < max_length:
+                        yield Example(
+                            nlp.make_doc(ref_sent.text),
+                            ref_sent.as_doc()
+                        )

     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:

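The rewritten make_examples now drops empty reference docs, keeps any doc that fits under max_length (or everything when max_length is 0), and only splits an over-long doc into sentences when it has sentence boundaries. A rough stand-in for that control flow, using nested token lists instead of Doc and Example objects (purely illustrative, not spaCy API):

# Stand-in for the new make_examples() logic above: each "doc" is a list of
# sentences, each sentence a list of token strings. The real method also
# requires reference.is_sentenced before falling back to sentences, so an
# over-long doc without sentence boundaries is silently dropped.
def make_examples(reference_docs, max_length=0):
    for sents in reference_docs:
        reference = [tok for sent in sents for tok in sent]   # flattened doc
        if len(reference) == 0:
            continue                                          # skip empty docs
        elif max_length == 0 or len(reference) < max_length:
            yield reference                                   # keep the whole doc
        else:
            for sent in sents:                                # too long: per sentence
                if 0 < len(sent) < max_length:                # skip empty/over-long sents
                    yield sent

docs = [
    [["A", "short", "doc", "."]],                                  # kept whole
    [[]],                                                          # empty: skipped
    [["One", "long", "sentence", "here", "."], ["Another", "."]],  # split by sentence
]
print(list(make_examples(docs, max_length=6)))
# [['A', 'short', 'doc', '.'], ['One', 'long', 'sentence', 'here', '.'], ['Another', '.']]
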
@@ -65,7 +69,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                yield Example(
+                eg = Example(
                     Doc(
                         nlp.vocab,
                         words=[w.text for w in ref_sent],

@@ -73,6 +77,8 @@ class Corpus:
                     ),
                     ref_sent
                 )
+                if len(eg.x):
+                    yield eg

     def read_docbin(self, vocab, locs):
         """ Yield training examples as example dicts """

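In the gold-preproc path the Example is now bound to a name first and only yielded when its predicted side is non-empty, which is what the len(eg.x) check does. A small usage sketch, assuming the released v3 API (spacy.training.Example; in these dev builds the class still lived in spacy.gold):

# Example.x is the predicted Doc, so `if len(eg.x)` above filters out
# examples whose sentence produced zero tokens.
import spacy
from spacy.training import Example  # spacy.gold.Example in the dev builds above

nlp = spacy.blank("en")
predicted = nlp.make_doc("A tiny example .")
reference = nlp.make_doc("A tiny example .")
eg = Example(predicted, reference)
print(len(eg.x))  # 4 -> kept; an empty doc would give 0 and be skipped
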
@@ -449,7 +449,7 @@ cdef class Parser:
             if component is self:
                 break
             if hasattr(component, "pipe"):
-                doc_sample = list(component.pipe(doc_sample))
+                doc_sample = list(component.pipe(doc_sample, batch_size=8))
             else:
                 doc_sample = [component(doc) for doc in doc_sample]
         if doc_sample:

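The Parser change caps the batch size at 8 when piping its sample docs through the components that run before it, so the sample pass stays small. A generic sketch of the same pattern, using a blank pipeline with a sentencizer as a stand-in for the preceding components:

# Run the earlier pipeline components over a small doc sample, preferring
# pipe() with a modest batch size when a component provides it.
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # stand-in for components that run before the parser

doc_sample = [nlp.make_doc(t) for t in ["One doc .", "Another doc .", "A third ."]]
for name, component in nlp.pipeline:
    if hasattr(component, "pipe"):
        doc_sample = list(component.pipe(doc_sample, batch_size=8))
    else:
        doc_sample = [component(doc) for doc in doc_sample]
print(len(doc_sample))  # still 3 docs, now processed by the earlier components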