From f5532757a34ea55142fcb0134de51a1b11bf1f40 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 1 Jul 2020 15:02:37 +0200 Subject: [PATCH 1/6] Filter out 0-length examples in Corpus --- spacy/gold/corpus.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 42637ce5c..602edc59a 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -48,15 +48,19 @@ class Corpus: if len(reference) >= max_length >= 1: if reference.is_sentenced: for ref_sent in reference.sents: - yield Example( + eg = Example( nlp.make_doc(ref_sent.text), ref_sent.as_doc() ) + if len(eg.x): + yield eg else: - yield Example( + eg = Example( nlp.make_doc(reference.text), reference ) + if len(eg.x): + yield eg def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: @@ -65,7 +69,7 @@ class Corpus: else: ref_sents = [reference] for ref_sent in ref_sents: - yield Example( + eg = Example( Doc( nlp.vocab, words=[w.text for w in ref_sent], @@ -73,6 +77,8 @@ class Corpus: ), ref_sent ) + if len(eg.x): + yield eg def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ From c5d12d1a22a10a2b392433cd9a5af1797dc79633 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 1 Jul 2020 15:04:36 +0200 Subject: [PATCH 2/6] Allow batch size to be set for evaluation in spacy train --- spacy/cli/train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 6fbb44d65..85a29f807 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -332,13 +332,14 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): ) n_words = sum(len(ex.predicted) for ex in dev_examples) + batch_size = cfg.get("evaluation_batch_size", 128) start_time = timer() if optimizer.averages: with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(dev_examples, batch_size=32) + scorer = nlp.evaluate(dev_examples, batch_size=eval_batch_size) else: - scorer = nlp.evaluate(dev_examples, batch_size=32) + scorer = nlp.evaluate(dev_examples, batch_size=eval_batch_size) end_time = timer() wps = n_words / (end_time - start_time) scores = scorer.scores From 2fa56484b251baf1eb0245b033db0104aad29b23 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 1 Jul 2020 15:16:25 +0200 Subject: [PATCH 3/6] Fix eval batch size --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 85a29f807..0853bfb56 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -337,9 +337,9 @@ def create_evaluation_callback(nlp, optimizer, corpus, cfg): if optimizer.averages: with nlp.use_params(optimizer.averages): - scorer = nlp.evaluate(dev_examples, batch_size=eval_batch_size) + scorer = nlp.evaluate(dev_examples, batch_size=batch_size) else: - scorer = nlp.evaluate(dev_examples, batch_size=eval_batch_size) + scorer = nlp.evaluate(dev_examples, batch_size=batch_size) end_time = timer() wps = n_words / (end_time - start_time) scores = scorer.scores From 1f7709e9a6a6877a72f9497788c96674d679c39e Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 1 Jul 2020 15:16:43 +0200 Subject: [PATCH 4/6] Improve max length check in corpus --- spacy/gold/corpus.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 602edc59a..9a688987c 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -45,22 +45,22 @@ class Corpus: def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: - if len(reference) >= max_length >= 1: - if reference.is_sentenced: - for ref_sent in reference.sents: - eg = Example( - nlp.make_doc(ref_sent.text), - ref_sent.as_doc() - ) - if len(eg.x): - yield eg - else: - eg = Example( + if len(reference) == 0: + continue + elif max_length == 0 or len(reference) < max_length: + yield Example( nlp.make_doc(reference.text), reference ) - if len(eg.x): - yield eg + elif reference.is_sentenced: + for ref_sent in reference.sents: + if len(ref_sent) == 0: + continue + elif max_length == 0 or len(ref_sent) < max_length: + yield Example( + nlp.make_doc(ref_sent.text), + ref_sent.as_doc() + ) def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: From 7734cbc34dd9510baeb28d256cd921d80ad76e12 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 1 Jul 2020 15:16:59 +0200 Subject: [PATCH 5/6] Set batch size in begin_training --- spacy/syntax/nn_parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index ceaea3c9c..743b4ca1d 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -449,7 +449,7 @@ cdef class Parser: if component is self: break if hasattr(component, "pipe"): - doc_sample = list(component.pipe(doc_sample)) + doc_sample = list(component.pipe(doc_sample, batch_size=8)) else: doc_sample = [component(doc) for doc in doc_sample] if doc_sample: From 0ada186dda89e135cb4460015759c14d8ec2f60b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Jul 2020 15:31:04 +0200 Subject: [PATCH 6/6] Set version to v3.0.0.dev14 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index a441508f6..6bbb615b1 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.0.dev13" +__version__ = "3.0.0.dev14" __release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"