From a68d0e63f08dc2c4e2c4cccb7247ef2293d09f0c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 23 Jun 2020 22:57:40 +0200 Subject: [PATCH] Support max_length in Corpus --- spacy/cli/train.py | 3 ++- spacy/gold/corpus.py | 45 ++++++++++++++++++++++++++++---------------- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d7c9035b5..d199236b9 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -301,7 +301,8 @@ def create_train_batches(nlp, corpus, cfg): train_examples = list(corpus.train_dataset( nlp, shuffle=True, - gold_preproc=cfg["gold_preproc"] + gold_preproc=cfg["gold_preproc"], + max_length=cfg["max_length"] )) if len(train_examples) == 0: raise ValueError(Errors.E988) diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9e6fed52f..0d36325d2 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -43,24 +43,36 @@ class Corpus: locs.append(path) return locs - def make_examples(self, nlp, reference_docs): + def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: - predicted = nlp.make_doc(reference.text) - yield Example(predicted, reference) + if max_length >= 1 and len(reference) >= max_length: + if reference.is_sentenced: + for ref_sent in reference.sents: + yield Example( + nlp.make_doc(ref_sent.text), + ref_sent.as_doc() + ) + else: + yield Example( + nlp.make_doc(reference.text), + reference + ) def make_examples_gold_preproc(self, nlp, reference_docs): - for whole_reference in reference_docs: - if whole_reference.is_sentenced: - references = [sent.as_doc() for sent in whole_reference.sents] + for reference in reference_docs: + if reference.is_sentenced: + ref_sents = [sent.as_doc() for sent in reference.sents] else: - references = [whole_reference] - for reference in references: - predicted = Doc( - nlp.vocab, - words=[t.text for t in reference], - spaces=[bool(t.whitespace_) for t in reference] + ref_sents = [reference] + for ref_sent in ref_sents: + yield Example( + Doc( + nlp.vocab, + words=[w.text for w in ref_sent], + spaces=[bool(w.whitespace_) for w in ref_sent] + ), + ref_sent ) - yield Example(predicted, reference) def read_docbin(self, vocab, locs): """ Yield training examples as example dicts """ @@ -86,12 +98,13 @@ class Corpus: i += 1 return n - def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, **kwargs): + def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, + max_length=0, **kwargs): ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: - examples = self.make_examples(nlp, ref_docs) + examples = self.make_examples(nlp, ref_docs, max_length) if shuffle: examples = list(examples) random.shuffle(examples) @@ -102,5 +115,5 @@ class Corpus: if gold_preproc: examples = self.make_examples_gold_preproc(nlp, ref_docs) else: - examples = self.make_examples(nlp, ref_docs) + examples = self.make_examples(nlp, ref_docs, max_length=0) yield from examples