Support gold_preproc in Corpus

This commit is contained in:
Matthew Honnibal 2020-06-22 17:47:12 +02:00
parent afe6ee4548
commit 2d34d2f24a

View File

@ -1,7 +1,7 @@
import random import random
from .. import util from .. import util
from .example import Example from .example import Example
from ..tokens import DocBin from ..tokens import DocBin, Doc
class Corpus: class Corpus:
@ -48,6 +48,17 @@ class Corpus:
predicted = nlp.make_doc(reference.text) predicted = nlp.make_doc(reference.text)
yield Example(predicted, reference) yield Example(predicted, reference)
def make_examples_gold_preproc(self, nlp, reference_docs):
    """Yield one Example per sentence, reusing the reference tokenization.

    Every document in ``reference_docs`` is walked sentence by sentence
    (via its ``sents``), and each sentence is turned into a standalone
    reference doc with ``as_doc()``. The predicted doc is built on
    ``nlp.vocab`` from the reference tokens' texts and trailing-whitespace
    flags instead of being produced by ``nlp.make_doc``, so both docs of
    the yielded Example share the same token boundaries.
    """
    for gold_doc in reference_docs:
        for sentence in gold_doc.sents:
            reference = sentence.as_doc()
            vocab = nlp.vocab
            words = []
            spaces = []
            for token in reference:
                words.append(token.text)
                # Non-empty trailing whitespace means "followed by a space".
                spaces.append(bool(token.whitespace_))
            predicted = Doc(vocab, words=words, spaces=spaces)
            yield Example(predicted, reference)
def read_docbin(self, vocab, locs): def read_docbin(self, vocab, locs):
""" Yield training examples as example dicts """ """ Yield training examples as example dicts """
i = 0 i = 0
@ -72,15 +83,21 @@ class Corpus:
i += 1 i += 1
return n return n
def train_dataset(self, nlp, shuffle=True, **kwargs): def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, **kwargs):
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc)) ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
examples = self.make_examples(nlp, ref_docs) if gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs)
if shuffle: if shuffle:
examples = list(examples) examples = list(examples)
random.shuffle(examples) random.shuffle(examples)
yield from examples yield from examples
def dev_dataset(self, nlp, **kwargs): def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc)) ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
examples = self.make_examples(nlp, ref_docs) if gold_preproc:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs)
yield from examples yield from examples