Update Corpus

Matthew Honnibal 2020-06-20 20:12:31 +02:00
parent 11fa0658f7
commit 0a8b6631a2

@@ -1,5 +1,6 @@
 import srsly
 from pathlib import Path
+import random
 from .. import util
 from .example import Example
 from ..tokens import DocBin
@@ -11,14 +12,13 @@ class Corpus:
     DOCS: https://spacy.io/api/goldcorpus
     """

-    def __init__(self, vocab, train_loc, dev_loc, limit=0):
+    def __init__(self, train_loc, dev_loc, limit=0):
         """Create a GoldCorpus.

         train (str / Path): File or directory of training data.
         dev (str / Path): File or directory of development data.
         RETURNS (GoldCorpus): The newly created object.
         """
-        self.vocab = vocab
         self.train_loc = train_loc
         self.dev_loc = dev_loc
@@ -42,7 +42,12 @@ class Corpus:
             locs.append(path)
         return locs

+    def make_examples(self, nlp, reference_docs, **kwargs):
+        for reference in reference_docs:
+            predicted = nlp.make_doc(reference.text)
+            yield Example(predicted, reference)
+
-    def read_docbin(self, locs, limit=0):
+    def read_docbin(self, vocab, locs, limit=0):
         """ Yield training examples as example dicts """
         i = 0
         for loc in locs:
@@ -50,31 +55,26 @@ class Corpus:
             if loc.parts[-1].endswith(".spacy"):
                 with loc.open("rb") as file_:
                     doc_bin = DocBin().from_bytes(file_.read())
-                    docs = list(doc_bin.get_docs(self.vocab))
-                    assert len(docs) % 2 == 0
-                    # Pair up the docs into the (predicted, reference) pairs.
-                    for i in range(0, len(docs), 2):
-                        predicted = docs[i]
-                        reference = docs[i+1]
-                        yield Example(predicted, reference)
+                    yield from doc_bin.get_docs(vocab)

-    def count_train(self):
+    def count_train(self, nlp):
         """Returns count of words in train examples"""
         n = 0
         i = 0
-        for example in self.train_dataset():
+        for example in self.train_dataset(nlp):
             n += len(example.predicted)
             if self.limit and i >= self.limit:
                 break
             i += 1
         return n

-    def train_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.train_loc))
+    def train_dataset(self, nlp, **kwargs):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = list(self.make_examples(nlp, ref_docs, **kwargs))
         random.shuffle(examples)
         yield from examples

-    def dev_dataset(self):
-        examples = self.read_docbin(self.walk_corpus(self.dev_loc))
-        random.shuffle(examples)
+    def dev_dataset(self, nlp):
+        ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
+        examples = self.make_examples(nlp, ref_docs, **kwargs)
         yield from examples
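
Below is a rough usage sketch of the updated API, not part of the commit itself. The import path spacy.gold.corpus, the file name train.spacy, and the blank English pipeline are assumptions for illustration; the DocBin serialization mirrors what read_docbin() expects (DocBin().from_bytes on the raw file bytes), and the example.reference attribute is assumed alongside the example.predicted attribute the diff already uses.

import spacy
from spacy.tokens import DocBin
from spacy.gold.corpus import Corpus  # assumed module path for the file changed in this diff

# Build a small .spacy file of reference docs, matching what read_docbin() loads.
nlp = spacy.blank("en")
doc_bin = DocBin()
doc_bin.add(nlp("This is a reference sentence."))
with open("train.spacy", "wb") as file_:
    file_.write(doc_bin.to_bytes())

# The corpus no longer stores a vocab; the nlp object is passed to the dataset
# methods, and make_examples() builds the predicted side with nlp.make_doc().
corpus = Corpus("train.spacy", "train.spacy")  # reusing one file for train and dev in this sketch
for example in corpus.train_dataset(nlp):
    print(example.predicted.text, "<-", example.reference.text)
print(corpus.count_train(nlp), "training words")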