Draft Corpus class for DocBin

Update Corpus

Fix Corpus
This commit is contained in:
Matthew Honnibal 2020-06-20 18:31:07 +02:00
parent 6e7a7ab6da
commit 17226a60ac

View File

@ -1,24 +1,24 @@
import srsly import srsly
from pathlib import Path from pathlib import Path
import random
from .. import util from .. import util
from .example import Example from .example import Example
from ..tokens import DocBin from ..tokens import DocBin
class GoldCorpus(object): class Corpus:
"""An annotated corpus, using the JSON file format. Manages """An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER. annotations for tagging, dependency parsing and NER.
DOCS: https://spacy.io/api/goldcorpus DOCS: https://spacy.io/api/goldcorpus
""" """
def __init__(self, vocab, train_loc, dev_loc, limit=0): def __init__(self, train_loc, dev_loc, limit=0):
"""Create a GoldCorpus. """Create a GoldCorpus.
train (str / Path): File or directory of training data. train (str / Path): File or directory of training data.
dev (str / Path): File or directory of development data. dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object. RETURNS (GoldCorpus): The newly created object.
""" """
self.vocab = vocab
self.train_loc = train_loc self.train_loc = train_loc
self.dev_loc = dev_loc self.dev_loc = dev_loc
@ -38,11 +38,16 @@ class GoldCorpus(object):
continue continue
elif path.is_dir(): elif path.is_dir():
paths.extend(path.iterdir()) paths.extend(path.iterdir())
elif path.parts[-1].endswith(".spacy") elif path.parts[-1].endswith(".spacy"):
locs.append(path) locs.append(path)
return locs return locs
def read_docbin(self, locs, limit=0): def make_examples(self, nlp, reference_docs, **kwargs):
for reference in reference_docs:
predicted = nlp.make_doc(reference.text)
yield Example(predicted, reference)
def read_docbin(self, vocab, locs, limit=0):
""" Yield training examples as example dicts """ """ Yield training examples as example dicts """
i = 0 i = 0
for loc in locs: for loc in locs:
@ -50,31 +55,28 @@ class GoldCorpus(object):
if loc.parts[-1].endswith(".spacy"): if loc.parts[-1].endswith(".spacy"):
with loc.open("rb") as file_: with loc.open("rb") as file_:
doc_bin = DocBin().from_bytes(file_.read()) doc_bin = DocBin().from_bytes(file_.read())
docs = list(doc_bin.get_docs(self.vocab)) yield from doc_bin.get_docs(vocab)
assert len(docs) % 2 == 0
# Pair up the docs into the (predicted, reference) pairs.
for i in range(0, len(docs), 2):
predicted = docs[i]
reference = docs[i+1]
yield Example(predicted, reference)
def count_train(self): def count_train(self, nlp):
"""Returns count of words in train examples""" """Returns count of words in train examples"""
n = 0 n = 0
i = 0 i = 0
for example in self.train_dataset(): for example in self.train_dataset(nlp):
n += len(example.predicted) n += len(example.predicted)
if self.limit and i >= self.limit: if self.limit and i >= self.limit:
break break
i += 1 i += 1
return n return n
def train_dataset(self): def train_dataset(self, nlp, shuffle=True, **kwargs):
examples = self.read_docbin(self.walk_corpus(self.train_loc)) ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
examples = self.make_examples(nlp, ref_docs, **kwargs)
if shuffle:
examples = list(examples)
random.shuffle(examples) random.shuffle(examples)
yield from examples yield from examples
def dev_dataset(self): def dev_dataset(self, nlp):
examples = self.read_docbin(self.walk_corpus(self.dev_loc)) ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
random.shuffle(examples) examples = self.make_examples(nlp, ref_docs, **kwargs)
yield from examples yield from examples