Support sentence limits in GoldCorpus

This commit is contained in:
Matthew Honnibal 2017-05-22 10:40:46 -05:00
parent e2136232f9
commit c9760b2104

View File

@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
class GoldCorpus(object): class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages """An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.""" annotations for tagging, dependency parsing and NER."""
def __init__(self, train_path, dev_path): def __init__(self, train_path, dev_path, limit=None):
"""Create a GoldCorpus. """Create a GoldCorpus.
train_path (unicode or Path): File or directory of training data. train_path (unicode or Path): File or directory of training data.
@ -152,20 +152,31 @@ class GoldCorpus(object):
""" """
self.train_path = util.ensure_path(train_path) self.train_path = util.ensure_path(train_path)
self.dev_path = util.ensure_path(dev_path) self.dev_path = util.ensure_path(dev_path)
self.limit = limit
self.train_locs = self.walk_corpus(self.train_path) self.train_locs = self.walk_corpus(self.train_path)
self.dev_locs = self.walk_corpus(self.dev_path) self.dev_locs = self.walk_corpus(self.dev_path)
@property @property
def train_tuples(self): def train_tuples(self):
i = 0
for loc in self.train_locs: for loc in self.train_locs:
gold_tuples = read_json_file(loc) gold_tuples = read_json_file(loc)
yield from gold_tuples for item in gold_tuples:
yield item
i += 1
if self.limit and i >= self.limit:
break
@property @property
def dev_tuples(self): def dev_tuples(self):
i = 0
for loc in self.dev_locs: for loc in self.dev_locs:
gold_tuples = read_json_file(loc) gold_tuples = read_json_file(loc)
yield from gold_tuples for item in gold_tuples:
yield item
i += 1
if self.limit and i >= self.limit:
break
def count_train(self): def count_train(self):
n = 0 n = 0
@ -175,8 +186,7 @@ class GoldCorpus(object):
def train_docs(self, nlp, shuffle=0, gold_preproc=True, def train_docs(self, nlp, shuffle=0, gold_preproc=True,
projectivize=False): projectivize=False):
if shuffle: train_tuples = self.train_tuples
random.shuffle(self.train_locs)
if projectivize: if projectivize:
train_tuples = nonproj.preprocess_training_data( train_tuples = nonproj.preprocess_training_data(
self.train_tuples) self.train_tuples)
@ -185,13 +195,13 @@ class GoldCorpus(object):
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
yield from gold_docs yield from gold_docs
def dev_docs(self, nlp): def dev_docs(self, nlp, gold_preproc=True):
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples) gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
gold_docs = nlp.preprocess_gold(gold_docs) gold_docs = nlp.preprocess_gold(gold_docs)
yield from gold_docs yield from gold_docs
@classmethod @classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc=True): def iter_gold_docs(cls, nlp, tuples, gold_preproc):
for raw_text, paragraph_tuples in tuples: for raw_text, paragraph_tuples in tuples:
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
gold_preproc) gold_preproc)
@ -275,7 +285,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
ner.append(token.get('ner', '-')) ner.append(token.get('ner', '-'))
sents.append([ sents.append([
[ids, words, tags, heads, labels, ner], [ids, words, tags, heads, labels, ner],
sent.get('brackets', [])]) sent.get('brackets', [])])
if sents: if sents:
yield [paragraph.get('raw', None), sents] yield [paragraph.get('raw', None), sents]