mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Support sentence limits in GoldCorpus
This commit is contained in:
parent
e2136232f9
commit
c9760b2104
|
@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
|
|||
class GoldCorpus(object):
|
||||
"""An annotated corpus, using the JSON file format. Manages
|
||||
annotations for tagging, dependency parsing and NER."""
|
||||
def __init__(self, train_path, dev_path):
|
||||
def __init__(self, train_path, dev_path, limit=None):
|
||||
"""Create a GoldCorpus.
|
||||
|
||||
train_path (unicode or Path): File or directory of training data.
|
||||
|
@ -152,20 +152,31 @@ class GoldCorpus(object):
|
|||
"""
|
||||
self.train_path = util.ensure_path(train_path)
|
||||
self.dev_path = util.ensure_path(dev_path)
|
||||
self.limit = limit
|
||||
self.train_locs = self.walk_corpus(self.train_path)
|
||||
self.dev_locs = self.walk_corpus(self.dev_path)
|
||||
|
||||
@property
|
||||
def train_tuples(self):
|
||||
i = 0
|
||||
for loc in self.train_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
yield from gold_tuples
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += 1
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
|
||||
@property
|
||||
def dev_tuples(self):
|
||||
i = 0
|
||||
for loc in self.dev_locs:
|
||||
gold_tuples = read_json_file(loc)
|
||||
yield from gold_tuples
|
||||
for item in gold_tuples:
|
||||
yield item
|
||||
i += 1
|
||||
if self.limit and i >= self.limit:
|
||||
break
|
||||
|
||||
def count_train(self):
|
||||
n = 0
|
||||
|
@ -175,8 +186,7 @@ class GoldCorpus(object):
|
|||
|
||||
def train_docs(self, nlp, shuffle=0, gold_preproc=True,
|
||||
projectivize=False):
|
||||
if shuffle:
|
||||
random.shuffle(self.train_locs)
|
||||
train_tuples = self.train_tuples
|
||||
if projectivize:
|
||||
train_tuples = nonproj.preprocess_training_data(
|
||||
self.train_tuples)
|
||||
|
@ -185,13 +195,13 @@ class GoldCorpus(object):
|
|||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
|
||||
yield from gold_docs
|
||||
|
||||
def dev_docs(self, nlp):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples)
|
||||
def dev_docs(self, nlp, gold_preproc=True):
|
||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||
gold_docs = nlp.preprocess_gold(gold_docs)
|
||||
yield from gold_docs
|
||||
|
||||
@classmethod
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc=True):
|
||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc):
|
||||
for raw_text, paragraph_tuples in tuples:
|
||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||
gold_preproc)
|
||||
|
@ -275,7 +285,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
|||
ner.append(token.get('ner', '-'))
|
||||
sents.append([
|
||||
[ids, words, tags, heads, labels, ner],
|
||||
sent.get('brackets', [])])
|
||||
sent.get('brackets', [])])
|
||||
if sents:
|
||||
yield [paragraph.get('raw', None), sents]
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user