mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-20 21:40:35 +03:00
Support sentence limits in GoldCorpus
This commit is contained in:
parent
e2136232f9
commit
c9760b2104
|
@ -144,7 +144,7 @@ def _min_edit_path(cand_words, gold_words):
|
||||||
class GoldCorpus(object):
|
class GoldCorpus(object):
|
||||||
"""An annotated corpus, using the JSON file format. Manages
|
"""An annotated corpus, using the JSON file format. Manages
|
||||||
annotations for tagging, dependency parsing and NER."""
|
annotations for tagging, dependency parsing and NER."""
|
||||||
def __init__(self, train_path, dev_path):
|
def __init__(self, train_path, dev_path, limit=None):
|
||||||
"""Create a GoldCorpus.
|
"""Create a GoldCorpus.
|
||||||
|
|
||||||
train_path (unicode or Path): File or directory of training data.
|
train_path (unicode or Path): File or directory of training data.
|
||||||
|
@ -152,20 +152,31 @@ class GoldCorpus(object):
|
||||||
"""
|
"""
|
||||||
self.train_path = util.ensure_path(train_path)
|
self.train_path = util.ensure_path(train_path)
|
||||||
self.dev_path = util.ensure_path(dev_path)
|
self.dev_path = util.ensure_path(dev_path)
|
||||||
|
self.limit = limit
|
||||||
self.train_locs = self.walk_corpus(self.train_path)
|
self.train_locs = self.walk_corpus(self.train_path)
|
||||||
self.dev_locs = self.walk_corpus(self.dev_path)
|
self.dev_locs = self.walk_corpus(self.dev_path)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def train_tuples(self):
|
def train_tuples(self):
|
||||||
|
i = 0
|
||||||
for loc in self.train_locs:
|
for loc in self.train_locs:
|
||||||
gold_tuples = read_json_file(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
yield from gold_tuples
|
for item in gold_tuples:
|
||||||
|
yield item
|
||||||
|
i += 1
|
||||||
|
if self.limit and i >= self.limit:
|
||||||
|
break
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def dev_tuples(self):
|
def dev_tuples(self):
|
||||||
|
i = 0
|
||||||
for loc in self.dev_locs:
|
for loc in self.dev_locs:
|
||||||
gold_tuples = read_json_file(loc)
|
gold_tuples = read_json_file(loc)
|
||||||
yield from gold_tuples
|
for item in gold_tuples:
|
||||||
|
yield item
|
||||||
|
i += 1
|
||||||
|
if self.limit and i >= self.limit:
|
||||||
|
break
|
||||||
|
|
||||||
def count_train(self):
|
def count_train(self):
|
||||||
n = 0
|
n = 0
|
||||||
|
@ -175,8 +186,7 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
def train_docs(self, nlp, shuffle=0, gold_preproc=True,
|
def train_docs(self, nlp, shuffle=0, gold_preproc=True,
|
||||||
projectivize=False):
|
projectivize=False):
|
||||||
if shuffle:
|
train_tuples = self.train_tuples
|
||||||
random.shuffle(self.train_locs)
|
|
||||||
if projectivize:
|
if projectivize:
|
||||||
train_tuples = nonproj.preprocess_training_data(
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
self.train_tuples)
|
self.train_tuples)
|
||||||
|
@ -185,13 +195,13 @@ class GoldCorpus(object):
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc)
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
def dev_docs(self, nlp):
|
def dev_docs(self, nlp, gold_preproc=True):
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples)
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||||
gold_docs = nlp.preprocess_gold(gold_docs)
|
gold_docs = nlp.preprocess_gold(gold_docs)
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_gold_docs(cls, nlp, tuples, gold_preproc=True):
|
def iter_gold_docs(cls, nlp, tuples, gold_preproc):
|
||||||
for raw_text, paragraph_tuples in tuples:
|
for raw_text, paragraph_tuples in tuples:
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
gold_preproc)
|
gold_preproc)
|
||||||
|
@ -275,7 +285,7 @@ def read_json_file(loc, docs_filter=None, limit=None):
|
||||||
ner.append(token.get('ner', '-'))
|
ner.append(token.get('ner', '-'))
|
||||||
sents.append([
|
sents.append([
|
||||||
[ids, words, tags, heads, labels, ner],
|
[ids, words, tags, heads, labels, ner],
|
||||||
sent.get('brackets', [])])
|
sent.get('brackets', [])])
|
||||||
if sents:
|
if sents:
|
||||||
yield [paragraph.get('raw', None), sents]
|
yield [paragraph.get('raw', None), sents]
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user