mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Support gold preprocessing and single gold files
This commit is contained in:
parent
e14533757b
commit
f13d6c7359
|
@ -168,33 +168,41 @@ class GoldCorpus(object):
|
||||||
n += 1
|
n += 1
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def train_docs(self, nlp, shuffle=0):
|
def train_docs(self, nlp, shuffle=0, gold_preproc=True):
|
||||||
if shuffle:
|
if shuffle:
|
||||||
random.shuffle(self.train_locs)
|
random.shuffle(self.train_locs)
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.train_tuples)
|
gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc)
|
||||||
if shuffle:
|
if shuffle:
|
||||||
gold_docs = util.itershuffle(gold_docs, bufsize=shuffle*1000)
|
gold_docs = util.itershuffle(gold_docs, bufsize=shuffle*1000)
|
||||||
|
gold_docs = nlp.preprocess_gold(gold_docs)
|
||||||
yield from gold_docs
|
yield from gold_docs
|
||||||
|
|
||||||
def dev_docs(self, nlp):
|
def dev_docs(self, nlp):
|
||||||
yield from self.iter_gold_docs(nlp, self.dev_tuples)
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples)
|
||||||
|
gold_docs = nlp.preprocess_gold(gold_docs)
|
||||||
|
yield from gold_docs
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def iter_gold_docs(cls, nlp, tuples):
|
def iter_gold_docs(cls, nlp, tuples, gold_preproc=True):
|
||||||
|
tuples = nonproj.PseudoProjectivity.preprocess_training_data(tuples)
|
||||||
for raw_text, paragraph_tuples in tuples:
|
for raw_text, paragraph_tuples in tuples:
|
||||||
docs = cls._make_docs(nlp, raw_text, paragraph_tuples)
|
docs = cls._make_docs(nlp, raw_text, paragraph_tuples,
|
||||||
|
gold_preproc)
|
||||||
golds = cls._make_golds(docs, paragraph_tuples)
|
golds = cls._make_golds(docs, paragraph_tuples)
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_docs(cls, nlp, raw_text, paragraph_tuples):
|
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc):
|
||||||
if raw_text is not None:
|
if gold_preproc:
|
||||||
|
return [Doc(nlp.vocab, words=sent_tuples[0][1])
|
||||||
|
for sent_tuples in paragraph_tuples]
|
||||||
|
elif raw_text is not None:
|
||||||
return [nlp.make_doc(raw_text)]
|
return [nlp.make_doc(raw_text)]
|
||||||
else:
|
else:
|
||||||
return [
|
docs = [Doc(nlp.vocab, words=sent_tuples[0][1])
|
||||||
Doc(nlp.vocab, words=sent_tuples[0][1])
|
|
||||||
for sent_tuples in paragraph_tuples]
|
for sent_tuples in paragraph_tuples]
|
||||||
|
return merge_sents(docs)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _make_golds(cls, docs, paragraph_tuples):
|
def _make_golds(cls, docs, paragraph_tuples):
|
||||||
|
@ -207,8 +215,10 @@ class GoldCorpus(object):
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def walk_corpus(path):
|
def walk_corpus(path):
|
||||||
locs = []
|
if not path.is_dir():
|
||||||
|
return [path]
|
||||||
paths = [path]
|
paths = [path]
|
||||||
|
locs = []
|
||||||
seen = set()
|
seen = set()
|
||||||
for path in paths:
|
for path in paths:
|
||||||
if str(path) in seen:
|
if str(path) in seen:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user