mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-11 17:10:36 +03:00
Support gold_preproc in Corpus
This commit is contained in:
parent
afe6ee4548
commit
2d34d2f24a
|
@ -1,7 +1,7 @@
|
||||||
import random
|
import random
|
||||||
from .. import util
|
from .. import util
|
||||||
from .example import Example
|
from .example import Example
|
||||||
from ..tokens import DocBin
|
from ..tokens import DocBin, Doc
|
||||||
|
|
||||||
|
|
||||||
class Corpus:
|
class Corpus:
|
||||||
|
@ -48,6 +48,17 @@ class Corpus:
|
||||||
predicted = nlp.make_doc(reference.text)
|
predicted = nlp.make_doc(reference.text)
|
||||||
yield Example(predicted, reference)
|
yield Example(predicted, reference)
|
||||||
|
|
||||||
|
def make_examples_gold_preproc(self, nlp, reference_docs):
    """Yield one Example per gold-standard sentence, reusing gold tokenization.

    Every reference doc is split into its sentences and each sentence is
    turned into a standalone reference doc. The predicted doc is built
    directly from the reference's token texts and trailing-whitespace flags
    rather than from raw text, so both sides of the Example share the same
    tokenization.

    If a reference doc cannot be iterated by sentence, it is used whole as
    a single unit instead of aborting the generator.

    nlp: Pipeline object; only `nlp.vocab` is used here.
    reference_docs: Iterable of gold-standard docs.
    YIELDS (Example): A (predicted, reference) pair per sentence.
    """
    for whole_reference in reference_docs:
        try:
            # NOTE(review): Doc.sents is documented to raise ValueError when
            # no sentence boundaries are set; only that access is guarded so
            # errors from later steps still propagate.
            sentences = list(whole_reference.sents)
        except ValueError:
            references = [whole_reference]
        else:
            references = [sent.as_doc() for sent in sentences]
        for reference in references:
            predicted = Doc(
                nlp.vocab,
                words=[t.text for t in reference],
                spaces=[bool(t.whitespace_) for t in reference],
            )
            yield Example(predicted, reference)
|
||||||
|
|
||||||
def read_docbin(self, vocab, locs):
|
def read_docbin(self, vocab, locs):
|
||||||
""" Yield training examples as example dicts """
|
""" Yield training examples as example dicts """
|
||||||
i = 0
|
i = 0
|
||||||
|
@ -72,15 +83,21 @@ class Corpus:
|
||||||
i += 1
|
i += 1
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def train_dataset(self, nlp, *, shuffle=True, gold_preproc=False, **kwargs):
    """Yield training examples read from `self.train_loc`.

    nlp: Pipeline object; its vocab is used to read the stored docs and it
        is passed on to the example builder.
    shuffle: If true, all examples are collected into a list and shuffled
        in place with `random.shuffle` before being yielded.
    gold_preproc: If true, examples are built with
        `make_examples_gold_preproc`; otherwise with `make_examples`.
    **kwargs: Accepted but not used by this method.
    YIELDS: The examples produced by the selected builder.
    """
    ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.train_loc))
    build = self.make_examples_gold_preproc if gold_preproc else self.make_examples
    examples = build(nlp, ref_docs)
    if shuffle:
        # Shuffling needs the full sequence, so the generator is materialized.
        examples = list(examples)
        random.shuffle(examples)
    yield from examples
|
||||||
|
|
||||||
def dev_dataset(self, nlp, *, gold_preproc=False, **kwargs):
    """Yield development examples read from `self.dev_loc`, in stored order.

    nlp: Pipeline object; its vocab is used to read the stored docs and it
        is passed on to the example builder.
    gold_preproc: If true, examples are built with
        `make_examples_gold_preproc`; otherwise with `make_examples`.
    **kwargs: Accepted but not used by this method.
    YIELDS: The examples produced by the selected builder.
    """
    ref_docs = self.read_docbin(nlp.vocab, self.walk_corpus(self.dev_loc))
    if gold_preproc:
        yield from self.make_examples_gold_preproc(nlp, ref_docs)
    else:
        yield from self.make_examples(nlp, ref_docs)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user