mirror of
https://github.com/explosion/spaCy.git
synced 2025-05-29 18:23:06 +03:00
Stream the gold data during training, to reduce memory
This commit is contained in:
parent
7c0823ad76
commit
c3d168509a
|
@@ -13,7 +13,7 @@ from . import _align
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc
|
from .tokens import Doc
|
||||||
from . import util
|
from . import util
|
||||||
from .util import minibatch
|
from .util import minibatch, itershuffle
|
||||||
|
|
||||||
|
|
||||||
def tags_to_entities(tags):
|
def tags_to_entities(tags):
|
||||||
|
@@ -133,15 +133,14 @@ class GoldCorpus(object):
|
||||||
def train_docs(self, nlp, gold_preproc=False,
|
def train_docs(self, nlp, gold_preproc=False,
|
||||||
projectivize=False, max_length=None,
|
projectivize=False, max_length=None,
|
||||||
noise_level=0.0):
|
noise_level=0.0):
|
||||||
train_tuples = list(self.train_tuples)
|
|
||||||
if projectivize:
|
if projectivize:
|
||||||
train_tuples = nonproj.preprocess_training_data(
|
train_tuples = nonproj.preprocess_training_data(
|
||||||
self.train_tuples, label_freq_cutoff=30)
|
self.train_tuples, label_freq_cutoff=30)
|
||||||
random.shuffle(train_tuples)
|
random.shuffle(self.train_locs)
|
||||||
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
|
||||||
max_length=max_length,
|
max_length=max_length,
|
||||||
noise_level=noise_level)
|
noise_level=noise_level)
|
||||||
yield from gold_docs
|
yield from itershuffle(gold_docs, bufsize=100)
|
||||||
|
|
||||||
def dev_docs(self, nlp, gold_preproc=False):
|
def dev_docs(self, nlp, gold_preproc=False):
|
||||||
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user