Remove peeking from Parser.begin_training (#5456)

Inspect all instances in `Parser.begin_training` rather than only the
first 1000.
This commit is contained in:
adrianeboyd 2020-05-20 15:18:06 +02:00 committed by GitHub
parent 40e65d6f63
commit 9393253b66
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -9,7 +9,6 @@ import numpy
cimport cython.parallel
import numpy.random
cimport numpy as np
from itertools import islice
from cpython.ref cimport PyObject, Py_XDECREF
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from libc.math cimport exp
@@ -621,15 +620,15 @@ cdef class Parser:
self.model, cfg = self.Model(self.moves.n_moves, **cfg)
if sgd is None:
sgd = self.create_optimizer()
doc_sample = []
gold_sample = []
for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
docs = []
golds = []
for raw_text, annots_brackets in get_gold_tuples():
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
doc_sample.append(Doc(self.vocab, words=words))
gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
docs.append(Doc(self.vocab, words=words))
golds.append(GoldParse(docs[-1], words=words, tags=tags,
heads=heads, deps=deps, entities=ents))
self.model.begin_training(doc_sample, gold_sample)
self.model.begin_training(docs, golds)
if pipeline is not None:
self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
link_vectors_to_models(self.vocab)