Revert "Remove peeking from Parser.begin_training (#5456)"

This reverts commit 9393253b66. The model shouldn't need to see all examples, and in v3 there is actually no equivalent step. All examples are provided to the component so that the component can do things like work out the labels. The model itself only needs to do things like shape inference.
2025-08-07 05:40:20 +03:00 · 2020-05-29 23:21:55 +02:00 · 2020-05-29 23:21:55 +02:00 · 64adda3202
commit 64adda3202
parent 85f1acfaa0
1 changed file with 8 additions and 7 deletions
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -9,6 +9,7 @@ import numpy
 cimport cython.parallel
 import numpy.random
 cimport numpy as np
+from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -620,15 +621,15 @@ cdef class Parser:
            self.model, cfg = self.Model(self.moves.n_moves, **cfg)
            if sgd is None:
                sgd = self.create_optimizer()
-            docs = []
-            golds = []
-            for raw_text, annots_brackets in get_gold_tuples():
+            doc_sample = []
+            gold_sample = []
+            for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
                for annots, brackets in annots_brackets:
                    ids, words, tags, heads, deps, ents = annots
-                    docs.append(Doc(self.vocab, words=words))
-                    golds.append(GoldParse(docs[-1], words=words, tags=tags,
-                                           heads=heads, deps=deps, entities=ents))
-            self.model.begin_training(docs, golds)
+                    doc_sample.append(Doc(self.vocab, words=words))
+                    gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
+                                                 heads=heads, deps=deps, entities=ents))
+            self.model.begin_training(doc_sample, gold_sample)
            if pipeline is not None:
                self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
            link_vectors_to_models(self.vocab)