facilitate larger training files (#4827)

* add warning for large file and change start var to long
* add type for file_length
2025-10-18 17:54:17 +03:00 · 2019-12-21 21:12:19 +01:00 · 2019-12-21 21:12:19 +01:00 · 732142bf28
commit 732142bf28
parent cb4145adc7
2 changed files with 11 additions and 3 deletions
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -105,6 +105,10 @@ class Warnings(object):
    W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
            "previous components in the pipeline declare that they assign it.")
    W026 = ("Unable to set all sentence boundaries from dependency parses.")
+    W027 = ("Found a large training file of {size} bytes. Note that it may "
+            "be more efficient to split your training data into multiple "
+            "smaller JSON files instead.")
+


@add_codes
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -13,7 +13,7 @@ import srsly

 from .syntax import nonproj
 from .tokens import Doc, Span
-from .errors import Errors, AlignmentError
+from .errors import Errors, AlignmentError, user_warning, Warnings
 from .compat import path2str
 from . import util
 from .util import minibatch, itershuffle
@ -557,12 +557,16 @@ def _json_iterate(loc):
    loc = util.ensure_path(loc)
    with loc.open("rb") as file_:
        py_raw = file_.read()
+    cdef long file_length = len(py_raw)
+    if file_length > 2 ** 30:
+        user_warning(Warnings.W027.format(size=file_length))
+
    raw = <char*>py_raw
    cdef int square_depth = 0
    cdef int curly_depth = 0
    cdef int inside_string = 0
    cdef int escape = 0
-    cdef int start = -1
+    cdef long start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
@ -570,7 +574,7 @@ def _json_iterate(loc):
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
-    for i in range(len(py_raw)):
+    for i in range(file_length):
        c = raw[i]
        if escape:
            escape = False