facilitate larger training files (#4827)

* add a warning for large training files and change the `start` variable from int to long

* declare a C type (long) for file_length
This commit is contained in:
Sofie Van Landeghem 2019-12-21 21:12:19 +01:00 committed by Ines Montani
parent cb4145adc7
commit 732142bf28
2 changed files with 11 additions and 3 deletions

View File

@ -105,6 +105,10 @@ class Warnings(object):
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
"previous components in the pipeline declare that they assign it.") "previous components in the pipeline declare that they assign it.")
W026 = ("Unable to set all sentence boundaries from dependency parses.") W026 = ("Unable to set all sentence boundaries from dependency parses.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
@add_codes @add_codes

View File

@ -13,7 +13,7 @@ import srsly
from .syntax import nonproj from .syntax import nonproj
from .tokens import Doc, Span from .tokens import Doc, Span
from .errors import Errors, AlignmentError from .errors import Errors, AlignmentError, user_warning, Warnings
from .compat import path2str from .compat import path2str
from . import util from . import util
from .util import minibatch, itershuffle from .util import minibatch, itershuffle
@ -557,12 +557,16 @@ def _json_iterate(loc):
loc = util.ensure_path(loc) loc = util.ensure_path(loc)
with loc.open("rb") as file_: with loc.open("rb") as file_:
py_raw = file_.read() py_raw = file_.read()
cdef long file_length = len(py_raw)
if file_length > 2 ** 30:
user_warning(Warnings.W027.format(size=file_length))
raw = <char*>py_raw raw = <char*>py_raw
cdef int square_depth = 0 cdef int square_depth = 0
cdef int curly_depth = 0 cdef int curly_depth = 0
cdef int inside_string = 0 cdef int inside_string = 0
cdef int escape = 0 cdef int escape = 0
cdef int start = -1 cdef long start = -1
cdef char c cdef char c
cdef char quote = ord('"') cdef char quote = ord('"')
cdef char backslash = ord("\\") cdef char backslash = ord("\\")
@ -570,7 +574,7 @@ def _json_iterate(loc):
cdef char close_square = ord("]") cdef char close_square = ord("]")
cdef char open_curly = ord("{") cdef char open_curly = ord("{")
cdef char close_curly = ord("}") cdef char close_curly = ord("}")
for i in range(len(py_raw)): for i in range(file_length):
c = raw[i] c = raw[i]
if escape: if escape:
escape = False escape = False