diff --git a/spacy/errors.py b/spacy/errors.py index ebbd314cd..3dab4e1fb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,6 +105,10 @@ class Warnings(object): W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " "previous components in the pipeline declare that they assign it.") W026 = ("Unable to set all sentence boundaries from dependency parses.") + W027 = ("Found a large training file of {size} bytes. Note that it may " + "be more efficient to split your training data into multiple " + "smaller JSON files instead.") + @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d3316c5d0..0374825dc 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -13,7 +13,7 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError +from .errors import Errors, AlignmentError, user_warning, Warnings from .compat import path2str, basestring_ from . import util @@ -537,12 +537,16 @@ def _json_iterate(loc): loc = util.ensure_path(loc) with loc.open("rb") as file_: py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + user_warning(Warnings.W027.format(size=file_length)) + raw = py_raw cdef int square_depth = 0 cdef int curly_depth = 0 cdef int inside_string = 0 cdef int escape = 0 - cdef int start = -1 + cdef long start = -1 cdef char c cdef char quote = ord('"') cdef char backslash = ord("\\") @@ -550,7 +554,7 @@ def _json_iterate(loc): cdef char close_square = ord("]") cdef char open_curly = ord("{") cdef char close_curly = ord("}") - for i in range(len(py_raw)): + for i in range(file_length): c = raw[i] if escape: escape = False diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 5f8d1573f..8d0c32eaa 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest from spacy.kb 
import KnowledgeBase from spacy.util import ensure_path - from spacy.lang.en import English -from spacy.tests.util import make_tempdir + +from ..util import make_tempdir def test_issue4674(): @@ -15,9 +16,12 @@ def test_issue4674(): vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] - kb.set_entities( - entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] - ) + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) assert kb.get_size_entities() == 1