From 732142bf2825be61824d453009cc0cea130c3b4b Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sat, 21 Dec 2019 21:12:19 +0100 Subject: [PATCH 1/2] facilitate larger training files (#4827) * add warning for large file and change start var to long * type for file_length --- spacy/errors.py | 4 ++++ spacy/gold.pyx | 10 +++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index dd2b38eb9..ce35d706c 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -105,6 +105,10 @@ class Warnings(object): W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " "previous components in the pipeline declare that they assign it.") W026 = ("Unable to set all sentence boundaries from dependency parses.") + W027 = ("Found a large training file of {size} bytes. Note that it may " + "be more efficient to split your training data into multiple " + "smaller JSON files instead.") + @add_codes diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1a74d2206..1d7f80c92 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -13,7 +13,7 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span -from .errors import Errors, AlignmentError +from .errors import Errors, AlignmentError, user_warning, Warnings from .compat import path2str from . import util from .util import minibatch, itershuffle @@ -557,12 +557,16 @@ def _json_iterate(loc): loc = util.ensure_path(loc) with loc.open("rb") as file_: py_raw = file_.read() + cdef long file_length = len(py_raw) + if file_length > 2 ** 30: + user_warning(Warnings.W027.format(size=file_length)) + raw = py_raw cdef int square_depth = 0 cdef int curly_depth = 0 cdef int inside_string = 0 cdef int escape = 0 - cdef int start = -1 + cdef long start = -1 cdef char c cdef char quote = ord('"') cdef char backslash = ord("\\") @@ -570,7 +574,7 @@ def _json_iterate(loc): cdef char close_square = ord("]") cdef char open_curly = ord("{") cdef char close_curly = ord("}") - for i in range(len(py_raw)): + for i in range(file_length): c = raw[i] if escape: escape = False From 7c69d30de5aa58d330a183a0e5015e67c36ca7bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 21 Dec 2019 21:14:52 +0100 Subject: [PATCH 2/2] Tidy up and expect warning --- spacy/tests/regression/test_issue4674.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 5f8d1573f..8d0c32eaa 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,11 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals +import pytest from spacy.kb import KnowledgeBase from spacy.util import ensure_path - from spacy.lang.en import English -from spacy.tests.util import make_tempdir + +from ..tests.util import make_tempdir def test_issue4674(): @@ -15,9 +16,12 @@ def test_issue4674(): vector1 = [0.9, 1.1, 1.01] vector2 = [1.8, 2.25, 2.01] - kb.set_entities( - entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] - ) + with pytest.warns(UserWarning): + kb.set_entities( + entity_list=["Q1", "Q1"], + freq_list=[32, 111], + vector_list=[vector1, vector2], + ) assert kb.get_size_entities() == 1