Merge branch 'master' into develop

This commit is contained in:
Ines Montani 2019-12-21 21:15:46 +01:00
commit de33b6d566
3 changed files with 20 additions and 8 deletions

View File

@@ -105,6 +105,10 @@ class Warnings(object):
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
"previous components in the pipeline declare that they assign it.") "previous components in the pipeline declare that they assign it.")
W026 = ("Unable to set all sentence boundaries from dependency parses.") W026 = ("Unable to set all sentence boundaries from dependency parses.")
W027 = ("Found a large training file of {size} bytes. Note that it may "
"be more efficient to split your training data into multiple "
"smaller JSON files instead.")
@add_codes @add_codes

View File

@@ -13,7 +13,7 @@ import srsly
from .syntax import nonproj from .syntax import nonproj
from .tokens import Doc, Span from .tokens import Doc, Span
from .errors import Errors, AlignmentError from .errors import Errors, AlignmentError, user_warning, Warnings
from .compat import path2str, basestring_ from .compat import path2str, basestring_
from . import util from . import util
@@ -537,12 +537,16 @@ def _json_iterate(loc):
loc = util.ensure_path(loc) loc = util.ensure_path(loc)
with loc.open("rb") as file_: with loc.open("rb") as file_:
py_raw = file_.read() py_raw = file_.read()
cdef long file_length = len(py_raw)
if file_length > 2 ** 30:
user_warning(Warnings.W027.format(size=file_length))
raw = <char*>py_raw raw = <char*>py_raw
cdef int square_depth = 0 cdef int square_depth = 0
cdef int curly_depth = 0 cdef int curly_depth = 0
cdef int inside_string = 0 cdef int inside_string = 0
cdef int escape = 0 cdef int escape = 0
cdef int start = -1 cdef long start = -1
cdef char c cdef char c
cdef char quote = ord('"') cdef char quote = ord('"')
cdef char backslash = ord("\\") cdef char backslash = ord("\\")
@@ -550,7 +554,7 @@ def _json_iterate(loc):
cdef char close_square = ord("]") cdef char close_square = ord("]")
cdef char open_curly = ord("{") cdef char open_curly = ord("{")
cdef char close_curly = ord("}") cdef char close_curly = ord("}")
for i in range(len(py_raw)): for i in range(file_length):
c = raw[i] c = raw[i]
if escape: if escape:
escape = False escape = False

View File

@@ -1,11 +1,12 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest
from spacy.kb import KnowledgeBase from spacy.kb import KnowledgeBase
from spacy.util import ensure_path from spacy.util import ensure_path
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tests.util import make_tempdir
from ..tests.util import make_tempdir
def test_issue4674(): def test_issue4674():
@@ -15,8 +16,11 @@ def test_issue4674():
vector1 = [0.9, 1.1, 1.01] vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01] vector2 = [1.8, 2.25, 2.01]
with pytest.warns(UserWarning):
kb.set_entities( kb.set_entities(
entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2] entity_list=["Q1", "Q1"],
freq_list=[32, 111],
vector_list=[vector1, vector2],
) )
assert kb.get_size_entities() == 1 assert kb.get_size_entities() == 1