mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 01:34:30 +03:00
Merge branch 'master' into develop
This commit is contained in:
commit
de33b6d566
|
@ -105,6 +105,10 @@ class Warnings(object):
|
||||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
||||||
"previous components in the pipeline declare that they assign it.")
|
"previous components in the pipeline declare that they assign it.")
|
||||||
W026 = ("Unable to set all sentence boundaries from dependency parses.")
|
W026 = ("Unable to set all sentence boundaries from dependency parses.")
|
||||||
|
W027 = ("Found a large training file of {size} bytes. Note that it may "
|
||||||
|
"be more efficient to split your training data into multiple "
|
||||||
|
"smaller JSON files instead.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@add_codes
|
@add_codes
|
||||||
|
|
|
@ -13,7 +13,7 @@ import srsly
|
||||||
|
|
||||||
from .syntax import nonproj
|
from .syntax import nonproj
|
||||||
from .tokens import Doc, Span
|
from .tokens import Doc, Span
|
||||||
from .errors import Errors, AlignmentError
|
from .errors import Errors, AlignmentError, user_warning, Warnings
|
||||||
from .compat import path2str, basestring_
|
from .compat import path2str, basestring_
|
||||||
from . import util
|
from . import util
|
||||||
|
|
||||||
|
@ -537,12 +537,16 @@ def _json_iterate(loc):
|
||||||
loc = util.ensure_path(loc)
|
loc = util.ensure_path(loc)
|
||||||
with loc.open("rb") as file_:
|
with loc.open("rb") as file_:
|
||||||
py_raw = file_.read()
|
py_raw = file_.read()
|
||||||
|
cdef long file_length = len(py_raw)
|
||||||
|
if file_length > 2 ** 30:
|
||||||
|
user_warning(Warnings.W027.format(size=file_length))
|
||||||
|
|
||||||
raw = <char*>py_raw
|
raw = <char*>py_raw
|
||||||
cdef int square_depth = 0
|
cdef int square_depth = 0
|
||||||
cdef int curly_depth = 0
|
cdef int curly_depth = 0
|
||||||
cdef int inside_string = 0
|
cdef int inside_string = 0
|
||||||
cdef int escape = 0
|
cdef int escape = 0
|
||||||
cdef int start = -1
|
cdef long start = -1
|
||||||
cdef char c
|
cdef char c
|
||||||
cdef char quote = ord('"')
|
cdef char quote = ord('"')
|
||||||
cdef char backslash = ord("\\")
|
cdef char backslash = ord("\\")
|
||||||
|
@ -550,7 +554,7 @@ def _json_iterate(loc):
|
||||||
cdef char close_square = ord("]")
|
cdef char close_square = ord("]")
|
||||||
cdef char open_curly = ord("{")
|
cdef char open_curly = ord("{")
|
||||||
cdef char close_curly = ord("}")
|
cdef char close_curly = ord("}")
|
||||||
for i in range(len(py_raw)):
|
for i in range(file_length):
|
||||||
c = raw[i]
|
c = raw[i]
|
||||||
if escape:
|
if escape:
|
||||||
escape = False
|
escape = False
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
from spacy.kb import KnowledgeBase
|
from spacy.kb import KnowledgeBase
|
||||||
from spacy.util import ensure_path
|
from spacy.util import ensure_path
|
||||||
|
|
||||||
from spacy.lang.en import English
|
from spacy.lang.en import English
|
||||||
from spacy.tests.util import make_tempdir
|
|
||||||
|
from ..tests.util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_issue4674():
|
def test_issue4674():
|
||||||
|
@ -15,8 +16,11 @@ def test_issue4674():
|
||||||
|
|
||||||
vector1 = [0.9, 1.1, 1.01]
|
vector1 = [0.9, 1.1, 1.01]
|
||||||
vector2 = [1.8, 2.25, 2.01]
|
vector2 = [1.8, 2.25, 2.01]
|
||||||
|
with pytest.warns(UserWarning):
|
||||||
kb.set_entities(
|
kb.set_entities(
|
||||||
entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2]
|
entity_list=["Q1", "Q1"],
|
||||||
|
freq_list=[32, 111],
|
||||||
|
vector_list=[vector1, vector2],
|
||||||
)
|
)
|
||||||
|
|
||||||
assert kb.get_size_entities() == 1
|
assert kb.get_size_entities() == 1
|
||||||
|
|
Loading…
Reference in New Issue
Block a user