diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py
index 3ca821893..342f94848 100644
--- a/spacy/gold/converters/json2docs.py
+++ b/spacy/gold/converters/json2docs.py
@@ -1,27 +1,22 @@
-import tempfile
-import contextlib
-import shutil
-from pathlib import Path
-from ..gold_io import json_to_annotations
+import srsly
+from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
 
-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
+    if not isinstance(input_data, bytes):
+        if not isinstance(input_data, str):
+            input_data = srsly.json_dumps(input_data)
+        input_data = input_data.encode("utf8")
     docs = []
-    for json_annot in json_to_annotations(input_data):
-        example_dict = _fix_legacy_dict_data(json_annot)
-        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-        docs.append(doc)
+    for json_doc in json_iterate(input_data):
+        for json_para in json_to_annotations(json_doc):
+            example_dict = _fix_legacy_dict_data(json_para)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            docs.append(doc)
     return docs
diff --git a/spacy/gold/gold_io.pyx b/spacy/gold/gold_io.pyx
index aa8273bfb..10ca427ed 100644
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -75,11 +75,13 @@ def read_json_file(loc, docs_filter=None, limit=None):
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
     else:
-        for doc in json_iterate(loc):
-            if docs_filter is not None and not docs_filter(doc):
+        with loc.open("rb") as file_:
+            utf8_str = file_.read()
+        for json_doc in json_iterate(utf8_str):
+            if docs_filter is not None and not docs_filter(json_doc):
                 continue
-            for json_data in json_to_annotations(doc):
-                yield json_data
+            for json_paragraph in json_to_annotations(json_doc):
+                yield json_paragraph
 
 
 def json_to_annotations(doc):
@@ -169,19 +171,15 @@ def json_to_annotations(doc):
     )
     yield example
 
 
-def json_iterate(loc):
+def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
     # It's okay to read in the whole file -- just don't parse it into JSON.
-    cdef bytes py_raw
-    loc = util.ensure_path(loc)
-    with loc.open("rb") as file_:
-        py_raw = file_.read()
-    cdef long file_length = len(py_raw)
+    cdef long file_length = len(utf8_str)
     if file_length > 2 ** 30:
         warnings.warn(Warnings.W027.format(size=file_length))
-    raw = py_raw
+    raw = utf8_str
     cdef int square_depth = 0
     cdef int curly_depth = 0
     cdef int inside_string = 0
@@ -218,10 +216,6 @@ def json_iterate(loc):
             elif c == close_curly:
                 curly_depth -= 1
                 if square_depth == 1 and curly_depth == 0:
-                    py_str = py_raw[start : i + 1].decode("utf8")
-                    try:
-                        yield srsly.json_loads(py_str)
-                    except Exception:
-                        print(py_str)
-                        raise
+                    substr = utf8_str[start : i + 1].decode("utf8")
+                    yield srsly.json_loads(substr)
                 start = -1
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
index 462bb8ea1..fc05444d5 100644
--- a/spacy/tests/regression/test_issue4402.py
+++ b/spacy/tests/regression/test_issue4402.py
@@ -10,7 +10,7 @@ def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs(json_data)
+        docs = json2docs([json_data])
        data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)