Fix json2docs converter
This commit is contained in:
parent 7376518af2
commit 8722b65bce
@@ -1,27 +1,22 @@
-import tempfile
-import contextlib
-import shutil
-from pathlib import Path
-from ..gold_io import json_to_annotations
+import srsly
+from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
 
-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
+    if not isinstance(input_data, bytes):
+        if not isinstance(input_data, str):
+            input_data = srsly.json_dumps(input_data)
+        input_data = input_data.encode("utf8")
     docs = []
-    for json_annot in json_to_annotations(input_data):
-        example_dict = _fix_legacy_dict_data(json_annot)
-        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-        docs.append(doc)
+    for json_doc in json_iterate(input_data):
+        for json_para in json_to_annotations(json_doc):
+            example_dict = _fix_legacy_dict_data(json_para)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            docs.append(doc)
     return docs
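After this change, json2docs accepts a Python object, a JSON string, or UTF-8 bytes, and normalizes all three to bytes before handing off to json_iterate. A minimal usage sketch, not part of the commit — the training-data dict below is a hypothetical example in spaCy's JSON training format:

    # Hypothetical JSON training data: one document with one paragraph
    # of tokenized sentences.
    json_data = {
        "id": 0,
        "paragraphs": [
            {
                "raw": "Hello world.",
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "Hello", "head": 1, "dep": "intj", "ner": "O"},
                            {"id": 1, "orth": "world", "head": 0, "dep": "ROOT", "ner": "O"},
                            {"id": 2, "orth": ".", "head": -1, "dep": "punct", "ner": "O"},
                        ]
                    }
                ],
            }
        ],
    }
    # Note the list wrapper: json_iterate scans a top-level JSON array,
    # which is why the regression test below changes to json2docs([json_data]).
    docs = json2docs([json_data])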
@@ -75,11 +75,13 @@ def read_json_file(loc, docs_filter=None, limit=None):
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
     else:
-        for doc in json_iterate(loc):
-            if docs_filter is not None and not docs_filter(doc):
+        with loc.open("rb") as file_:
+            utf8_str = file_.read()
+        for json_doc in json_iterate(utf8_str):
+            if docs_filter is not None and not docs_filter(json_doc):
                 continue
-            for json_data in json_to_annotations(doc):
-                yield json_data
+            for json_paragraph in json_to_annotations(json_doc):
+                yield json_paragraph
 
 
 def json_to_annotations(doc):
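Note the changed contract here: json_iterate no longer opens files itself, so read_json_file reads the raw bytes and passes them in. A sketch of the new calling pattern (the path is illustrative):

    from pathlib import Path

    loc = Path("training_data.json")
    with loc.open("rb") as file_:
        utf8_str = file_.read()  # raw UTF-8 bytes, not yet parsed as JSON
    for json_doc in json_iterate(utf8_str):
        for json_paragraph in json_to_annotations(json_doc):
            ...  # one annotation dict per paragraph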
@@ -169,19 +171,15 @@ def json_to_annotations(doc):
     )
     yield example
 
-def json_iterate(loc):
+def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
     # It's okay to read in the whole file -- just don't parse it into JSON.
-    cdef bytes py_raw
-    loc = util.ensure_path(loc)
-    with loc.open("rb") as file_:
-        py_raw = file_.read()
-    cdef long file_length = len(py_raw)
+    cdef long file_length = len(utf8_str)
     if file_length > 2 ** 30:
         warnings.warn(Warnings.W027.format(size=file_length))
 
-    raw = <char*>py_raw
+    raw = <char*>utf8_str
     cdef int square_depth = 0
     cdef int curly_depth = 0
     cdef int inside_string = 0
@@ -218,10 +216,6 @@ def json_iterate(loc):
         elif c == close_curly:
             curly_depth -= 1
         if square_depth == 1 and curly_depth == 0:
-            py_str = py_raw[start : i + 1].decode("utf8")
-            try:
-                yield srsly.json_loads(py_str)
-            except Exception:
-                print(py_str)
-                raise
+            substr = utf8_str[start : i + 1].decode("utf8")
+            yield srsly.json_loads(substr)
             start = -1
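The two hunks above only show fragments of the scanner, so here is a self-contained pure-Python sketch of the same brace-counting technique. It is an approximation: the real Cython code also tracks inside_string and escape state so that brackets inside JSON strings are ignored.

    import srsly

    def iter_top_level_objects(utf8_str: bytes):
        # Scan a top-level JSON array and yield each top-level object,
        # parsing one document at a time to limit memory usage.
        square_depth = 0
        curly_depth = 0
        start = -1
        for i, c in enumerate(utf8_str):  # iterating bytes yields ints
            if c == ord("["):
                square_depth += 1
            elif c == ord("]"):
                square_depth -= 1
            elif c == ord("{"):
                if square_depth == 1 and curly_depth == 0:
                    start = i
                curly_depth += 1
            elif c == ord("}"):
                curly_depth -= 1
                if square_depth == 1 and curly_depth == 0:
                    substr = utf8_str[start : i + 1].decode("utf8")
                    yield srsly.json_loads(substr)
                    start = -1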
@@ -10,7 +10,7 @@ def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs(json_data)
+        docs = json2docs([json_data])
         data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
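For context, a sketch of how the bytes this test writes could be read back, assuming the same nlp, docs, and output_file as above; DocBin.from_bytes and DocBin.get_docs are spaCy's standard round-trip API:

    from spacy.tokens import DocBin

    with output_file.open("rb") as file_:
        doc_bin = DocBin().from_bytes(file_.read())
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    assert len(reloaded_docs) == len(docs)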