Fix json2docs converter

Matthew Honnibal 2020-06-23 13:19:26 +02:00
parent 7376518af2
commit 8722b65bce
3 changed files with 24 additions and 35 deletions

View File

@@ -1,26 +1,21 @@
-import tempfile
-import contextlib
-import shutil
-from pathlib import Path
-from ..gold_io import json_to_annotations
+import srsly
+from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
 
-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
+    if not isinstance(input_data, bytes):
+        if not isinstance(input_data, str):
+            input_data = srsly.json_dumps(input_data)
+        input_data = input_data.encode("utf8")
     docs = []
-    for json_annot in json_to_annotations(input_data):
-        example_dict = _fix_legacy_dict_data(json_annot)
-        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-        docs.append(doc)
+    for json_doc in json_iterate(input_data):
+        for json_para in json_to_annotations(json_doc):
+            example_dict = _fix_legacy_dict_data(json_para)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            docs.append(doc)
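
A note on the converter change: json2docs no longer assumes pre-parsed annotations. It accepts a parsed Python object, a str, or raw bytes, and normalizes everything to UTF-8 bytes before handing it to json_iterate. A minimal sketch of the expected call patterns, using a hypothetical json_data payload in the legacy JSON training format (the import path is assumed from the relative imports above):

    import srsly
    from spacy.gold.converters import json2docs  # assumed location

    # Hypothetical payload: a JSON array of documents, each containing
    # paragraphs of tokenized sentences.
    json_data = [{
        "id": 0,
        "paragraphs": [{
            "raw": "Hello world",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "Hello", "tag": "UH", "ner": "O"},
                    {"id": 1, "orth": "world", "tag": "NN", "ner": "O"},
                ],
            }],
        }],
    }]

    # All three input types should now yield the same Doc objects.
    docs = json2docs(json_data)                                    # parsed object
    docs = json2docs(srsly.json_dumps(json_data))                  # str
    docs = json2docs(srsly.json_dumps(json_data).encode("utf8"))   # bytes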

View File

@@ -75,11 +75,13 @@ def read_json_file(loc, docs_filter=None, limit=None):
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
     else:
-        for doc in json_iterate(loc):
-            if docs_filter is not None and not docs_filter(doc):
+        with loc.open("rb") as file_:
+            utf8_str = file_.read()
+        for json_doc in json_iterate(utf8_str):
+            if docs_filter is not None and not docs_filter(json_doc):
                 continue
-            for json_data in json_to_annotations(doc):
-                yield json_data
+            for json_paragraph in json_to_annotations(json_doc):
+                yield json_paragraph
 
 
 def json_to_annotations(doc):
@@ -169,19 +171,15 @@ def json_to_annotations(doc):
     )
     yield example
 
 
-def json_iterate(loc):
+def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
     # It's okay to read in the whole file -- just don't parse it into JSON.
-    cdef bytes py_raw
-    loc = util.ensure_path(loc)
-    with loc.open("rb") as file_:
-        py_raw = file_.read()
-    cdef long file_length = len(py_raw)
+    cdef long file_length = len(utf8_str)
     if file_length > 2 ** 30:
         warnings.warn(Warnings.W027.format(size=file_length))
-    raw = <char*>py_raw
+    raw = <char*>utf8_str
     cdef int square_depth = 0
     cdef int curly_depth = 0
     cdef int inside_string = 0
@@ -218,10 +216,6 @@ def json_iterate(loc):
         elif c == close_curly:
             curly_depth -= 1
         if square_depth == 1 and curly_depth == 0:
-            py_str = py_raw[start : i + 1].decode("utf8")
-            try:
-                yield srsly.json_loads(py_str)
-            except Exception:
-                print(py_str)
-                raise
+            substr = utf8_str[start : i + 1].decode("utf8")
+            yield srsly.json_loads(substr)
             start = -1
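
A note on json_iterate: it now takes the raw UTF-8 bytes of the file instead of a path, so file reading moves to the callers, and the debugging try/except around parsing is dropped. The scanning strategy itself is unchanged: track square and curly bracket depth (plus string state) character by character, and parse each top-level object of the array on its own, so a large training file never has to be parsed as one giant JSON value. A rough pure-Python equivalent of that loop, simplified from the Cython char* version above (the function name is illustrative):

    import srsly

    def iter_top_level_objects(json_text):
        # Yield each top-level object of a JSON array, parsed one at a
        # time, without parsing the whole array at once.
        square_depth = 0
        curly_depth = 0
        inside_string = False
        escaped = False
        start = -1
        for i, c in enumerate(json_text):
            if inside_string:
                # Skip string contents, honoring backslash escapes so a
                # quote or brace inside a string doesn't confuse the scan.
                if escaped:
                    escaped = False
                elif c == "\\":
                    escaped = True
                elif c == '"':
                    inside_string = False
                continue
            if c == '"':
                inside_string = True
            elif c == "[":
                square_depth += 1
            elif c == "]":
                square_depth -= 1
            elif c == "{":
                if square_depth == 1 and curly_depth == 0:
                    start = i  # first character of a top-level object
                curly_depth += 1
            elif c == "}":
                curly_depth -= 1
                if square_depth == 1 and curly_depth == 0:
                    yield srsly.json_loads(json_text[start : i + 1])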

View File

@@ -10,7 +10,7 @@ def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs(json_data)
+        docs = json2docs([json_data])
         data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
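
A note on the test change: the old json2docs passed its input straight to json_to_annotations, which took a single document dict. The input now goes through json_iterate, which scans a top-level JSON array and yields one object per document, so the single json_data document has to be wrapped in a one-element list to match the on-disk shape of the legacy JSON training files.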