Fix json2docs converter
This commit is contained in:
parent 7376518af2
commit 8722b65bce
@@ -1,26 +1,21 @@
-import tempfile
-import contextlib
-import shutil
-from pathlib import Path
-from ..gold_io import json_to_annotations
+import srsly
+from ..gold_io import json_iterate, json_to_annotations
 from ..example import annotations2doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage


-@contextlib.contextmanager
-def make_tempdir():
-    d = Path(tempfile.mkdtemp())
-    yield d
-    shutil.rmtree(str(d))
-
-
 def json2docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
+    if not isinstance(input_data, bytes):
+        if not isinstance(input_data, str):
+            input_data = srsly.json_dumps(input_data)
+        input_data = input_data.encode("utf8")
     docs = []
-    for json_annot in json_to_annotations(input_data):
-        example_dict = _fix_legacy_dict_data(json_annot)
-        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
-        docs.append(doc)
+    for json_doc in json_iterate(input_data):
+        for json_para in json_to_annotations(json_doc):
+            example_dict = _fix_legacy_dict_data(json_para)
+            tok_dict, doc_dict = _parse_example_dict_data(example_dict)
+            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            docs.append(doc)
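The key change in this hunk is that json2docs now accepts a Python object, a JSON string, or UTF-8 bytes, and normalizes all three to bytes before iterating documents with json_iterate. A minimal standalone sketch of that normalization step (the helper name normalize_input is hypothetical, not part of the commit):

import srsly

def normalize_input(input_data):
    # Hypothetical helper mirroring json2docs's normalization: a Python
    # object is serialized to JSON, and any string is encoded to UTF-8
    # bytes, so json_iterate always receives a bytes object.
    if not isinstance(input_data, bytes):
        if not isinstance(input_data, str):
            input_data = srsly.json_dumps(input_data)
        input_data = input_data.encode("utf8")
    return input_data

assert isinstance(normalize_input([{"id": 0}]), bytes)
assert isinstance(normalize_input('[{"id": 0}]'), bytes)
assert isinstance(normalize_input(b'[{"id": 0}]'), bytes)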
@@ -75,11 +75,13 @@ def read_json_file(loc, docs_filter=None, limit=None):
         for filename in loc.iterdir():
             yield from read_json_file(loc / filename, limit=limit)
     else:
-        for doc in json_iterate(loc):
-            if docs_filter is not None and not docs_filter(doc):
+        with loc.open("rb") as file_:
+            utf8_str = file_.read()
+        for json_doc in json_iterate(utf8_str):
+            if docs_filter is not None and not docs_filter(json_doc):
                 continue
-            for json_data in json_to_annotations(doc):
-                yield json_data
+            for json_paragraph in json_to_annotations(json_doc):
+                yield json_paragraph


 def json_to_annotations(doc):
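After this hunk, read_json_file owns the file I/O and json_iterate only ever sees in-memory bytes, which keeps the parser testable without touching the filesystem. A hedged pure-Python sketch of that I/O pattern (iter_json_bytes is an illustrative name, not part of the commit):

from pathlib import Path

def iter_json_bytes(loc):
    # Hypothetical analogue of the updated read_json_file I/O: recurse
    # into directories, otherwise read the raw bytes once and yield them
    # for a parser such as json_iterate to consume.
    loc = Path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            yield from iter_json_bytes(filename)
    else:
        with loc.open("rb") as file_:
            yield file_.read()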
@@ -169,19 +171,15 @@ def json_to_annotations(doc):
         )
         yield example

-def json_iterate(loc):
+def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
     # It's okay to read in the whole file -- just don't parse it into JSON.
-    cdef bytes py_raw
-    loc = util.ensure_path(loc)
-    with loc.open("rb") as file_:
-        py_raw = file_.read()
-    cdef long file_length = len(py_raw)
+    cdef long file_length = len(utf8_str)
     if file_length > 2 ** 30:
         warnings.warn(Warnings.W027.format(size=file_length))

-    raw = <char*>py_raw
+    raw = <char*>utf8_str
     cdef int square_depth = 0
     cdef int curly_depth = 0
     cdef int inside_string = 0
@@ -218,10 +216,6 @@ def json_iterate(loc):
         elif c == close_curly:
             curly_depth -= 1
             if square_depth == 1 and curly_depth == 0:
-                py_str = py_raw[start : i + 1].decode("utf8")
-                try:
-                    yield srsly.json_loads(py_str)
-                except Exception:
-                    print(py_str)
-                    raise
+                substr = utf8_str[start : i + 1].decode("utf8")
+                yield srsly.json_loads(substr)
                 start = -1
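Per the two hunks above, json_iterate now takes the raw UTF-8 bytes directly and yields each top-level object of the JSON array by counting bracket depth, rather than parsing the whole file at once. A pure-Python sketch of that technique, using stdlib json in place of srsly.json_loads (iter_json_docs is an illustrative name; the real Cython version operates on a raw char pointer for speed):

import json

def iter_json_docs(utf8_bytes):
    # Walk the raw bytes, track [ ] and { } nesting plus whether we are
    # inside a string, and decode/parse one top-level object at a time.
    square_depth = 0
    curly_depth = 0
    inside_string = False
    escaped = False
    start = -1
    for i, c in enumerate(utf8_bytes):
        ch = chr(c)
        if escaped:
            escaped = False
        elif inside_string:
            if ch == "\\":
                escaped = True
            elif ch == '"':
                inside_string = False
        elif ch == '"':
            inside_string = True
        elif ch == "[":
            square_depth += 1
        elif ch == "]":
            square_depth -= 1
        elif ch == "{":
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif ch == "}":
            curly_depth -= 1
            if square_depth == 1 and curly_depth == 0:
                yield json.loads(utf8_bytes[start : i + 1].decode("utf8"))
                start = -1

docs = list(iter_json_docs(b'[{"id": 0}, {"id": 1, "text": "a [b] c"}]'))
assert [d["id"] for d in docs] == [0, 1]

Note how the inside_string flag keeps brackets that appear inside string values (like "a [b] c" above) from corrupting the depth counters.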
@@ -10,7 +10,7 @@ def test_issue4402():
     nlp = English()
     with make_tempdir() as tmpdir:
         output_file = tmpdir / "test4402.spacy"
-        docs = json2docs(json_data)
+        docs = json2docs([json_data])
         data = DocBin(docs=docs, attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
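The test fix wraps json_data in a list because json_iterate expects a JSON array of documents, not a bare object. To sanity-check the bytes the test writes, a round-trip along these lines should work (a hedged sketch using the spaCy v3-era DocBin API, not code from this commit):

from spacy.lang.en import English
from spacy.tokens import DocBin

nlp = English()
# Round-trip sketch: serialize docs to bytes, load them back, compare.
doc_bin = DocBin(attrs=["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"])
doc_bin.add(nlp("Hello world"))
data = doc_bin.to_bytes()

loaded = DocBin().from_bytes(data)
docs = list(loaded.get_docs(nlp.vocab))
assert [t.text for t in docs[0]] == ["Hello", "world"]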