mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-06 23:36:33 +03:00
a902b5f217
* Tell convert CLI to store user data for Doc * Remove assert * Add has_unknown_spaces flag on Doc * Do not tokenize docs with unknown spaces in Corpus * Handle conversion of unknown spaces in Example * Fixes * Fixes * Draft has_known_spaces support in DocBin * Add test for serialize has_unknown_spaces * Fix DocBin serialization when has_unknown_spaces * Use serialization in test
23 lines
911 B
Python
23 lines
911 B
Python
import srsly
|
|
from ..gold_io import json_iterate, json_to_annotations
|
|
from ..example import annotations2doc
|
|
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
|
|
from ...util import load_model
|
|
from ...lang.xx import MultiLanguage
|
|
|
|
|
|
def json2docs(input_data, model=None, **kwargs):
    """Build a list of docs from JSON input.

    input_data: The JSON payload. Bytes are used as-is, a str is encoded
        as UTF-8, and any other object is first serialized with
        srsly.json_dumps and then encoded as UTF-8.
    model: Passed to load_model when not None; otherwise a fresh
        MultiLanguage instance is created. Only its vocab is used here.
    **kwargs: Accepted for call compatibility; not used in this function.
    RETURNS (list): One annotations2doc result (presumably a Doc -- confirm
        against annotations2doc) per item yielded by json_to_annotations,
        in iteration order.
    """
    if model is None:
        nlp = MultiLanguage()
    else:
        nlp = load_model(model)

    # Normalize the payload to UTF-8 bytes before handing it to json_iterate.
    raw = input_data
    if isinstance(raw, bytes):
        pass
    elif isinstance(raw, str):
        raw = raw.encode("utf8")
    else:
        raw = srsly.json_dumps(raw).encode("utf8")

    # Lazily flatten documents into their annotation items so the
    # conversion loop below stays a single level deep.
    paragraphs = (
        para
        for json_doc in json_iterate(raw)
        for para in json_to_annotations(json_doc)
    )

    docs = []
    for para in paragraphs:
        fixed = _fix_legacy_dict_data(para)
        token_annots, doc_annots = _parse_example_dict_data(fixed)
        docs.append(annotations2doc(nlp.vocab, token_annots, doc_annots))
    return docs
|