mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	* Tell convert CLI to store user data for Doc * Remove assert * Add has_unknown_spaces flag on Doc * Do not tokenize docs with unknown spaces in Corpus * Handle conversion of unknown spaces in Example * Fixes * Fixes * Draft has_known_spaces support in DocBin * Add test for serialize has_unknown_spaces * Fix DocBin serialization when has_unknown_spaces * Use serialization in test
		
			
				
	
	
		
			23 lines
		
	
	
		
			911 B
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			23 lines
		
	
	
		
			911 B
		
	
	
	
		
			Python
		
	
	
	
	
	
import srsly
 | 
						|
from ..gold_io import json_iterate, json_to_annotations
 | 
						|
from ..example import annotations2doc
 | 
						|
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 | 
						|
from ...util import load_model
 | 
						|
from ...lang.xx import MultiLanguage
 | 
						|
 | 
						|
 | 
						|
def json2docs(input_data, model=None, **kwargs):
    """Convert JSON training data into a list of converted documents.

    input_data: Raw JSON as ``bytes``, a JSON ``str``, or any other object,
        which is serialized with ``srsly.json_dumps`` first.
    model: Optional model name/path handed to ``load_model``. When ``None``,
        a blank ``MultiLanguage`` pipeline supplies the vocab instead.
    **kwargs: Accepted for call compatibility; not used by this function.
    RETURNS (list): One object per paragraph yielded by
        ``json_to_annotations``, as built by ``annotations2doc``
        (presumably ``Doc`` instances -- confirm against ``example``).
    """
    if model is None:
        pipeline = MultiLanguage()
    else:
        pipeline = load_model(model)

    # json_iterate is fed UTF-8 bytes, so normalise every accepted input
    # shape down to bytes before parsing.
    payload = input_data
    if isinstance(payload, bytes):
        pass
    elif isinstance(payload, str):
        payload = payload.encode("utf8")
    else:
        payload = srsly.json_dumps(payload).encode("utf8")

    converted = []
    for json_doc in json_iterate(payload):
        for paragraph in json_to_annotations(json_doc):
            # Legacy keys are fixed up first, then the dict is split into
            # its token-level and doc-level annotation parts.
            token_annots, doc_annots = _parse_example_dict_data(
                _fix_legacy_dict_data(paragraph)
            )
            converted.append(
                annotations2doc(pipeline.vocab, token_annots, doc_annots)
            )
    return converted
 |