mirror of https://github.com/explosion/spaCy.git
Make docs_to_json backwards-compatible with v2 (#5714)

* In `spacy convert -t json`, output the JSON docs wrapped in a list
* Add back token-level `ner` alongside the doc-level `entities`
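
For reference, the restored v2-style output of `spacy convert -t json` looks roughly like the sketch below. It is hand-written from the keys visible in the diff that follows; the token-level `id`/`orth` fields and the doc-level `id` come from parts of `docs_to_json` not shown in the hunks, so treat the exact shape as illustrative rather than verbatim output.

[
    {
        "id": 0,
        "paragraphs": [
            {
                "raw": "London is big",
                "cats": [],
                "entities": [[0, 6, "GPE"]],
                "links": [],
                "sentences": [
                    {
                        "tokens": [
                            {"id": 0, "orth": "London", "ner": "U-GPE"},
                            {"id": 1, "orth": "is", "ner": "O"},
                            {"id": 2, "orth": "big", "ner": "O"}
                        ],
                        "brackets": []
                    }
                ]
            }
        ]
    }
]

Note the two compatibility points from the commit message: the whole document sits inside a top-level list, and every token carries a BILUO `ner` tag in addition to the paragraph-level `entities` offsets.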
parent 412dbb1f38
commit c67fc6aa5b
@@ -135,7 +135,7 @@ def convert(
 
 def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
-        srsly.write_json("-", docs_to_json(docs))
+        srsly.write_json("-", [docs_to_json(docs)])
     else:
         sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
 
@@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if not output_file.parent.exists():
         output_file.parent.mkdir(parents=True)
     if output_type == "json":
-        srsly.write_json(output_file, docs_to_json(docs))
+        srsly.write_json(output_file, [docs_to_json(docs)])
     else:
         data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
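
As a usage sketch of the calls above, assuming the v2.x-era import path `spacy.gold.docs_to_json` and a pipeline that sets sentence boundaries (the example text and pipeline setup are illustrative, not from the commit):

import spacy
import srsly
from spacy.gold import docs_to_json  # v2.x-era import path (assumption)

nlp = spacy.blank("en")
# docs_to_json iterates doc.sents, so sentence boundaries must be set
nlp.add_pipe(nlp.create_pipe("sentencizer"))
docs = [nlp("London is big.")]

# The [ ... ] wrapper is the fix: v2's JSON training format expects a
# list of documents, so the single dict from docs_to_json() is wrapped.
srsly.write_json("-", [docs_to_json(docs)])  # "-" prints to stdout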
@@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
         for cat, val in doc.cats.items():
             json_cat = {"label": cat, "value": val}
             json_para["cats"].append(json_cat)
+        # warning: entities information is currently duplicated as
+        # doc-level "entities" and token-level "ner"
         for ent in doc.ents:
             ent_tuple = (ent.start_char, ent.end_char, ent.label_)
             json_para["entities"].append(ent_tuple)
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
+        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:

@@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                 if doc.is_parsed:
                     json_token["head"] = token.head.i-token.i
                     json_token["dep"] = token.dep_
+                json_token["ner"] = biluo_tags[token.i]
                 json_sent["tokens"].append(json_token)
             json_para["sentences"].append(json_sent)
         json_doc["paragraphs"].append(json_para)
|  |  | |||
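
The restored token-level `ner` values come from `biluo_tags_from_offsets`, which maps character-offset entities onto per-token BILUO tags; the hunk above calls it with `json_para["entities"]`, the same offsets written at the paragraph level. A minimal illustration, again assuming the v2.x import path:

import spacy
from spacy.gold import biluo_tags_from_offsets  # v2.x-era import path (assumption)

nlp = spacy.blank("en")
doc = nlp("London is big")
# (start_char, end_char, label) offsets, as stored in json_para["entities"]
entities = [(0, 6, "GPE")]
print(biluo_tags_from_offsets(doc, entities, missing="O"))
# ['U-GPE', 'O', 'O']  (a single-token entity gets a U- tag; the rest get the missing tag)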