Make docs_to_json backwards-compatible with v2 (#5714)

* In `spacy convert -t json`, output the JSON docs wrapped in a list

* Add back token-level `ner` alongside the doc-level `entities`
This commit is contained in:
Adriane Boyd 2020-07-06 14:15:00 +02:00 committed by GitHub
parent 412dbb1f38
commit c67fc6aa5b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 4 deletions

View File

@@ -135,7 +135,7 @@ def convert(
def _print_docs_to_stdout(docs, output_type): def _print_docs_to_stdout(docs, output_type):
if output_type == "json": if output_type == "json":
srsly.write_json("-", docs_to_json(docs)) srsly.write_json("-", [docs_to_json(docs)])
else: else:
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
@@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type):
if not output_file.parent.exists(): if not output_file.parent.exists():
output_file.parent.mkdir(parents=True) output_file.parent.mkdir(parents=True)
if output_type == "json": if output_type == "json":
srsly.write_json(output_file, docs_to_json(docs)) srsly.write_json(output_file, [docs_to_json(docs)])
else: else:
data = DocBin(docs=docs, store_user_data=True).to_bytes() data = DocBin(docs=docs, store_user_data=True).to_bytes()
with output_file.open("wb") as file_: with output_file.open("wb") as file_:

View File

@@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
for cat, val in doc.cats.items(): for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val} json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat) json_para["cats"].append(json_cat)
# warning: entities information is currently duplicated as
# doc-level "entities" and token-level "ner"
for ent in doc.ents: for ent in doc.ents:
ent_tuple = (ent.start_char, ent.end_char, ent.label_) ent_tuple = (ent.start_char, ent.end_char, ent.label_)
json_para["entities"].append(ent_tuple) json_para["entities"].append(ent_tuple)
if ent.kb_id_: if ent.kb_id_:
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}} link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
json_para["links"].append(link_dict) json_para["links"].append(link_dict)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
json_sent = {"tokens": [], "brackets": []} json_sent = {"tokens": [], "brackets": []}
for token in sent: for token in sent:
@@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if doc.is_parsed: if doc.is_parsed:
json_token["head"] = token.head.i-token.i json_token["head"] = token.head.i-token.i
json_token["dep"] = token.dep_ json_token["dep"] = token.dep_
json_token["ner"] = biluo_tags[token.i]
json_sent["tokens"].append(json_token) json_sent["tokens"].append(json_token)
json_para["sentences"].append(json_sent) json_para["sentences"].append(json_sent)
json_doc["paragraphs"].append(json_para) json_doc["paragraphs"].append(json_para)