Make docs_to_json backwards-compatible with v2 (#5714)

* In `spacy convert -t json` output the JSON docs wrapped in a list
* Add back token-level `ner` alongside the doc-level `entities`
2025-12-24 02:23:19 +03:00 · 2020-07-06 14:15:00 +02:00 · 2020-07-06 14:15:00 +02:00 · c67fc6aa5b
commit c67fc6aa5b
parent 412dbb1f38
2 changed files with 6 additions and 4 deletions
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -135,7 +135,7 @@ def convert(

 def _print_docs_to_stdout(docs, output_type):
    if output_type == "json":
-        srsly.write_json("-", docs_to_json(docs))
+        srsly.write_json("-", [docs_to_json(docs)])
    else:
        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())

@@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type):
    if not output_file.parent.exists():
        output_file.parent.mkdir(parents=True)
    if output_type == "json":
-        srsly.write_json(output_file, docs_to_json(docs))
+        srsly.write_json(output_file, [docs_to_json(docs)])
    else:
        data = DocBin(docs=docs, store_user_data=True).to_bytes()
        with output_file.open("wb") as file_:
--- a/spacy/gold/gold_io.pyx
+++ b/spacy/gold/gold_io.pyx
@@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
        for cat, val in doc.cats.items():
            json_cat = {"label": cat, "value": val}
            json_para["cats"].append(json_cat)
+        # warning: entities information is currently duplicated as
+        # doc-level "entities" and token-level "ner"
        for ent in doc.ents:
            ent_tuple = (ent.start_char, ent.end_char, ent.label_)
            json_para["entities"].append(ent_tuple)
            if ent.kb_id_:
                link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                json_para["links"].append(link_dict)
-        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
-        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
+        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
        for j, sent in enumerate(doc.sents):
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
@@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
                if doc.is_parsed:
                    json_token["head"] = token.head.i-token.i
                    json_token["dep"] = token.dep_
+                json_token["ner"] = biluo_tags[token.i]
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)