mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
Make `docs_to_json` backwards-compatible with v2 (#5714)
* In `spacy convert -t json`, output the JSON docs wrapped in a list
* Add back token-level `ner` alongside the doc-level `entities`
This commit is contained in:
parent
412dbb1f38
commit
c67fc6aa5b
|
@ -135,7 +135,7 @@ def convert(
|
||||||
|
|
||||||
def _print_docs_to_stdout(docs, output_type):
|
def _print_docs_to_stdout(docs, output_type):
|
||||||
if output_type == "json":
|
if output_type == "json":
|
||||||
srsly.write_json("-", docs_to_json(docs))
|
srsly.write_json("-", [docs_to_json(docs)])
|
||||||
else:
|
else:
|
||||||
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
|
sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ def _write_docs_to_file(docs, output_file, output_type):
|
||||||
if not output_file.parent.exists():
|
if not output_file.parent.exists():
|
||||||
output_file.parent.mkdir(parents=True)
|
output_file.parent.mkdir(parents=True)
|
||||||
if output_type == "json":
|
if output_type == "json":
|
||||||
srsly.write_json(output_file, docs_to_json(docs))
|
srsly.write_json(output_file, [docs_to_json(docs)])
|
||||||
else:
|
else:
|
||||||
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
data = DocBin(docs=docs, store_user_data=True).to_bytes()
|
||||||
with output_file.open("wb") as file_:
|
with output_file.open("wb") as file_:
|
||||||
|
|
|
@ -24,14 +24,15 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
for cat, val in doc.cats.items():
|
for cat, val in doc.cats.items():
|
||||||
json_cat = {"label": cat, "value": val}
|
json_cat = {"label": cat, "value": val}
|
||||||
json_para["cats"].append(json_cat)
|
json_para["cats"].append(json_cat)
|
||||||
|
# warning: entities information is currently duplicated as
|
||||||
|
# doc-level "entities" and token-level "ner"
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
ent_tuple = (ent.start_char, ent.end_char, ent.label_)
|
ent_tuple = (ent.start_char, ent.end_char, ent.label_)
|
||||||
json_para["entities"].append(ent_tuple)
|
json_para["entities"].append(ent_tuple)
|
||||||
if ent.kb_id_:
|
if ent.kb_id_:
|
||||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||||
json_para["links"].append(link_dict)
|
json_para["links"].append(link_dict)
|
||||||
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
|
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
|
||||||
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
|
|
||||||
for j, sent in enumerate(doc.sents):
|
for j, sent in enumerate(doc.sents):
|
||||||
json_sent = {"tokens": [], "brackets": []}
|
json_sent = {"tokens": [], "brackets": []}
|
||||||
for token in sent:
|
for token in sent:
|
||||||
|
@ -44,6 +45,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
||||||
if doc.is_parsed:
|
if doc.is_parsed:
|
||||||
json_token["head"] = token.head.i-token.i
|
json_token["head"] = token.head.i-token.i
|
||||||
json_token["dep"] = token.dep_
|
json_token["dep"] = token.dep_
|
||||||
|
json_token["ner"] = biluo_tags[token.i]
|
||||||
json_sent["tokens"].append(json_token)
|
json_sent["tokens"].append(json_token)
|
||||||
json_para["sentences"].append(json_sent)
|
json_para["sentences"].append(json_sent)
|
||||||
json_doc["paragraphs"].append(json_para)
|
json_doc["paragraphs"].append(json_para)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user