Add spans to doc.to_json (#10073)

* Add spans to to_json

* adjustments to_json

* Change docstring

* change doc key naming

* Update spacy/tokens/doc.pyx

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
Edward 2022-03-14 15:47:57 +01:00 committed by GitHub
parent 23bc93d3d2
commit b68bf43f5b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 2 deletions

View File

@ -1,5 +1,5 @@
import pytest
from spacy.tokens import Doc
from spacy.tokens import Doc, Span
@pytest.fixture()
@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc):
Doc.set_extension("json_test4", method=lambda doc: doc.text)
with pytest.raises(ValueError):
doc.to_json(underscore=["json_test4"])
def test_doc_to_json_span(doc):
    """Check that span groups stored on the doc are emitted by Doc.to_json()."""
    # Register a single span group named "test" holding two spans that both
    # begin at the first token.
    group = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")]
    doc.spans["test"] = group
    serialized = doc.to_json()
    # The top-level "spans" key must exist and hold exactly the one group.
    assert "spans" in serialized
    spans_out = serialized["spans"]
    assert len(spans_out) == 1
    # Both spans of the group are serialized; the first one reports start 0.
    assert len(spans_out["test"]) == 2
    assert spans_out["test"][0]["start"] == 0

View File

@ -1457,7 +1457,7 @@ cdef class Doc:
underscore (list): Optional list of string names of custom doc._.
attributes. Attribute values need to be JSON-serializable. Values will
be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
RETURNS (dict): The data in spaCy's JSON format.
RETURNS (dict): The data in JSON format.
"""
data = {"text": self.text}
if self.has_annotation("ENT_IOB"):
@ -1486,6 +1486,15 @@ cdef class Doc:
token_data["dep"] = token.dep_
token_data["head"] = token.head.i
data["tokens"].append(token_data)
if self.spans:
data["spans"] = {}
for span_group in self.spans:
data["spans"][span_group] = []
for span in self.spans[span_group]:
span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
data["spans"][span_group].append(span_data)
if underscore:
data["_"] = {}
for attr in underscore: