From b68bf43f5bf07b78c062777f35240f031374fe00 Mon Sep 17 00:00:00 2001
From: Edward <43848523+thomashacker@users.noreply.github.com>
Date: Mon, 14 Mar 2022 15:47:57 +0100
Subject: [PATCH] Add spans to doc.to_json (#10073)

* Add spans to to_json

* adjustments to_json

* Change docstring

* change doc key naming

* Update spacy/tokens/doc.pyx

Co-authored-by: Adriane Boyd

Co-authored-by: Adriane Boyd
---
 spacy/tests/doc/test_to_json.py | 12 +++++++++++-
 spacy/tokens/doc.pyx            | 11 ++++++++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py
index 9ebee6c88..202281654 100644
--- a/spacy/tests/doc/test_to_json.py
+++ b/spacy/tests/doc/test_to_json.py
@@ -1,5 +1,5 @@
 import pytest
-from spacy.tokens import Doc
+from spacy.tokens import Doc, Span
 
 
 @pytest.fixture()
@@ -60,3 +60,13 @@ def test_doc_to_json_underscore_error_serialize(doc):
     Doc.set_extension("json_test4", method=lambda doc: doc.text)
     with pytest.raises(ValueError):
         doc.to_json(underscore=["json_test4"])
+
+
+def test_doc_to_json_span(doc):
+    """Test that Doc.to_json() includes spans"""
+    doc.spans["test"] = [Span(doc, 0, 2, "test"), Span(doc, 0, 1, "test")]
+    json_doc = doc.to_json()
+    assert "spans" in json_doc
+    assert len(json_doc["spans"]) == 1
+    assert len(json_doc["spans"]["test"]) == 2
+    assert json_doc["spans"]["test"][0]["start"] == 0
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d33764ac9..1a48705fd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1457,7 +1457,7 @@ cdef class Doc:
         underscore (list): Optional list of string names of custom doc._.
         attributes. Attribute values need to be JSON-serializable. Values will
         be added to an "_" key in the data, e.g. "_": {"foo": "bar"}.
-        RETURNS (dict): The data in spaCy's JSON format.
+        RETURNS (dict): The data in JSON format.
         """
         data = {"text": self.text}
         if self.has_annotation("ENT_IOB"):
@@ -1486,6 +1486,15 @@ cdef class Doc:
                 token_data["dep"] = token.dep_
                 token_data["head"] = token.head.i
             data["tokens"].append(token_data)
+
+        if self.spans:
+            data["spans"] = {}
+            for span_group in self.spans:
+                data["spans"][span_group] = []
+                for span in self.spans[span_group]:
+                    span_data = {"start": span.start_char, "end": span.end_char, "label": span.label_, "kb_id": span.kb_id_}
+                    data["spans"][span_group].append(span_data)
+
         if underscore:
             data["_"] = {}
             for attr in underscore:
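
Usage sketch (not part of the patch): a minimal example of what the new "spans" serialization could look like with this change applied. The blank pipeline, example text, and the "places" span group key are illustrative assumptions; only the "start"/"end"/"label"/"kb_id" keys and the character-offset convention come from the diff above.

    import spacy
    from spacy.tokens import Span

    # a blank English pipeline is enough; no trained components are required
    nlp = spacy.blank("en")
    doc = nlp("Berlin is a city in Germany")

    # hypothetical span group key, mirroring the new test case in the patch
    doc.spans["places"] = [Span(doc, 0, 1, label="LOC")]

    data = doc.to_json()
    # with this patch applied, each span group is serialized under "spans"
    # using character offsets, roughly:
    # {"places": [{"start": 0, "end": 6, "label": "LOC", "kb_id": ""}]}
    print(data["spans"])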