adding spans to doc_annotation in Example.to_dict (#11261)

* adding spans to doc_annotation in Example.to_dict
* to_dict compatible with from_dict: tuples instead of spans
* use strings for label and kb_id
* Simplify test
* Update data formats docs

Co-authored-by: Stefanie Wolf <stefanie.wolf@vitecsoftware.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
2025-08-10 15:14:56 +03:00 · 2022-08-05 12:26:38 +02:00 · 2022-08-05 12:26:38 +02:00 · 23749cfc91
commit 23749cfc91
parent b07708d5d0
3 changed files with 55 additions and 2 deletions
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):

    example = Example(predicted, reference)
    assert example.get_aligned("TAG", as_string=True) == tags
+
+
+@pytest.mark.issue("11260")
+def test_issue11260():
+    annots = {
+        "words": ["I", "like", "New", "York", "."],
+        "spans": {
+            "cities": [(7, 15, "LOC", "")],
+            "people": [(0, 1, "PERSON", "")],
+        },
+    }
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    assert len(example.reference.spans["cities"]) == 1
+    assert len(example.reference.spans["people"]) == 1
+
+    output_dict = example.to_dict()
+    assert "spans" in output_dict["doc_annotation"]
+    assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
+    assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]
+
+    output_example = Example.from_dict(predicted, output_dict)
+
+    assert len(output_example.reference.spans["cities"]) == len(
+        example.reference.spans["cities"]
+    )
+    assert len(output_example.reference.spans["people"]) == len(
+        example.reference.spans["people"]
+    )
+    for span in example.reference.spans["cities"]:
+        assert span.label_ == "LOC"
+        assert span.text == "New York"
+        assert span.start_char == 7
+    for span in example.reference.spans["people"]:
+        assert span.label_ == "PERSON"
+        assert span.text == "I"
+        assert span.start_char == 0
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@ -361,6 +361,7 @@ cdef class Example:
            "doc_annotation": {
                "cats": dict(self.reference.cats),
                "entities": doc_to_biluo_tags(self.reference),
+                "spans": self._spans_to_dict(),
                "links": self._links_to_dict()
            },
            "token_annotation": {
@ -376,6 +377,18 @@ cdef class Example:
            }
        }

+    def _spans_to_dict(self):
+        """Map each spans key to (start_char, end_char, label, kb_id) tuples."""
+        groups = self.reference.spans
+        return {
+            key: [
+                (span.start_char, span.end_char, span.label_, span.kb_id_)
+                for span in groups[key]
+            ]
+            for key in groups
+        }
+
+
    def _links_to_dict(self):
        links = {}
        for ent in self.reference.ents:
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
 >    "pos": List[str],
 >    "morphs": List[str],
 >    "sent_starts": List[Optional[bool]],
->    "deps": List[string],
+>    "deps": List[str],
 >    "heads": List[int],
 >    "entities": List[str],
 >    "entities": List[(int, int, str)],
 >    "cats": Dict[str, float],
 >    "links": Dict[(int, int), dict],
+>    "spans": Dict[str, List[Tuple]],
 > }
 > ```

@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
 | `deps`        | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~                                                                                  |
 | `heads`       | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~                                                                                  |
 | `entities`    | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~                                                     |
-| `entities`    | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~                                                                                                         |
+| `entities`    | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~                                                                                                 |
 | `cats`        | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~                                                                              |
 | `links`       | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
+| `spans`       | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]]~~                                           |

 <Infobox title="Notes and caveats">