adding spans to doc_annotation in Example.to_dict (#11261)

* adding spans to doc_annotation in Example.to_dict

* to_dict compatible with from_dict: tuples instead of spans

* use strings for label and kb_id

* Simplify test

* Update data formats docs

Co-authored-by: Stefanie Wolf <stefanie.wolf@vitecsoftware.com>
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
stefawolf 2022-08-05 12:26:38 +02:00 committed by GitHub
parent b07708d5d0
commit 23749cfc91
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 55 additions and 2 deletions

View File

@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):
example = Example(predicted, reference)
assert example.get_aligned("TAG", as_string=True) == tags
@pytest.mark.issue("11260")
def test_issue11260():
    """Check that span groups survive an Example.to_dict / from_dict round trip.

    Spans are given as (start_char, end_char, label, kb_id) tuples under
    named keys. The test verifies that:

    1. ``Example.from_dict`` creates one span per key on the reference doc,
    2. ``Example.to_dict`` emits the same tuples under
       ``doc_annotation -> spans``, and
    3. feeding that dict back into ``Example.from_dict`` yields spans with
       the same labels, texts and character offsets.
    """
    annots = {
        "words": ["I", "like", "New", "York", "."],
        "spans": {
            "cities": [(7, 15, "LOC", "")],
            "people": [(0, 1, "PERSON", "")],
        },
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(example.reference.spans["cities"]) == 1
    assert len(example.reference.spans["people"]) == 1

    # Serialized output must carry the spans in the same tuple format that
    # from_dict accepts.
    output_dict = example.to_dict()
    assert "spans" in output_dict["doc_annotation"]
    assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
    assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]

    # Round trip: rebuild an Example from the serialized dict.
    output_example = Example.from_dict(predicted, output_dict)
    assert len(output_example.reference.spans["cities"]) == len(
        example.reference.spans["cities"]
    )
    assert len(output_example.reference.spans["people"]) == len(
        example.reference.spans["people"]
    )

    # Check span contents on BOTH the original and the round-tripped example;
    # comparing only lengths would not catch a corrupted label or offset.
    for eg in (example, output_example):
        for span in eg.reference.spans["cities"]:
            assert span.label_ == "LOC"
            assert span.text == "New York"
            assert span.start_char == 7
            assert span.end_char == 15
        for span in eg.reference.spans["people"]:
            assert span.label_ == "PERSON"
            assert span.text == "I"
            assert span.start_char == 0
            assert span.end_char == 1

View File

@ -361,6 +361,7 @@ cdef class Example:
"doc_annotation": {
"cats": dict(self.reference.cats),
"entities": doc_to_biluo_tags(self.reference),
"spans": self._spans_to_dict(),
"links": self._links_to_dict()
},
"token_annotation": {
@ -376,6 +377,18 @@ cdef class Example:
}
}
def _spans_to_dict(self):
span_dict = {}
for key in self.reference.spans:
span_tuples = []
for span in self.reference.spans[key]:
span_tuple = (span.start_char, span.end_char, span.label_, span.kb_id_)
span_tuples.append(span_tuple)
span_dict[key] = span_tuples
return span_dict
def _links_to_dict(self):
links = {}
for ent in self.reference.ents:

View File

@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
> "pos": List[str],
> "morphs": List[str],
> "sent_starts": List[Optional[bool]],
> "deps": List[string],
> "deps": List[str],
> "heads": List[int],
> "entities": List[str],
> "entities": List[(int, int, str)],
> "cats": Dict[str, float],
> "links": Dict[(int, int), dict],
> "spans": Dict[str, List[Tuple]],
> }
> ```
@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]]~~ |
<Infobox title="Notes and caveats">