mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
adding spans to doc_annotation in Example.to_dict (#11261)
* adding spans to doc_annotation in Example.to_dict * to_dict compatible with from_dict: tuples instead of spans * use strings for label and kb_id * Simplify test * Update data formats docs Co-authored-by: Stefanie Wolf <stefanie.wolf@vitecsoftware.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
parent
b07708d5d0
commit
23749cfc91
|
@ -431,3 +431,41 @@ def test_Example_aligned_whitespace(en_vocab):
|
|||
|
||||
example = Example(predicted, reference)
|
||||
assert example.get_aligned("TAG", as_string=True) == tags
|
||||
|
||||
|
||||
@pytest.mark.issue("11260")
def test_issue11260():
    """Regression test: Example.to_dict() exports span groups.

    Builds an Example from a dict carrying two span groups, checks that
    to_dict() writes them under "doc_annotation" -> "spans" in the same
    (start_char, end_char, label, kb_id) tuple form that was passed in, and
    that feeding the exported dict back into Example.from_dict() yields
    equivalent spans.
    """
    annots = {
        "words": ["I", "like", "New", "York", "."],
        "spans": {
            "cities": [(7, 15, "LOC", "")],
            "people": [(0, 1, "PERSON", "")],
        },
    }
    vocab = Vocab()
    predicted = Doc(vocab, words=annots["words"])
    example = Example.from_dict(predicted, annots)
    assert len(example.reference.spans["cities"]) == 1
    assert len(example.reference.spans["people"]) == 1

    # The exported dict must contain the spans exactly as they were given.
    output_dict = example.to_dict()
    assert "spans" in output_dict["doc_annotation"]
    assert output_dict["doc_annotation"]["spans"]["cities"] == annots["spans"]["cities"]
    assert output_dict["doc_annotation"]["spans"]["people"] == annots["spans"]["people"]

    # Round trip: the exported dict must be accepted by from_dict again.
    output_example = Example.from_dict(predicted, output_dict)

    assert len(output_example.reference.spans["cities"]) == len(
        example.reference.spans["cities"]
    )
    assert len(output_example.reference.spans["people"]) == len(
        example.reference.spans["people"]
    )
    # Verify the span contents on the original AND on the round-tripped
    # example; comparing lengths alone would not catch a span that came back
    # with the wrong label or offsets.
    for checked in (example, output_example):
        for span in checked.reference.spans["cities"]:
            assert span.label_ == "LOC"
            assert span.text == "New York"
            assert span.start_char == 7
        for span in checked.reference.spans["people"]:
            assert span.label_ == "PERSON"
            assert span.text == "I"
            assert span.start_char == 0
|
||||
|
|
|
@ -361,6 +361,7 @@ cdef class Example:
|
|||
"doc_annotation": {
|
||||
"cats": dict(self.reference.cats),
|
||||
"entities": doc_to_biluo_tags(self.reference),
|
||||
"spans": self._spans_to_dict(),
|
||||
"links": self._links_to_dict()
|
||||
},
|
||||
"token_annotation": {
|
||||
|
@ -376,6 +377,18 @@ cdef class Example:
|
|||
}
|
||||
}
|
||||
|
||||
def _spans_to_dict(self):
    """Convert the span groups of the reference doc into plain tuples.

    Every span in every group of self.reference.spans is written as a
    (start_char, end_char, label_, kb_id_) tuple, so the result holds no
    Span objects.

    RETURNS (dict): Maps each spans key to the list of tuples for that
        group, in the group's iteration order. Groups without spans map to
        an empty list.
    """
    return {
        key: [
            (span.start_char, span.end_char, span.label_, span.kb_id_)
            for span in self.reference.spans[key]
        ]
        for key in self.reference.spans
    }
|
||||
|
||||
|
||||
def _links_to_dict(self):
|
||||
links = {}
|
||||
for ent in self.reference.ents:
|
||||
|
|
|
@ -395,12 +395,13 @@ file to keep track of your settings and hyperparameters and your own
|
|||
> "pos": List[str],
|
||||
> "morphs": List[str],
|
||||
> "sent_starts": List[Optional[bool]],
|
||||
> "deps": List[string],
|
||||
> "deps": List[str],
|
||||
> "heads": List[int],
|
||||
> "entities": List[str],
|
||||
> "entities": List[(int, int, str)],
|
||||
> "cats": Dict[str, float],
|
||||
> "links": Dict[(int, int), dict],
|
||||
> "spans": Dict[str, List[Tuple]],
|
||||
> }
|
||||
> ```
|
||||
|
||||
|
@ -417,9 +418,10 @@ file to keep track of your settings and hyperparameters and your own
|
|||
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
||||
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
||||
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
||||
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
||||
| `entities` | **Option 2:** List of `(start_char, end_char, label)` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
||||
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
||||
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
||||
| `spans` | Dictionary of `spans_key`/`List[Tuple]` pairs defining the spans for each spans key as `(start_char, end_char, label, kb_id)` tuples. ~~Dict[str, List[Tuple[int, int, str, str]]]~~ |
|
||||
|
||||
<Infobox title="Notes and caveats">
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user