add links to to_dict

This commit is contained in:
svlandeg 2020-06-19 11:22:24 +02:00
parent 03db143cd0
commit c705a28438
2 changed files with 14 additions and 3 deletions

View File

@ -126,7 +126,7 @@ cdef class Example:
"doc_annotation": {
"cats": dict(self.reference.cats),
"entities": biluo_tags_from_doc(self.reference),
"links": [], # TODO
"links": self._links_to_dict()
},
"token_annotation": {
"ids": [t.i+1 for t in self.reference],
@ -141,6 +141,14 @@ cdef class Example:
}
}
def _links_to_dict(self):
    """Collect the knowledge-base links of the reference entities.

    Returns a dict keyed by the ``(start_char, end_char)`` character
    offsets of every entity in ``self.reference.ents`` that has a
    non-empty ``kb_id_``; each value is ``{kb_id_: 1.0}``. Entities whose
    ``kb_id_`` is empty are left out, so the result is an empty dict when
    no entity has a KB ID.
    """
    return {
        (ent.start_char, ent.end_char): {ent.kb_id_: 1.0}
        for ent in self.reference.ents
        # Skip entities whose kb_id_ is empty/falsy.
        if ent.kb_id_
    }
def split_sents(self):
""" Split the token annotations into multiple Examples based on
sent_starts and return a list of the new Examples"""

View File

@ -200,13 +200,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
words = ["I flew", "to", "San Francisco", "Valley", "."]
spaces = [True, True, True, False, False]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
offset_start = len("I flew to ")
offset_end = len("I flew to San Francisco Valley")
entities = [(offset_start, offset_end, "LOC")]
links = {(offset_start, offset_end): {"Q816843": 1.0}}
gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
# additional whitespace tokens in GoldParse words
words, spaces = get_words_and_spaces(