diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx
index 402228994..b5d1b1402 100644
--- a/spacy/gold/example.pyx
+++ b/spacy/gold/example.pyx
@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [],  # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],
@@ -141,6 +141,14 @@ cdef class Example:
             }
         }
 
+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index d98a93f2f..9e63f8a98 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -200,13 +200,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(