Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 16:07:41 +03:00)
	add links to to_dict
commit c705a28438
parent 03db143cd0
@@ -126,7 +126,7 @@ cdef class Example:
             "doc_annotation": {
                 "cats": dict(self.reference.cats),
                 "entities": biluo_tags_from_doc(self.reference),
-                "links": [], # TODO
+                "links": self._links_to_dict()
             },
             "token_annotation": {
                 "ids": [t.i+1 for t in self.reference],
@@ -141,6 +141,14 @@ cdef class Example:
             }
         }
 
+    def _links_to_dict(self):
+        links = {}
+        for ent in self.reference.ents:
+            if ent.kb_id_:
+                links[(ent.start_char, ent.end_char)] = {ent.kb_id_: 1.0}
+        return links
+
+
     def split_sents(self):
         """ Split the token annotations into multiple Examples based on
         sent_starts and return a list of the new Examples"""
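For context, a minimal standalone sketch of how the new links entry surfaces through Example.to_dict(): keys are (start_char, end_char) offsets and values map a KB ID to a probability, as rebuilt by _links_to_dict() from the reference doc's entity spans. The import path (spacy.gold) and the blank-pipeline setup are assumptions based on this development branch; the sketch mirrors the test change below rather than reproducing it.

import spacy
from spacy.gold import Example  # path assumed for this branch; released spaCy v3 exposes spacy.training.Example

nlp = spacy.blank("en")
doc = nlp("I flew to San Francisco Valley.")
offset_start = len("I flew to ")
offset_end = len("I flew to San Francisco Valley")
annots = {
    "entities": [(offset_start, offset_end, "LOC")],
    "links": {(offset_start, offset_end): {"Q816843": 1.0}},
}
example = Example.from_dict(doc, annots)
# _links_to_dict() reads the reference doc's ents and their kb_id_ values,
# so the annotations round-trip back out of to_dict():
print(example.to_dict()["doc_annotation"]["links"])
# expected: {(10, 30): {"Q816843": 1.0}}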
@@ -200,13 +200,16 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
     words = ["I flew", "to", "San Francisco", "Valley", "."]
     spaces = [True, True, True, False, False]
     doc = Doc(en_vocab, words=words, spaces=spaces)
-    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    links = {(len("I flew to "), len("I flew to San Francisco Valley")): {"Q816843": 1.0}}
+    offset_start = len("I flew to ")
+    offset_end = len("I flew to San Francisco Valley")
+    entities = [(offset_start, offset_end, "LOC")]
+    links = {(offset_start, offset_end): {"Q816843": 1.0}}
     gold_words = ["I", "flew to", "San", "Francisco Valley", "."]
     example = Example.from_dict(doc, {"words": gold_words, "entities": entities, "links": links})
     assert example.get_aligned("ENT_IOB") == [2, 2, 3, 1, 2]
     assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", "LOC", ""]
     assert example.get_aligned("ENT_KB_ID", as_string=True) == ["", "", "Q816843", "Q816843", ""]
+    assert example.to_dict()["doc_annotation"]["links"][(offset_start, offset_end)] == {"Q816843": 1.0}
 
     # additional whitespace tokens in GoldParse words
     words, spaces = get_words_and_spaces(