From 5f002e9cede44a4ca8ef9ee9a74c6dea0e0455fb Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Thu, 14 Mar 2019 15:48:40 +0100
Subject: [PATCH] annotate kb_id through ents in doc

---
 sandbox_test_sofie/testing_el.py | 13 +++++++++----
 spacy/morphology.pxd             |  2 --
 spacy/morphology.pyx             |  3 ---
 spacy/pipeline/pipes.pyx         |  6 ++++--
 spacy/structs.pxd                |  3 +--
 spacy/tokens/doc.pyx             | 16 +++++++++++-----
 spacy/tokens/span.pxd            |  1 +
 spacy/tokens/span.pyx            | 11 ++++++++++-
 spacy/tokens/token.pyx           | 24 ++++++++++++++++--------
 9 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py
index 8d9b0c21d..7883e44d4 100644
--- a/sandbox_test_sofie/testing_el.py
+++ b/sandbox_test_sofie/testing_el.py
@@ -3,18 +3,23 @@ import spacy
 
 def add_el():
     nlp = spacy.load('en_core_web_sm')
-    print("pipes", nlp.pipe_names)
+    print("pipes before:", nlp.pipe_names)
 
     el_pipe = nlp.create_pipe(name='el')
     nlp.add_pipe(el_pipe, last=True)
 
-    print("pipes", nlp.pipe_names)
+    print("pipes after:", nlp.pipe_names)
     print()
 
-    text = "Australian striker John hits century"
+    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
     doc = nlp(text)
+
     for token in doc:
-        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)
 
 
 if __name__ == "__main__":
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index d674140b0..d0110b300 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,8 +43,6 @@ cdef class Morphology:
 
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
-
 
 cdef enum univ_morph_t:
     NIL = 0
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 92ca67f18..bd821d76f 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -122,9 +122,6 @@ cdef class Morphology:
         else:
             flags[0] &= ~(one << flag_id)
 
-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
-        token.kb_id = kb_id
-
     def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                          force=False):
         """Add a special-case rule to the morphological analyser. Tokens whose
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 4eb3ecc80..e1e5471be 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1086,8 +1086,10 @@ class EntityLinker(Pipe):
     def set_annotations(self, docs, scores, tensors=None):
         # TODO Sofie: actually implement this class instead of dummy implementation
         for i, doc in enumerate(docs):
-            for token in doc:
-                token.kb_id = 342
+            for ent in doc.ents:
+                if ent.label_ in ["PERSON", "PER"]:
+                    for token in ent:
+                        token.ent_kb_id_ = "Q42"
 
     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
         scores, bp_scores = self.model.begin_update(docs, drop=drop)
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 86b738a5c..154202c0d 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -70,6 +70,5 @@ cdef struct TokenC:
     int sent_start
     int ent_iob
     attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+    attr_t ent_kb_id
     hash_t ent_id
-
-    hash_t kb_id
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 97ac10f76..7640368ec 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -279,7 +279,7 @@ cdef class Doc:
     def doc(self):
         return self
 
-    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
         """Create a `Span` object from the slice `doc.text[start : end]`.
 
         doc (Doc): The parent document.
@@ -287,12 +287,15 @@ cdef class Doc:
         end (int): The index of the first character after the span.
         label (uint64 or string): A label to attach to the Span, e.g. for
             named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
             the span.
         RETURNS (Span): The newly constructed object.
         """
         if not isinstance(label, int):
             label = self.vocab.strings.add(label)
+        if not isinstance(kb_id, int):
+            kb_id = self.vocab.strings.add(kb_id)
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
@@ -301,7 +304,7 @@ cdef class Doc:
             return None
         # Currently we have the token index, we want the range-end index
         end += 1
-        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
         return span
 
     def similarity(self, other):
@@ -438,6 +441,7 @@ cdef class Doc:
             cdef const TokenC* token
             cdef int start = -1
             cdef attr_t label = 0
+            cdef attr_t kb_id = 0
             output = []
             for i in range(self.length):
                 token = &self.c[i]
@@ -447,16 +451,18 @@ cdef class Doc:
                         raise ValueError(Errors.E093.format(seq=' '.join(seq)))
                 elif token.ent_iob == 2 or token.ent_iob == 0:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = -1
                     label = 0
+                    kb_id = 0
                 elif token.ent_iob == 3:
                     if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                     start = i
                     label = token.ent_type
+                    kb_id = token.ent_kb_id
             if start != -1:
-                output.append(Span(self, start, self.length, label=label))
+                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
             return tuple(output)
 
         def __set__(self, ents):
diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 9645189a5..f6f88a23e 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -11,6 +11,7 @@ cdef class Span:
     cdef readonly int start_char
     cdef readonly int end_char
     cdef readonly attr_t label
+    cdef readonly attr_t kb_id
 
     cdef public _vector
     cdef public _vector_norm
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index a418fc13f..f65c84ffb 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -45,13 +45,14 @@ cdef class Span:
         return Underscore.span_extensions.pop(name)
 
     def __cinit__(self, Doc doc, int start, int end, label=0,
-                  vector=None, vector_norm=None):
+                  vector=None, vector_norm=None, kb_id=0):
         """Create a `Span` object from the slice `doc[start : end]`.
 
         doc (Doc): The parent document.
         start (int): The index of the first token of the span.
         end (int): The index of the first token after the span.
         label (uint64): A label to attach to the Span, e.g. for named entities.
+        kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
         vector (ndarray[ndim=1, dtype='float32']): A meaning representation
             of the span.
         RETURNS (Span): The newly constructed object.
@@ -73,6 +74,7 @@ cdef class Span:
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
+        self.kb_id = kb_id
 
     def __richcmp__(self, Span other, int op):
         if other is None:
@@ -592,6 +594,13 @@ cdef class Span:
         def __set__(self, unicode label_):
             self.label = self.doc.vocab.strings.add(label_)
 
+    property kb_id_:
+        """RETURNS (unicode): The span's KB ID as a string, read from `doc.vocab.strings`; assigning always raises NotImplementedError."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.kb_id]
+        def __set__(self, unicode kb_id_):
+            raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
+
 
 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
     # Don't allow spaces to be the root, if there are
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 39e408a89..ccf2f8249 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -307,14 +307,6 @@ cdef class Token:
         def __set__(self, attr_t tag):
             self.vocab.morphology.assign_tag(self.c, tag)
 
-    property kb_id:
-        """RETURNS (uint64): ID of entity (after Entity Linking)."""
-        def __get__(self):
-            return self.c.kb_id
-
-        def __set__(self, attr_t kb_id):
-            self.vocab.morphology.assign_kb_id(self.c, kb_id)
-
     property dep:
         """RETURNS (uint64): ID of syntactic dependency label."""
         def __get__(self):
@@ -699,6 +691,22 @@ cdef class Token:
         def __set__(self, name):
             self.c.ent_id = self.vocab.strings.add(name)
 
+    property ent_kb_id:
+        """RETURNS (uint64): Named entity KB ID, read from and written directly to the token's `c.ent_kb_id` field."""
+        def __get__(self):
+            return self.c.ent_kb_id
+
+        def __set__(self, attr_t ent_kb_id):
+            self.c.ent_kb_id = ent_kb_id
+
+    property ent_kb_id_:
+        """RETURNS (unicode): Named entity KB ID as a string; assigning adds the value to `vocab.strings` and stores the returned ID."""
+        def __get__(self):
+            return self.vocab.strings[self.c.ent_kb_id]
+
+        def __set__(self, ent_kb_id):
+            self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
+
     property whitespace_:
         """RETURNS (unicode): The trailing whitespace character, if present.
         """