annotate kb_id through ents in doc

2025-10-30 23:47:31 +03:00 · 2019-03-14 15:48:40 +01:00 · 2019-03-14 15:48:40 +01:00 · 735fc2a735
commit 735fc2a735
parent d849eb2455
9 changed files with 52 additions and 27 deletions
--- a/sandbox_test_sofie/testing_el.py
+++ b/sandbox_test_sofie/testing_el.py
@@ -3,18 +3,23 @@ import spacy

 def add_el():
    nlp = spacy.load('en_core_web_sm')
-    print("pipes", nlp.pipe_names)
+    print("pipes before:", nlp.pipe_names)

    el_pipe = nlp.create_pipe(name='el')
    nlp.add_pipe(el_pipe, last=True)

-    print("pipes", nlp.pipe_names)
+    print("pipes after:", nlp.pipe_names)
    print()

-    text = "Australian striker John hits century"
+    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
    doc = nlp(text)
+
    for token in doc:
-        print("token", token.text, token.tag_, token.pos_, token.kb_id)
+        print("token", token.text, token.ent_type_, token.ent_kb_id_)
+
+    print()
+    for ent in doc.ents:
+        print("ent", ent.text, ent.label_, ent.kb_id_)


 if __name__ == "__main__":
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -43,8 +43,6 @@ cdef class Morphology:

    cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1

-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1
-

 cdef enum univ_morph_t:
    NIL = 0
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -123,9 +123,6 @@ cdef class Morphology:
        else:
            flags[0] &= ~(one << flag_id)

-    cdef int assign_kb_id(self, TokenC* token, kb_id) except -1:
-        token.kb_id = kb_id
-
    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
                         force=False):
        """Add a special-case rule to the morphological analyser. Tokens whose
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1104,8 +1104,10 @@ class EntityLinker(Pipe):
    def set_annotations(self, docs, scores, tensors=None):
        # TODO Sofie: actually implement this class instead of dummy implementation
        for i, doc in enumerate(docs):
-            for token in doc:
-                token.kb_id = 342
+            for ent in doc.ents:
+                if ent.label_ in ["PERSON", "PER"]:
+                    for token in ent:
+                        token.ent_kb_id_ = "Q42"

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -70,6 +70,5 @@ cdef struct TokenC:
    int sent_start
    int ent_iob
    attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+    attr_t ent_kb_id
    hash_t ent_id
-
-    hash_t kb_id
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -326,7 +326,7 @@ cdef class Doc:
    def doc(self):
        return self

-    def char_span(self, int start_idx, int end_idx, label=0, vector=None):
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
        """Create a `Span` object from the slice `doc.text[start : end]`.

        doc (Doc): The parent document.
@@ -334,6 +334,7 @@ cdef class Doc:
        end (int): The index of the first character after the span.
        label (uint64 or string): A label to attach to the Span, e.g. for
            named entities.
+        kb_id (uint64 or string): An ID from a knowledge base (KB) to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
            the span.
        RETURNS (Span): The newly constructed object.
@@ -342,6 +343,8 @@ cdef class Doc:
        """
        if not isinstance(label, int):
            label = self.vocab.strings.add(label)
+        if not isinstance(kb_id, int):
+            kb_id = self.vocab.strings.add(kb_id)
        cdef int start = token_by_start(self.c, self.length, start_idx)
        if start == -1:
            return None
@@ -350,7 +353,7 @@ cdef class Doc:
            return None
        # Currently we have the token index, we want the range-end index
        end += 1
-        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector)
        return span

    def similarity(self, other):
@@ -484,6 +487,7 @@ cdef class Doc:
            cdef const TokenC* token
            cdef int start = -1
            cdef attr_t label = 0
+            cdef attr_t kb_id = 0
            output = []
            for i in range(self.length):
                token = &self.c[i]
@@ -493,16 +497,18 @@ cdef class Doc:
                        raise ValueError(Errors.E093.format(seq=" ".join(seq)))
                elif token.ent_iob == 2 or token.ent_iob == 0:
                    if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = -1
                    label = 0
+                    kb_id = 0
                elif token.ent_iob == 3:
                    if start != -1:
-                        output.append(Span(self, start, i, label=label))
+                        output.append(Span(self, start, i, label=label, kb_id=kb_id))
                    start = i
                    label = token.ent_type
+                    kb_id = token.ent_kb_id
            if start != -1:
-                output.append(Span(self, start, self.length, label=label))
+                output.append(Span(self, start, self.length, label=label, kb_id=kb_id))
            return tuple(output)

        def __set__(self, ents):
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -11,6 +11,7 @@ cdef class Span:
    cdef readonly int start_char
    cdef readonly int end_char
    cdef readonly attr_t label
+    cdef readonly attr_t kb_id

    cdef public _vector
    cdef public _vector_norm
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -85,13 +85,14 @@ cdef class Span:
        return Underscore.span_extensions.pop(name)

    def __cinit__(self, Doc doc, int start, int end, label=0, vector=None,
-                  vector_norm=None):
+                  vector_norm=None, kb_id=0):
        """Create a `Span` object from the slice `doc[start : end]`.

        doc (Doc): The parent document.
        start (int): The index of the first token of the span.
        end (int): The index of the first token after the span.
        label (uint64): A label to attach to the Span, e.g. for named entities.
+        kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation
            of the span.
        RETURNS (Span): The newly constructed object.
@@ -115,6 +116,7 @@ cdef class Span:
        self.label = label
        self._vector = vector
        self._vector_norm = vector_norm
+        self.kb_id = kb_id

    def __richcmp__(self, Span other, int op):
        if other is None:
@@ -655,6 +657,13 @@ cdef class Span:
                label_ = ''
            raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))

+    property kb_id_:
+        """RETURNS (unicode): The named entity's KB ID."""
+        def __get__(self):
+            return self.doc.vocab.strings[self.kb_id]
+        def __set__(self, unicode kb_id_):
+            raise NotImplementedError(TempErrors.T007.format(attr='kb_id_'))
+

 cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
    # Don't allow spaces to be the root, if there are
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -354,14 +354,6 @@ cdef class Token:
        def __set__(self, attr_t tag):
            self.vocab.morphology.assign_tag(self.c, tag)

-    property kb_id:
-        """RETURNS (uint64): ID of entity (after Entity Linking)."""
-        def __get__(self):
-            return self.c.kb_id
-
-        def __set__(self, attr_t kb_id):
-            self.vocab.morphology.assign_kb_id(self.c, kb_id)
-
    property dep:
        """RETURNS (uint64): ID of syntactic dependency label."""
        def __get__(self):
@@ -777,6 +769,22 @@ cdef class Token:
        def __set__(self, name):
            self.c.ent_id = self.vocab.strings.add(name)

+    property ent_kb_id:
+        """RETURNS (uint64): Named entity KB ID."""
+        def __get__(self):
+            return self.c.ent_kb_id
+
+        def __set__(self, attr_t ent_kb_id):
+            self.c.ent_kb_id = ent_kb_id
+
+    property ent_kb_id_:
+        """RETURNS (unicode): Named entity KB ID."""
+        def __get__(self):
+            return self.vocab.strings[self.c.ent_kb_id]
+
+        def __set__(self, ent_kb_id):
+            self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id)
+
    @property
    def whitespace_(self):
        """RETURNS (unicode): The trailing whitespace character, if present."""