From 5f002e9cede44a4ca8ef9ee9a74c6dea0e0455fb Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 14 Mar 2019 15:48:40 +0100 Subject: [PATCH] annotate kb_id through ents in doc --- sandbox_test_sofie/testing_el.py | 13 +++++++++---- spacy/morphology.pxd | 2 -- spacy/morphology.pyx | 3 --- spacy/pipeline/pipes.pyx | 6 ++++-- spacy/structs.pxd | 3 +-- spacy/tokens/doc.pyx | 16 +++++++++++----- spacy/tokens/span.pxd | 1 + spacy/tokens/span.pyx | 11 ++++++++++- spacy/tokens/token.pyx | 24 ++++++++++++++++-------- 9 files changed, 52 insertions(+), 27 deletions(-) diff --git a/sandbox_test_sofie/testing_el.py b/sandbox_test_sofie/testing_el.py index 8d9b0c21d..7883e44d4 100644 --- a/sandbox_test_sofie/testing_el.py +++ b/sandbox_test_sofie/testing_el.py @@ -3,18 +3,23 @@ import spacy def add_el(): nlp = spacy.load('en_core_web_sm') - print("pipes", nlp.pipe_names) + print("pipes before:", nlp.pipe_names) el_pipe = nlp.create_pipe(name='el') nlp.add_pipe(el_pipe, last=True) - print("pipes", nlp.pipe_names) + print("pipes after:", nlp.pipe_names) print() - text = "Australian striker John hits century" + text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel." doc = nlp(text) + for token in doc: - print("token", token.text, token.tag_, token.pos_, token.kb_id) + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) if __name__ == "__main__": diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d674140b0..d0110b300 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -43,8 +43,6 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 - cdef int assign_kb_id(self, TokenC* token, kb_id) except -1 - cdef enum univ_morph_t: NIL = 0 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 92ca67f18..bd821d76f 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -122,9 +122,6 @@ cdef class Morphology: else: flags[0] &= ~(one << flag_id) - cdef int assign_kb_id(self, TokenC* token, kb_id) except -1: - token.kb_id = kb_id - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): """Add a special-case rule to the morphological analyser. Tokens whose diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 4eb3ecc80..e1e5471be 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1086,8 +1086,10 @@ class EntityLinker(Pipe): def set_annotations(self, docs, scores, tensors=None): # TODO Sofie: actually implement this class instead of dummy implementation for i, doc in enumerate(docs): - for token in doc: - token.kb_id = 342 + for ent in doc.ents: + if ent.label_ in ["PERSON", "PER"]: + for token in ent: + token.ent_kb_id_ = "Q42" def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): scores, bp_scores = self.model.begin_update(docs, drop=drop) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 86b738a5c..154202c0d 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -70,6 +70,5 @@ cdef struct TokenC: int sent_start int ent_iob attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. + attr_t ent_kb_id hash_t ent_id - - hash_t kb_id diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 97ac10f76..7640368ec 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -279,7 +279,7 @@ cdef class Doc: def doc(self): return self - def char_span(self, int start_idx, int end_idx, label=0, vector=None): + def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None): """Create a `Span` object from the slice `doc.text[start : end]`. doc (Doc): The parent document. @@ -287,12 +287,15 @@ cdef class Doc: end (int): The index of the first character after the span. label (uint64 or string): A label to attach to the Span, e.g. for named entities. + kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. """ if not isinstance(label, int): label = self.vocab.strings.add(label) + if not isinstance(kb_id, int): + kb_id = self.vocab.strings.add(kb_id) cdef int start = token_by_start(self.c, self.length, start_idx) if start == -1: return None @@ -301,7 +304,7 @@ cdef class Doc: return None # Currently we have the token index, we want the range-end index end += 1 - cdef Span span = Span(self, start, end, label=label, vector=vector) + cdef Span span = Span(self, start, end, label=label, kb_id=kb_id, vector=vector) return span def similarity(self, other): @@ -438,6 +441,7 @@ cdef class Doc: cdef const TokenC* token cdef int start = -1 cdef attr_t label = 0 + cdef attr_t kb_id = 0 output = [] for i in range(self.length): token = &self.c[i] @@ -447,16 +451,18 @@ cdef class Doc: raise ValueError(Errors.E093.format(seq=' '.join(seq))) elif token.ent_iob == 2 or token.ent_iob == 0: if start != -1: - output.append(Span(self, start, i, label=label)) + output.append(Span(self, start, i, label=label, kb_id=kb_id)) start = -1 label = 0 + kb_id = 0 elif token.ent_iob == 3: if start != -1: - output.append(Span(self, start, i, label=label)) + output.append(Span(self, start, i, label=label, kb_id=kb_id)) start = i label = token.ent_type + kb_id = token.ent_kb_id if start != -1: - output.append(Span(self, start, self.length, label=label)) + output.append(Span(self, start, self.length, label=label, kb_id=kb_id)) return tuple(output) def __set__(self, ents): diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd index 9645189a5..f6f88a23e 100644 --- a/spacy/tokens/span.pxd +++ b/spacy/tokens/span.pxd @@ -11,6 +11,7 @@ cdef class Span: cdef readonly int start_char cdef readonly int end_char cdef readonly attr_t label + cdef readonly attr_t kb_id cdef public _vector cdef public _vector_norm diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index a418fc13f..f65c84ffb 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -45,13 +45,14 @@ cdef class Span: return Underscore.span_extensions.pop(name) def __cinit__(self, Doc doc, int start, int end, label=0, - vector=None, vector_norm=None): + vector=None, vector_norm=None, kb_id=0): """Create a `Span` object from the slice `doc[start : end]`. doc (Doc): The parent document. start (int): The index of the first token of the span. end (int): The index of the first token after the span. label (uint64): A label to attach to the Span, e.g. for named entities. + kb_id (uint64): An identifier from a Knowledge Base to capture the meaning of a named entity. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object. @@ -73,6 +74,7 @@ cdef class Span: self.label = label self._vector = vector self._vector_norm = vector_norm + self.kb_id = kb_id def __richcmp__(self, Span other, int op): if other is None: @@ -592,6 +594,13 @@ cdef class Span: def __set__(self, unicode label_): self.label = self.doc.vocab.strings.add(label_) + property kb_id_: + """RETURNS (unicode): The named entity's KB ID.""" + def __get__(self): + return self.doc.vocab.strings[self.kb_id] + def __set__(self, unicode kb_id_): + raise NotImplementedError(TempErrors.T007.format(attr='kb_id_')) + cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: # Don't allow spaces to be the root, if there are diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 39e408a89..ccf2f8249 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -307,14 +307,6 @@ cdef class Token: def __set__(self, attr_t tag): self.vocab.morphology.assign_tag(self.c, tag) - property kb_id: - """RETURNS (uint64): ID of entity (after Entity Linking).""" - def __get__(self): - return self.c.kb_id - - def __set__(self, attr_t kb_id): - self.vocab.morphology.assign_kb_id(self.c, kb_id) - property dep: """RETURNS (uint64): ID of syntactic dependency label.""" def __get__(self): @@ -699,6 +691,22 @@ cdef class Token: def __set__(self, name): self.c.ent_id = self.vocab.strings.add(name) + property ent_kb_id: + """RETURNS (uint64): Named entity KB ID.""" + def __get__(self): + return self.c.ent_kb_id + + def __set__(self, attr_t ent_kb_id): + self.c.ent_kb_id = ent_kb_id + + property ent_kb_id_: + """RETURNS (unicode): Named entity KB ID.""" + def __get__(self): + return self.vocab.strings[self.c.ent_kb_id] + + def __set__(self, ent_kb_id): + self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) + property whitespace_: """RETURNS (unicode): The trailing whitespace character, if present. """