From 1ee0e78fd7a07637f5ac31154d4c63faeba6f4cd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 18:55:01 +0100 Subject: [PATCH] select candidate with highest prior probability --- examples/pipeline/dummy_entity_linking.py | 69 +++++++++++++++++++++ spacy/kb.pxd | 10 +--- spacy/kb.pyx | 26 +------- spacy/pipeline/pipes.pyx | 11 +++- spacy/sandbox_test_sofie/testing_el.py | 73 ----------------------- 5 files changed, 81 insertions(+), 108 deletions(-) create mode 100644 examples/pipeline/dummy_entity_linking.py delete mode 100644 spacy/sandbox_test_sofie/testing_el.py diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py new file mode 100644 index 000000000..c51f321e0 --- /dev/null +++ b/examples/pipeline/dummy_entity_linking.py @@ -0,0 +1,69 @@ +# coding: utf-8 +"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm. +Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias +""" +import spacy +from spacy.kb import KnowledgeBase + + +def create_kb(): + kb = KnowledgeBase() + + # adding entities + entity_0 = "Q1004791" + print("adding entity", entity_0) + kb.add_entity(entity_id=entity_0, entity_name="Douglas", prob=0.5) + + entity_1 = "Q42" + print("adding entity", entity_1) + kb.add_entity(entity_id=entity_1, entity_name="Douglas Adams", prob=0.5) + + entity_2 = "Q5301561" + print("adding entity", entity_2) + kb.add_entity(entity_id=entity_2, entity_name="Douglas Haig", prob=0.5) + + # adding aliases + print() + alias_0 = "Douglas" + print("adding alias", alias_0, "to all three entities") + kb.add_alias(alias=alias_0, entities=["Q1004791", "Q42", "Q5301561"], probabilities=[0.1, 0.6, 0.2]) + + alias_1 = "Douglas Adams" + print("adding alias", alias_1, "to just the one entity") + kb.add_alias(alias=alias_1, entities=["Q42"], probabilities=[0.9]) + + print() + print("kb size:", len(kb), kb.get_size_entities(), 
kb.get_size_aliases()) + + return kb + + +def add_el(kb): + nlp = spacy.load('en_core_web_sm') + + el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + for alias in ["Douglas Adams", "Douglas"]: + candidates = nlp.linker.kb.get_candidates(alias) + print() + print(len(candidates), "candidate(s) for", alias, ":") + for c in candidates: + print(" ", c.entity_id_, c.entity_name_, c.alias_, c.prior_prob) + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel." + doc = nlp(text) + + print() + for token in doc: + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + mykb = create_kb() + add_el(mykb) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index cffbcd5d1..490e05036 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -44,15 +44,7 @@ cdef struct _AliasC: vector[float] probs -# TODO: document -cdef class Entity: - - cdef readonly KnowledgeBase kb - cdef hash_t entity_id_hash - cdef float confidence - - -# TODO: document +# Object used by the Entity Linker that summarizes one entity-alias candidate combination. 
cdef class Candidate: cdef readonly KnowledgeBase kb diff --git a/spacy/kb.pyx b/spacy/kb.pyx index e51cb087d..6d031fb91 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -3,28 +3,6 @@ from spacy.errors import user_warning -cdef class Entity: - - def __init__(self, KnowledgeBase kb, entity_id_hash, confidence): - self.kb = kb - self.entity_id_hash = entity_id_hash - self.confidence = confidence - - property kb_id_: - """RETURNS (unicode): ID of this entity in the KB""" - def __get__(self): - return self.kb.strings[self.entity_id_hash] - - property kb_id: - """RETURNS (uint64): hash of the entity's KB ID""" - def __get__(self): - return self.entity_id_hash - - property confidence: - def __get__(self): - return self.confidence - - cdef class Candidate: def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob): @@ -103,7 +81,8 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value) + self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, + vector_rows=&dummy_value, feats_row=dummy_value) # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -155,6 +134,7 @@ cdef class KnowledgeBase: def get_candidates(self, unicode alias): + """ TODO: where to put this functionality ?""" cdef hash_t alias_hash = self.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 6bb7da1eb..98ca9d76d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1086,12 +1086,17 @@ class EntityLinker(Pipe): yield from docs def set_annotations(self, docs, scores, tensors=None): - # TODO Sofie: actually implement this class instead of dummy implementation + """ + Currently implemented as taking the KB entry with highest prior probability for 
each named entity + TODO: actually use context etc + """ for i, doc in enumerate(docs): for ent in doc.ents: - if ent.label_ in ["PERSON", "PER"]: + candidates = self.kb.get_candidates(ent.text) + if candidates: + best_candidate = max(candidates, key=lambda c: c.prior_prob) for token in ent: - token.ent_kb_id_ = "Q42" + token.ent_kb_id_ = best_candidate.entity_id_ def get_loss(self, docs, golds, scores): # TODO diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py deleted file mode 100644 index c7b0a3a07..000000000 --- a/spacy/sandbox_test_sofie/testing_el.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -import spacy -from spacy.kb import KnowledgeBase - - -def create_kb(): - mykb = KnowledgeBase() - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - # adding entities - entity_0 = "Q0" # douglas adams - print(" adding entity", entity_0) - mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5) - - entity_42 = "Q42" # douglas adams - print(" adding entity", entity_42) - mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5) - - entity_5301561 = "Q5301561" - print(" adding entity", entity_5301561) - mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - # adding aliases - alias1 = "douglassss" - print(" adding alias", alias1, "to Q42 and Q5301561") - mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2]) - - alias3 = "adam" - print(" adding alias", alias3, "to Q42") - mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9]) - - print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) - print() - - return mykb - - -def add_el(kb): - nlp = spacy.load('en_core_web_sm') - print("pipes before:", nlp.pipe_names) - - el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) - 
nlp.add_pipe(el_pipe, last=True) - - print("pipes after:", nlp.pipe_names) - print() - - text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel." - doc = nlp(text) - - for token in doc: - print("token", token.text, token.ent_type_, token.ent_kb_id_) - - print() - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - print() - for alias in ["douglassss", "rubbish", "adam"]: - candidates = nlp.linker.kb.get_candidates(alias) - print(len(candidates), "candidates for", alias, ":") - for c in candidates: - print(" ", c.entity_id_, c.entity_name_, c.alias_) - - -if __name__ == "__main__": - mykb = create_kb() - add_el(mykb)