mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	select candidate with highest prior probability
This commit is contained in:
		
							parent
							
								
									7b708ab8a4
								
							
						
					
					
						commit
						1ee0e78fd7
					
				
							
								
								
									
										69
									
								
								examples/pipeline/dummy_entity_linking.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										69
									
								
								examples/pipeline/dummy_entity_linking.py
									
									
									
									
									
										Normal file
									
								
							| 
						 | 
					@ -0,0 +1,69 @@
 | 
				
			||||||
 | 
					# coding: utf-8
 | 
				
			||||||
 | 
					"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
 | 
				
			||||||
 | 
					Currently still a bit of a dummy algorithm: it simply takes the entity with the highest prior probability for a given alias.
 | 
				
			||||||
 | 
					"""
 | 
				
			||||||
 | 
					import spacy
 | 
				
			||||||
 | 
					from spacy.kb import KnowledgeBase
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def create_kb():
					    """Build the small demo knowledge base used by this example.

					    Three entities are registered (each with prob=0.5), followed by two
					    aliases: "Douglas", shared by all three entities, and "Douglas Adams",
					    attached to a single entity. Progress and the final sizes are printed.

					    RETURNS (KnowledgeBase): the populated knowledge base.
					    """
					    demo_kb = KnowledgeBase()

					    # adding entities: (entity ID, entity name) pairs, registered in order
					    entity_specs = (
					        ("Q1004791", "Douglas"),
					        ("Q42", "Douglas Adams"),
					        ("Q5301561", "Douglas Haig"),
					    )
					    for entity_id, entity_name in entity_specs:
					        print("adding entity", entity_id)
					        demo_kb.add_entity(entity_id=entity_id, entity_name=entity_name, prob=0.5)

					    # adding aliases
					    print()
					    shared_alias = "Douglas"
					    print("adding alias", shared_alias, "to all three entities")
					    demo_kb.add_alias(
					        alias=shared_alias,
					        entities=["Q1004791", "Q42", "Q5301561"],
					        probabilities=[0.1, 0.6, 0.2],
					    )

					    unique_alias = "Douglas Adams"
					    print("adding alias", unique_alias, "to just the one entity")
					    demo_kb.add_alias(alias=unique_alias, entities=["Q42"], probabilities=[0.9])

					    # report the overall size plus the entity and alias counts
					    print()
					    total_size = len(demo_kb)
					    n_entities = demo_kb.get_size_entities()
					    n_aliases = demo_kb.get_size_aliases()
					    print("kb size:", total_size, n_entities, n_aliases)

					    return demo_kb
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def add_el(kb):
					    """Attach an 'el' pipe backed by `kb` to a loaded pipeline and show its output.

					    Loads 'en_core_web_sm', appends the 'el' component as the last pipe,
					    prints the KB candidates for two aliases, then processes a sample
					    sentence and prints the KB ID found on every token and every entity.

					    kb: the knowledge base passed to the 'el' pipe through its config.
					    """
					    nlp = spacy.load('en_core_web_sm')

					    # create the pipe and add it at the end of the pipeline in one step
					    nlp.add_pipe(nlp.create_pipe(name='el', config={"kb": kb}), last=True)

					    for alias in ("Douglas Adams", "Douglas"):
					        candidates = nlp.linker.kb.get_candidates(alias)
					        print()
					        print(len(candidates), "candidate(s) for", alias, ":")
					        for cand in candidates:
					            print(" ", cand.entity_id_, cand.entity_name_, cand.alias_, cand.prior_prob)

					    # adjacent literals are concatenated into the same single sentence
					    text = (
					        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
					        "Douglas reminds us to always bring our towel."
					    )
					    doc = nlp(text)

					    print()
					    for tok in doc:
					        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)

					    print()
					    for span in doc.ents:
					        print("ent", span.text, span.label_, span.kb_id_)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if __name__ == "__main__":
					    # build the demo KB and feed it straight into the entity-linking demo
					    add_el(create_kb())
 | 
				
			||||||
							
								
								
									
										10
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								spacy/kb.pxd
									
									
									
									
									
								
							| 
						 | 
					@ -44,15 +44,7 @@ cdef struct _AliasC:
 | 
				
			||||||
    vector[float] probs
 | 
					    vector[float] probs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# TODO: document
 | 
					# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
 | 
				
			||||||
cdef class Entity:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef readonly KnowledgeBase kb
 | 
					 | 
				
			||||||
    cdef hash_t entity_id_hash
 | 
					 | 
				
			||||||
    cdef float confidence
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
# TODO: document
 | 
					 | 
				
			||||||
cdef class Candidate:
 | 
					cdef class Candidate:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef readonly KnowledgeBase kb
 | 
					    cdef readonly KnowledgeBase kb
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										26
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							
							
						
						
									
										26
									
								
								spacy/kb.pyx
									
									
									
									
									
								
							| 
						 | 
					@ -3,28 +3,6 @@
 | 
				
			||||||
from spacy.errors import user_warning
 | 
					from spacy.errors import user_warning
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
cdef class Entity:
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
 | 
					 | 
				
			||||||
        self.kb = kb
 | 
					 | 
				
			||||||
        self.entity_id_hash = entity_id_hash
 | 
					 | 
				
			||||||
        self.confidence = confidence
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    property kb_id_:
 | 
					 | 
				
			||||||
        """RETURNS (unicode): ID of this entity in the KB"""
 | 
					 | 
				
			||||||
        def __get__(self):
 | 
					 | 
				
			||||||
            return self.kb.strings[self.entity_id_hash]
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    property kb_id:
 | 
					 | 
				
			||||||
        """RETURNS (uint64): hash of the entity's KB ID"""
 | 
					 | 
				
			||||||
        def __get__(self):
 | 
					 | 
				
			||||||
            return self.entity_id_hash
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    property confidence:
 | 
					 | 
				
			||||||
        def __get__(self):
 | 
					 | 
				
			||||||
            return self.confidence
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class Candidate:
 | 
					cdef class Candidate:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
 | 
					    def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
 | 
				
			||||||
| 
						 | 
					@ -103,7 +81,8 @@ cdef class KnowledgeBase:
 | 
				
			||||||
            return
 | 
					            return
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        cdef int32_t dummy_value = 342
 | 
					        cdef int32_t dummy_value = 342
 | 
				
			||||||
        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
 | 
					        self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
 | 
				
			||||||
 | 
					                          vector_rows=&dummy_value, feats_row=dummy_value)
 | 
				
			||||||
        # TODO self._vectors_table.get_pointer(vectors),
 | 
					        # TODO self._vectors_table.get_pointer(vectors),
 | 
				
			||||||
        # self._features_table.get(features))
 | 
					        # self._features_table.get(features))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -155,6 +134,7 @@ cdef class KnowledgeBase:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_candidates(self, unicode alias):
 | 
					    def get_candidates(self, unicode alias):
 | 
				
			||||||
 | 
					        """ TODO: where to put this functionality ?"""
 | 
				
			||||||
        cdef hash_t alias_hash = self.strings[alias]
 | 
					        cdef hash_t alias_hash = self.strings[alias]
 | 
				
			||||||
        alias_index = <int64_t>self._alias_index.get(alias_hash)
 | 
					        alias_index = <int64_t>self._alias_index.get(alias_hash)
 | 
				
			||||||
        alias_entry = self._aliases_table[alias_index]
 | 
					        alias_entry = self._aliases_table[alias_index]
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1086,12 +1086,17 @@ class EntityLinker(Pipe):
 | 
				
			||||||
            yield from docs
 | 
					            yield from docs
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def set_annotations(self, docs, scores, tensors=None):
 | 
					    def set_annotations(self, docs, scores, tensors=None):
 | 
				
			||||||
        # TODO Sofie: actually implement this class instead of dummy implementation
 | 
					        """
 | 
				
			||||||
 | 
					        Currently implemented as taking the KB entry with highest prior probability for each named entity
 | 
				
			||||||
 | 
					        TODO: actually use context etc
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
        for i, doc in enumerate(docs):
 | 
					        for i, doc in enumerate(docs):
 | 
				
			||||||
            for ent in doc.ents:
 | 
					            for ent in doc.ents:
 | 
				
			||||||
                if ent.label_ in ["PERSON", "PER"]:
 | 
					                candidates = self.kb.get_candidates(ent.text)
 | 
				
			||||||
 | 
					                if candidates:
 | 
				
			||||||
 | 
					                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
 | 
				
			||||||
                    for token in ent:
 | 
					                    for token in ent:
 | 
				
			||||||
                        token.ent_kb_id_ = "Q42"
 | 
					                        token.ent_kb_id_ = best_candidate.entity_id_
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def get_loss(self, docs, golds, scores):
 | 
					    def get_loss(self, docs, golds, scores):
 | 
				
			||||||
        # TODO
 | 
					        # TODO
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,73 +0,0 @@
 | 
				
			||||||
# coding: utf-8
 | 
					 | 
				
			||||||
import spacy
 | 
					 | 
				
			||||||
from spacy.kb import KnowledgeBase
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def create_kb():
 | 
					 | 
				
			||||||
    mykb = KnowledgeBase()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # adding entities
 | 
					 | 
				
			||||||
    entity_0 = "Q0"  # douglas adams
 | 
					 | 
				
			||||||
    print(" adding entity", entity_0)
 | 
					 | 
				
			||||||
    mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    entity_42 = "Q42"   # douglas adams
 | 
					 | 
				
			||||||
    print(" adding entity", entity_42)
 | 
					 | 
				
			||||||
    mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    entity_5301561 = "Q5301561"
 | 
					 | 
				
			||||||
    print(" adding entity", entity_5301561)
 | 
					 | 
				
			||||||
    mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    # adding aliases
 | 
					 | 
				
			||||||
    alias1 = "douglassss"
 | 
					 | 
				
			||||||
    print(" adding alias", alias1, "to Q42 and Q5301561")
 | 
					 | 
				
			||||||
    mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    alias3 = "adam"
 | 
					 | 
				
			||||||
    print(" adding alias", alias3, "to Q42")
 | 
					 | 
				
			||||||
    mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    return mykb
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
def add_el(kb):
 | 
					 | 
				
			||||||
    nlp = spacy.load('en_core_web_sm')
 | 
					 | 
				
			||||||
    print("pipes before:", nlp.pipe_names)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
 | 
					 | 
				
			||||||
    nlp.add_pipe(el_pipe, last=True)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print("pipes after:", nlp.pipe_names)
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
 | 
					 | 
				
			||||||
    doc = nlp(text)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    for token in doc:
 | 
					 | 
				
			||||||
        print("token", token.text, token.ent_type_, token.ent_kb_id_)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
    for ent in doc.ents:
 | 
					 | 
				
			||||||
        print("ent", ent.text, ent.label_, ent.kb_id_)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    print()
 | 
					 | 
				
			||||||
    for alias in ["douglassss", "rubbish", "adam"]:
 | 
					 | 
				
			||||||
        candidates = nlp.linker.kb.get_candidates(alias)
 | 
					 | 
				
			||||||
        print(len(candidates), "candidates for", alias, ":")
 | 
					 | 
				
			||||||
        for c in candidates:
 | 
					 | 
				
			||||||
            print(" ", c.entity_id_, c.entity_name_, c.alias_)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
if __name__ == "__main__":
 | 
					 | 
				
			||||||
    mykb = create_kb()
 | 
					 | 
				
			||||||
    add_el(mykb)
 | 
					 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user