mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
select candidate with highest prior probability
This commit is contained in:
parent
24a0c4a8d4
commit
6e2433b95e
69
examples/pipeline/dummy_entity_linking.py
Normal file
69
examples/pipeline/dummy_entity_linking.py
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
# coding: utf-8
|
||||||
|
"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
|
||||||
|
Currently still a dummy algorithm: it simply picks the entity with the highest prior probability for a given alias.
|
||||||
|
"""
|
||||||
|
import spacy
|
||||||
|
from spacy.kb import KnowledgeBase
|
||||||
|
|
||||||
|
|
||||||
|
def create_kb():
    """Build the small demo knowledge base used by this example.

    Registers three entities (each with prob=0.5) and two aliases, printing
    a progress line for every addition and a size summary at the end.

    RETURNS (KnowledgeBase): the populated knowledge base.
    """
    knowledge_base = KnowledgeBase()

    # adding entities: (entity ID, entity name) pairs, added in this order
    demo_entities = (
        ("Q1004791", "Douglas"),
        ("Q42", "Douglas Adams"),
        ("Q5301561", "Douglas Haig"),
    )
    for entity_id, entity_name in demo_entities:
        print("adding entity", entity_id)
        knowledge_base.add_entity(entity_id=entity_id, entity_name=entity_name, prob=0.5)

    # adding aliases: (alias, printed note, entity IDs, per-entity probabilities)
    print()
    demo_aliases = (
        ("Douglas", "to all three entities", ["Q1004791", "Q42", "Q5301561"], [0.1, 0.6, 0.2]),
        ("Douglas Adams", "to just the one entity", ["Q42"], [0.9]),
    )
    for alias, note, entity_ids, probs in demo_aliases:
        print("adding alias", alias, note)
        knowledge_base.add_alias(alias=alias, entities=entity_ids, probabilities=probs)

    print()
    print(
        "kb size:",
        len(knowledge_base),
        knowledge_base.get_size_entities(),
        knowledge_base.get_size_aliases(),
    )

    return knowledge_base
|
||||||
|
|
||||||
|
|
||||||
|
def add_el(kb):
    """Add an 'el' pipe configured with `kb` to an English pipeline and print a demo run.

    kb: the knowledge base passed to the 'el' pipe through its config.

    Prints the candidates the knowledge base returns for two sample aliases,
    then the per-token and per-entity annotations of a sample sentence.
    Returns nothing.
    """
    nlp = spacy.load('en_core_web_sm')

    # Create the 'el' pipe around the given KB and append it to the pipeline.
    linker_pipe = nlp.create_pipe(name='el', config={"kb": kb})
    nlp.add_pipe(linker_pipe, last=True)

    # Show what the KB offers for each alias before any text is processed.
    for alias in ("Douglas Adams", "Douglas"):
        matches = nlp.linker.kb.get_candidates(alias)
        print()
        print(len(matches), "candidate(s) for", alias, ":")
        for match in matches:
            print(" ", match.entity_id_, match.entity_name_, match.alias_, match.prior_prob)

    sentence = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel."
    )
    doc = nlp(sentence)

    # Token-level view: entity type and linked KB ID for every token.
    print()
    for tok in doc:
        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)

    # Span-level view: one line per recognised entity.
    print()
    for span in doc.ents:
        print("ent", span.text, span.label_, span.kb_id_)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: build the demo KB, then run the linking demo on it.
    mykb = create_kb()
    add_el(mykb)
|
10
spacy/kb.pxd
10
spacy/kb.pxd
|
@ -44,15 +44,7 @@ cdef struct _AliasC:
|
||||||
vector[float] probs
|
vector[float] probs
|
||||||
|
|
||||||
|
|
||||||
# TODO: document
|
# Object used by the Entity Linker that summarizes one entity-alias candidate combination.
|
||||||
cdef class Entity:
|
|
||||||
|
|
||||||
cdef readonly KnowledgeBase kb
|
|
||||||
cdef hash_t entity_id_hash
|
|
||||||
cdef float confidence
|
|
||||||
|
|
||||||
|
|
||||||
# TODO: document
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
|
|
||||||
cdef readonly KnowledgeBase kb
|
cdef readonly KnowledgeBase kb
|
||||||
|
|
26
spacy/kb.pyx
26
spacy/kb.pyx
|
@ -3,28 +3,6 @@
|
||||||
from spacy.errors import user_warning
|
from spacy.errors import user_warning
|
||||||
|
|
||||||
|
|
||||||
cdef class Entity:
|
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_id_hash, confidence):
|
|
||||||
self.kb = kb
|
|
||||||
self.entity_id_hash = entity_id_hash
|
|
||||||
self.confidence = confidence
|
|
||||||
|
|
||||||
property kb_id_:
|
|
||||||
"""RETURNS (unicode): ID of this entity in the KB"""
|
|
||||||
def __get__(self):
|
|
||||||
return self.kb.strings[self.entity_id_hash]
|
|
||||||
|
|
||||||
property kb_id:
|
|
||||||
"""RETURNS (uint64): hash of the entity's KB ID"""
|
|
||||||
def __get__(self):
|
|
||||||
return self.entity_id_hash
|
|
||||||
|
|
||||||
property confidence:
|
|
||||||
def __get__(self):
|
|
||||||
return self.confidence
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Candidate:
|
cdef class Candidate:
|
||||||
|
|
||||||
def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
|
def __init__(self, KnowledgeBase kb, entity_id_hash, alias_hash, prior_prob):
|
||||||
|
@ -103,7 +81,8 @@ cdef class KnowledgeBase:
|
||||||
return
|
return
|
||||||
|
|
||||||
cdef int32_t dummy_value = 342
|
cdef int32_t dummy_value = 342
|
||||||
self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob, vector_rows=&dummy_value, feats_row=dummy_value)
|
self.c_add_entity(entity_id_hash=id_hash, entity_name_hash=name_hash, prob=prob,
|
||||||
|
vector_rows=&dummy_value, feats_row=dummy_value)
|
||||||
# TODO self._vectors_table.get_pointer(vectors),
|
# TODO self._vectors_table.get_pointer(vectors),
|
||||||
# self._features_table.get(features))
|
# self._features_table.get(features))
|
||||||
|
|
||||||
|
@ -155,6 +134,7 @@ cdef class KnowledgeBase:
|
||||||
|
|
||||||
|
|
||||||
def get_candidates(self, unicode alias):
|
def get_candidates(self, unicode alias):
|
||||||
|
""" TODO: where to put this functionality ?"""
|
||||||
cdef hash_t alias_hash = self.strings[alias]
|
cdef hash_t alias_hash = self.strings[alias]
|
||||||
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
alias_index = <int64_t>self._alias_index.get(alias_hash)
|
||||||
alias_entry = self._aliases_table[alias_index]
|
alias_entry = self._aliases_table[alias_index]
|
||||||
|
|
|
@ -1068,12 +1068,17 @@ class EntityLinker(Pipe):
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
# TODO Sofie: actually implement this class instead of dummy implementation
|
"""
|
||||||
|
Currently implemented as taking the KB entry with highest prior probability for each named entity
|
||||||
|
TODO: actually use context etc
|
||||||
|
"""
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
if ent.label_ in ["PERSON", "PER"]:
|
candidates = self.kb.get_candidates(ent.text)
|
||||||
|
if candidates:
|
||||||
|
best_candidate = max(candidates, key=lambda c: c.prior_prob)
|
||||||
for token in ent:
|
for token in ent:
|
||||||
token.ent_kb_id_ = "Q42"
|
token.ent_kb_id_ = best_candidate.entity_id_
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
# TODO
|
# TODO
|
||||||
|
|
|
@ -1,73 +0,0 @@
|
||||||
# coding: utf-8
|
|
||||||
import spacy
|
|
||||||
from spacy.kb import KnowledgeBase
|
|
||||||
|
|
||||||
|
|
||||||
def create_kb():
|
|
||||||
mykb = KnowledgeBase()
|
|
||||||
|
|
||||||
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
|
|
||||||
print()
|
|
||||||
|
|
||||||
# adding entities
|
|
||||||
entity_0 = "Q0" # douglas adams
|
|
||||||
print(" adding entity", entity_0)
|
|
||||||
mykb.add_entity(entity_id=entity_0, entity_name="queZero", prob=0.5)
|
|
||||||
|
|
||||||
entity_42 = "Q42" # douglas adams
|
|
||||||
print(" adding entity", entity_42)
|
|
||||||
mykb.add_entity(entity_id=entity_42, entity_name="que42", prob=0.5)
|
|
||||||
|
|
||||||
entity_5301561 = "Q5301561"
|
|
||||||
print(" adding entity", entity_5301561)
|
|
||||||
mykb.add_entity(entity_id=entity_5301561, entity_name="queMore", prob=0.5)
|
|
||||||
|
|
||||||
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
|
|
||||||
print()
|
|
||||||
|
|
||||||
# adding aliases
|
|
||||||
alias1 = "douglassss"
|
|
||||||
print(" adding alias", alias1, "to Q42 and Q5301561")
|
|
||||||
mykb.add_alias(alias=alias1, entities=["Q42", "Q5301561"], probabilities=[0.8, 0.2])
|
|
||||||
|
|
||||||
alias3 = "adam"
|
|
||||||
print(" adding alias", alias3, "to Q42")
|
|
||||||
mykb.add_alias(alias=alias3, entities=["Q42"], probabilities=[0.9])
|
|
||||||
|
|
||||||
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
|
|
||||||
print()
|
|
||||||
|
|
||||||
return mykb
|
|
||||||
|
|
||||||
|
|
||||||
def add_el(kb):
|
|
||||||
nlp = spacy.load('en_core_web_sm')
|
|
||||||
print("pipes before:", nlp.pipe_names)
|
|
||||||
|
|
||||||
el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
|
|
||||||
nlp.add_pipe(el_pipe, last=True)
|
|
||||||
|
|
||||||
print("pipes after:", nlp.pipe_names)
|
|
||||||
print()
|
|
||||||
|
|
||||||
text = "The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, reminds us to always bring our towel."
|
|
||||||
doc = nlp(text)
|
|
||||||
|
|
||||||
for token in doc:
|
|
||||||
print("token", token.text, token.ent_type_, token.ent_kb_id_)
|
|
||||||
|
|
||||||
print()
|
|
||||||
for ent in doc.ents:
|
|
||||||
print("ent", ent.text, ent.label_, ent.kb_id_)
|
|
||||||
|
|
||||||
print()
|
|
||||||
for alias in ["douglassss", "rubbish", "adam"]:
|
|
||||||
candidates = nlp.linker.kb.get_candidates(alias)
|
|
||||||
print(len(candidates), "candidates for", alias, ":")
|
|
||||||
for c in candidates:
|
|
||||||
print(" ", c.entity_id_, c.entity_name_, c.alias_)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
mykb = create_kb()
|
|
||||||
add_el(mykb)
|
|
Loading…
Reference in New Issue
Block a user