2019-03-21 20:55:01 +03:00
|
|
|
# coding: utf-8
|
2019-03-22 18:18:04 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
2019-03-21 20:55:01 +03:00
|
|
|
"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm.
|
|
|
|
This is currently still a dummy algorithm: it simply picks the entity with the highest prior probability for a given alias.
|
|
|
|
"""
|
|
|
|
import spacy
|
|
|
|
from spacy.kb import KnowledgeBase
|
|
|
|
|
|
|
|
|
2019-03-22 01:17:25 +03:00
|
|
|
def create_kb(vocab):
    """Build a tiny demonstration knowledge base with three entities and two aliases.

    Progress messages and the final KB sizes are printed to stdout.

    vocab: the vocabulary object handed to the KnowledgeBase constructor.
    RETURNS: the populated KnowledgeBase instance.
    """
    knowledge_base = KnowledgeBase(vocab=vocab, entity_vector_length=1)

    douglas = "Q1004791_Douglas"
    douglas_adams = "Q42_Douglas_Adams"
    douglas_haig = "Q5301561_Douglas_Haig"

    # adding entities: every entity gets prob 0.5 and its own length-1 vector
    entity_specs = (
        (douglas, [0]),
        (douglas_adams, [1]),
        (douglas_haig, [2]),
    )
    for entity_id, vector in entity_specs:
        print("adding entity", entity_id)
        knowledge_base.add_entity(entity=entity_id, prob=0.5, entity_vector=vector)

    # adding aliases: (alias text, candidate entities, prior probability per candidate)
    alias_specs = (
        ("Douglas", [douglas, douglas_adams, douglas_haig], [0.6, 0.1, 0.2]),
        ("Douglas Adams", [douglas_adams], [0.9]),
    )
    print()
    for alias_text, candidate_ids, priors in alias_specs:
        print("adding alias", alias_text)
        knowledge_base.add_alias(alias=alias_text, entities=candidate_ids, probabilities=priors)

    print()
    print(
        "kb size:",
        len(knowledge_base),
        knowledge_base.get_size_entities(),
        knowledge_base.get_size_aliases(),
    )

    return knowledge_base
|
|
|
|
|
|
|
|
|
2019-03-22 01:17:25 +03:00
|
|
|
def add_el(kb, nlp):
    """Attach an entity linker backed by `kb` to `nlp` and run it on a sample text.

    The function creates an 'entity_linker' pipe, gives it the knowledge base,
    appends it as the last pipeline component and calls nlp.begin_training().
    It then prints the KB candidates for two aliases, processes a short text,
    and prints the per-token and per-entity annotations.

    kb: the knowledge base to link against.
    nlp: the pipeline object that receives the entity linker.
    RETURNS: None; all results are printed to stdout.
    """
    linker = nlp.create_pipe(name='entity_linker', config={"context_width": 64})
    linker.set_kb(kb)
    nlp.add_pipe(linker, last=True)
    # NOTE(review): begin_training() is invoked on the whole pipeline, not only on
    # the new linker -- confirm this leaves the already-loaded components intact.
    nlp.begin_training()
    # Presumably makes the prediction depend on the alias prior only (context
    # ignored), in line with the "dummy algorithm" described in the module docstring.
    linker.context_weight = 0
    linker.prior_weight = 1

    # Show which candidates the KB offers for each alias.
    for alias in ("Douglas Adams", "Douglas"):
        candidates = nlp.linker.kb.get_candidates(alias)
        print()
        print(len(candidates), "candidate(s) for", alias, ":")
        for candidate in candidates:
            print(" ", candidate.entity_, candidate.prior_prob)

    text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is called Arthur Dent."
    )
    doc = nlp(text)

    # Token-level view: text, entity type and linked KB identifier.
    print()
    for tok in doc:
        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)

    # Span-level view of the recognised entities.
    print()
    for span in doc.ents:
        print("ent", span.text, span.label_, span.kb_id_)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Load the small English model, build the demo KB on its vocabulary,
    # then attach the entity linker and print the results.
    pipeline = spacy.load('en_core_web_sm')
    demo_kb = create_kb(pipeline.vocab)
    add_el(demo_kb, pipeline)
|