spaCy/examples/pipeline/wikidata_entity_linking.py

# coding: utf-8
"""Demonstrate how to build a knowledge base from WikiData and run an
Entity Linking algorithm over it."""
from __future__ import unicode_literals

import bz2
import json

import spacy
from spacy.kb import KnowledgeBase


def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)
    _read_wikidata()

    # adding entities
    # kb.add_entity(entity=entity, prob=prob)

    # adding aliases
    # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])
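
    # Illustrative, hardcoded placeholder data (an assumption, not part of the
    # original example), using the add_entity/add_alias signatures shown in the
    # comments above: "Q42" is the WikiData QID for Douglas Adams. Real code
    # would fill the KB from the fields parsed out of the dump below.
    kb.add_entity(entity="Q42", prob=0.9)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])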

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb


def _read_wikidata():
    """Read the JSON WikiData dump and print the fields of interest."""
    # TODO remove hardcoded path

    languages = {'en', 'de'}
    properties = {'P31'}
    sites = {'enwiki'}
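
    # Each line of the dump holds one JSON entity. Abridged sketch of the
    # fields read below (structure inferred from the parsing code; the
    # values are illustrative):
    # {"id": "Q42", "type": "item",
    #  "labels": {"en": {"value": "Douglas Adams"}},
    #  "descriptions": {"en": {"value": "English writer and humorist"}},
    #  "aliases": {"en": [{"value": "Douglas Noel Adams"}]},
    #  "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}}]},
    #  "sitelinks": {"enwiki": {"title": "Douglas Adams"}}}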

    with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
        line = file.readline()
        cnt = 1
        # only parse the first few entries while developing
        while line and cnt < 10:
            clean_line = line.strip()
            # strip the trailing comma that separates entries in the dump
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            # skip the opening '[' and closing ']' lines of the JSON array
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                unique_id = obj["id"]
                print("ID:", unique_id)

                entry_type = obj["type"]
                print("type:", entry_type)

                # TODO: filter on rank: preferred, normal or deprecated
                claims = obj["claims"]
                for prop in properties:
                    claim_property = claims.get(prop, None)
                    if claim_property:
                        for cp in claim_property:
                            # "novalue"/"somevalue" snaks carry no datavalue
                            cp_value = cp['mainsnak'].get('datavalue')
                            if cp_value:
                                print(prop, cp_value['value']['id'])

                # not every entry has a sitelink for every site
                entry_sites = obj.get("sitelinks", {})
                for site in sites:
                    site_value = entry_sites.get(site, None)
                    if site_value:
                        print(site, ":", site_value['title'])

                labels = obj["labels"]
                if labels:
                    for lang in languages:
                        lang_label = labels.get(lang, None)
                        if lang_label:
                            print("label (" + lang + "):", lang_label["value"])

                descriptions = obj["descriptions"]
                if descriptions:
                    for lang in languages:
                        lang_descr = descriptions.get(lang, None)
                        if lang_descr:
                            print("description (" + lang + "):", lang_descr["value"])

                aliases = obj["aliases"]
                if aliases:
                    for lang in languages:
                        lang_aliases = aliases.get(lang, None)
                        if lang_aliases:
                            for item in lang_aliases:
                                print("alias (" + lang + "):", item["value"])

                print()

            line = file.readline()
            cnt += 1


def add_el(kb, nlp):
    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)
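
    # The entity linker is expected to resolve each recognized entity mention
    # against the KB and write the matched ID to ent.kb_id_ / token.ent_kb_id_,
    # printed below; mentions without a KB match keep an empty kb_id_.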

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is called Arthur Dent."
    doc = nlp(text)

    print()
    for token in doc:
        print("token", token.text, token.ent_type_, token.ent_kb_id_)

    print()
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)


if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    my_kb = create_kb(nlp.vocab)
    # add_el(my_kb, nlp)
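    # Enable add_el once the KB holds the entities and aliases you want to
    # link against, rather than the hardcoded placeholder data above.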