spaCy/examples/pipeline/wikidata_entity_linking.py

# coding: utf-8
"""Demonstrate how to build a knowledge base from WikiData and run an
Entity Linking algorithm over it."""
from __future__ import unicode_literals

import bz2
import json

import spacy
from spacy.kb import KnowledgeBase


def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)
    _read_wikidata()

    # adding entities
    # kb.add_entity(entity=entity, prob=prob)

    # adding aliases
    # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])
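
    # Illustrative, hardcoded placeholder data (an assumption, not part of the
    # original example), using the add_entity/add_alias signatures shown in the
    # comments above: "Q42" is the WikiData QID for Douglas Adams. Real code
    # would fill the KB from the fields parsed out of the dump below.
    kb.add_entity(entity="Q42", prob=0.9)
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])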

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb


def _read_wikidata():
    """Read the JSON WikiData dump and print the fields of interest."""
    # TODO remove hardcoded path

    languages = {'en', 'de'}
    properties = {'P31'}
    sites = {'enwiki'}
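
    # Each line of the dump holds one JSON entity. Abridged sketch of the
    # fields read below (structure inferred from the parsing code; the
    # values are illustrative):
    # {"id": "Q42", "type": "item",
    #  "labels": {"en": {"value": "Douglas Adams"}},
    #  "descriptions": {"en": {"value": "English writer and humorist"}},
    #  "aliases": {"en": [{"value": "Douglas Noel Adams"}]},
    #  "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}}]},
    #  "sitelinks": {"enwiki": {"title": "Douglas Adams"}}}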

    with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
        line = file.readline()
        cnt = 1
        # only parse the first few entries while developing
        while line and cnt < 10:
            clean_line = line.strip()
            # strip the trailing comma that separates entries in the dump
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            # skip the opening '[' and closing ']' lines of the JSON array
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                unique_id = obj["id"]
                print("ID:", unique_id)

                entry_type = obj["type"]
                print("type:", entry_type)

                # TODO: filter on rank: preferred, normal or deprecated
                claims = obj["claims"]
                for prop in properties:
                    claim_property = claims.get(prop, None)
                    if claim_property:
                        for cp in claim_property:
                            # "novalue"/"somevalue" snaks carry no datavalue
                            cp_value = cp['mainsnak'].get('datavalue')
                            if cp_value:
                                print(prop, cp_value['value']['id'])

                # not every entry has a sitelink for every site
                entry_sites = obj.get("sitelinks", {})
                for site in sites:
                    site_value = entry_sites.get(site, None)
                    if site_value:
                        print(site, ":", site_value['title'])

                labels = obj["labels"]
                if labels:
                    for lang in languages:
                        lang_label = labels.get(lang, None)
                        if lang_label:
                            print("label (" + lang + "):", lang_label["value"])

                descriptions = obj["descriptions"]
                if descriptions:
                    for lang in languages:
                        lang_descr = descriptions.get(lang, None)
                        if lang_descr:
                            print("description (" + lang + "):", lang_descr["value"])

                aliases = obj["aliases"]
                if aliases:
                    for lang in languages:
                        lang_aliases = aliases.get(lang, None)
                        if lang_aliases:
                            for item in lang_aliases:
                                print("alias (" + lang + "):", item["value"])

                print()

            line = file.readline()
            cnt += 1


def add_el(kb, nlp):
    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)
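
    # The entity linker is expected to resolve each recognized entity mention
    # against the KB and write the matched ID to ent.kb_id_ / token.ent_kb_id_,
    # printed below; mentions without a KB match keep an empty kb_id_.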

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is called Arthur Dent."
    doc = nlp(text)

    print()
    for token in doc:
        print("token", token.text, token.ent_type_, token.ent_kb_id_)

    print()
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)


if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    my_kb = create_kb(nlp.vocab)
    # add_el(my_kb, nlp)
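    # Enable add_el once the KB holds the entities and aliases you want to
    # link against, rather than the hardcoded placeholder data above.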