# coding: utf-8
"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm."""
from __future__ import unicode_literals

import re
import json
import bz2

import spacy
from spacy.kb import KnowledgeBase


def create_kb(vocab):
    kb = KnowledgeBase(vocab=vocab)

    # _read_wikidata()
    _read_wikipedia()

    # adding entities (see the illustrative _add_dummy_entities sketch below)
    # kb.add_entity(entity=entity, prob=prob)

    # adding aliases
    # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])

    print()
    print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    return kb


def _read_wikidata():
    """ Read the JSON wiki data """
    # TODO remove hardcoded path

    languages = {'en', 'de'}
    properties = {'P31'}
    sites = {'enwiki'}

    with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:
        line = file.readline()
        cnt = 1
        while line and cnt < 10:
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                unique_id = obj["id"]
                print("ID:", unique_id)

                entry_type = obj["type"]
                print("type:", entry_type)

                # TODO: filter on rank: preferred, normal or deprecated
                claims = obj["claims"]
                for prop in properties:
                    claim_property = claims.get(prop, None)
                    if claim_property:
                        for cp in claim_property:
                            print(prop, cp['mainsnak']['datavalue']['value']['id'])

                entry_sites = obj["sitelinks"]
                for site in sites:
                    site_value = entry_sites.get(site, None)
                    if site_value:
                        print(site, ":", site_value['title'])

                labels = obj["labels"]
                if labels:
                    for lang in languages:
                        lang_label = labels.get(lang, None)
                        if lang_label:
                            print("label (" + lang + "):", lang_label["value"])

                descriptions = obj["descriptions"]
                if descriptions:
                    for lang in languages:
                        lang_descr = descriptions.get(lang, None)
                        if lang_descr:
                            print("description (" + lang + "):", lang_descr["value"])

                aliases = obj["aliases"]
                if aliases:
                    for lang in languages:
                        lang_aliases = aliases.get(lang, None)
                        if lang_aliases:
                            for item in lang_aliases:
                                print("alias (" + lang + "):", item["value"])

                print()

            line = file.readline()
            cnt += 1


def _read_wikipedia():
    """ Read the XML wikipedia data """
    # TODO remove hardcoded path

    # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file:
    with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file:
        line = file.readline()
        cnt = 1
        article_text = ""
        article_title = None
        article_id = None
        reading_text = False
        while line and cnt < 10000:
            clean_line = line.strip().decode("utf-8")

            # Start reading new page
            if clean_line == "<page>":
                article_text = ""
                article_title = None
                article_id = 342

            # finished reading this page
            elif clean_line == "</page>":
                if article_id:
                    _store_wp_article(article_id, article_title, article_text.strip())

            # start reading text within a page
            if "<text" in clean_line:
                reading_text = True

            if reading_text:
                article_text += " " + clean_line

            # stop reading text within a page
            if "</text" in clean_line:
                reading_text = False

            # read the ID of this article
            ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)
            if ids:
                article_id = ids[0]

            # read the title of this article
            titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)
            if titles:
                article_title = titles[0].strip()

            line = file.readline()
            cnt += 1


def _store_wp_article(article_id, article_title, article_text):
    print("WP article", article_id, ":", article_title)
    print(article_text)
    print(_get_clean_wp_text(article_text))
    print()


def _get_clean_wp_text(article_text):
    # remove category statements
    clean_text = re.sub(r'\[\[Category:.*\]\]', '', article_text)

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = re.sub(r'{[^{]*?}', '', clean_text)  # non-greedy match
        print(clean_text)
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    return clean_text
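

# NOTE: the helper below is an illustrative sketch, not part of the original
# pipeline. It shows how entities and aliases could be registered in the
# KnowledgeBase, following the call signatures sketched in the comments of
# create_kb() above (a development-time API that may differ in released spaCy
# versions). The QIDs, surface forms and probabilities are made-up placeholders.
def _add_dummy_entities(kb):
    # register candidate entities, each with a prior probability (placeholder values)
    for entity_id, prob in [("Q42", 0.2), ("Q5", 0.1), ("Q3107329", 0.05)]:
        kb.add_entity(entity=entity_id, prob=prob)

    # map surface forms (aliases) to candidate entities and their probabilities
    kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
    kb.add_alias(alias="Douglas", entities=["Q42", "Q5"], probabilities=[0.6, 0.1])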


def add_el(kb, nlp):
    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is called Arthur Dent."
    doc = nlp(text)

    print()
    for token in doc:
        print("token", token.text, token.ent_type_, token.ent_kb_id_)

    print()
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)


if __name__ == "__main__":
    nlp = spacy.load('en_core_web_sm')
    my_kb = create_kb(nlp.vocab)
    # add_el(my_kb, nlp)