# Mirror of https://github.com/explosion/spaCy.git (synced 2024-09-22 20:09:18 +03:00).
# File listing metadata: 193 lines, 6.1 KiB, Python.
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
|
|
"""
|
|
import re
|
|
import json
|
|
import spacy
|
|
import bz2
|
|
from spacy.kb import KnowledgeBase
|
|
|
|
|
|
def create_kb(vocab):
    """Create a knowledge base on top of `vocab`, print its size and return it.

    The Wikipedia reader is run for its printed output only: the calls that
    would add entities and aliases are still commented out, so nothing that
    is read ends up in the returned knowledge base.
    """
    knowledge_base = KnowledgeBase(vocab=vocab)

    # the WikiData reader is switched off for now:
    # _read_wikidata()
    _read_wikipedia()

    # still to be wired in -- adding entities:
    # kb.add_entity(entity=entity, prob=prob)

    # still to be wired in -- adding aliases:
    # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])

    print()
    total_size = len(knowledge_base)
    entity_count = knowledge_base.get_size_entities()
    alias_count = knowledge_base.get_size_aliases()
    print("kb size:", total_size, entity_count, alias_count)

    return knowledge_base
|
|
|
|
|
|
def _read_wikidata(path=None, max_lines=9):
    """Read the JSON WikiData dump and print a summary of each entity.

    Every line of the bz2-compressed file is parsed as one JSON object after
    a trailing comma has been stripped; lines of at most one character
    (blank lines, a lone "[" or "]") are skipped.

    path: location of the bz2 dump. Defaults to the hardcoded local path.
    max_lines: number of lines to read from the start of the file,
        None to read the whole file.
    """
    # TODO remove hardcoded path
    if path is None:
        path = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'

    # tuples rather than sets, so the output order is the same on every run
    languages = ('en', 'de')
    properties = ('P31',)
    sites = ('enwiki',)

    with bz2.open(path, mode='rb') as file:
        for line_number, line in enumerate(file, start=1):
            if max_lines is not None and line_number > max_lines:
                break

            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) <= 1:
                continue

            obj = json.loads(clean_line)
            print("ID:", obj["id"])
            print("type:", obj["type"])

            # Below, `obj.get(...) or {}` treats a section that is missing,
            # null or empty the same way: there is nothing to print for it.

            # TODO: filter on rank: preferred, normal or deprecated
            claims = obj.get("claims") or {}
            for prop in properties:
                for cp in claims.get(prop) or []:
                    # a snak without a 'datavalue' (a "novalue" / "somevalue"
                    # snak) has no target entity to print
                    datavalue = cp['mainsnak'].get('datavalue')
                    if datavalue:
                        print(prop, datavalue['value']['id'])

            entry_sites = obj.get("sitelinks") or {}
            for site in sites:
                # not every entity has a page on every site
                site_value = entry_sites.get(site)
                if site_value:
                    print(site, ":", site_value['title'])

            for section, tag in (("labels", "label"), ("descriptions", "description")):
                section_values = obj.get(section) or {}
                for lang in languages:
                    lang_value = section_values.get(lang)
                    if lang_value:
                        print(tag + " (" + lang + "):", lang_value["value"])

            aliases = obj.get("aliases") or {}
            for lang in languages:
                for item in aliases.get(lang) or []:
                    print("alias (" + lang + "):", item["value"])

            print()
|
|
|
|
|
|
def _read_wikipedia(path=None, max_lines=9999):
    """Read the XML Wikipedia dump and pass every finished page to _store_wp_article.

    The bz2-compressed file is scanned line by line. For each <page> element
    the title, the first <id> and the lines from the opening <text ...> tag
    up to the closing </text> tag are collected; at </page> they are handed
    to _store_wp_article, provided an ID was found.

    path: location of the bz2 dump. Defaults to the hardcoded local path.
    max_lines: number of lines to read from the start of the file,
        None to read the whole file.
    """
    # TODO remove hardcoded path
    if path is None:
        # index file that was tried earlier:
        # 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
        path = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'

    id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
    title_regex = re.compile(r"(?<=<title>).*(?=</title>)")

    with bz2.open(path, mode='rb') as file:
        text_lines = []
        article_title = None
        article_id = None
        reading_text = False

        for line_number, line in enumerate(file, start=1):
            if max_lines is not None and line_number > max_lines:
                break

            clean_line = line.strip().decode("utf-8")

            # Start reading new page
            if clean_line == "<page>":
                text_lines = []
                article_title = None
                article_id = None
                # never carry an unterminated text section into the next page
                reading_text = False

            # finished reading this page
            elif clean_line == "</page>":
                if article_id:
                    _store_wp_article(article_id, article_title, " ".join(text_lines).strip())

            # start reading text within a page
            if "<text" in clean_line:
                reading_text = True

            if reading_text:
                text_lines.append(clean_line)

            # stop reading text within a page: either the closing tag was seen,
            # or the element is an empty, self-closing <text ... />
            if "</text" in clean_line or (clean_line.startswith("<text") and clean_line.endswith("/>")):
                reading_text = False

            # read the ID of this article: only the first <id> of a page counts,
            # later ones on the same page (revision, contributor) are ignored
            if not article_id:
                ids = id_regex.findall(clean_line)
                if ids:
                    article_id = ids[0]

            # read the title of this article
            titles = title_regex.findall(clean_line)
            if titles:
                article_title = titles[0].strip()
|
|
|
|
|
|
def _store_wp_article(article_id, article_title, article_text):
    """Print one article: a header line, the raw text, the cleaned text and a blank line."""
    header = ("WP article", article_id, ":", article_title)
    print(*header)
    print(article_text)
    # cleaning happens only after the raw text is printed, so output order is kept
    cleaned_text = _get_clean_wp_text(article_text)
    print(cleaned_text)
    print()
|
|
|
|
|
|
def _get_clean_wp_text(article_text):
    """Remove category links and brace-delimited markup from Wikipedia text.

    article_text: the article markup as one string.
    Returns the text without any [[Category:...]] link and without any
    {...} block, nested blocks included. Everything else is left untouched.
    """
    # remove category statements; the non-greedy match ends at the first "]]",
    # so text between two category links on the same line is kept
    clean_text = re.sub(r'\[\[Category:.*?\]\]', '', article_text)

    # remove nested {{info}} statements by removing the inner/smallest ones first
    # and iterating until a pass finds nothing left to remove
    innermost_braces = re.compile(r'{[^{]*?}')  # non-greedy match
    removed = 1
    while removed:
        clean_text, removed = innermost_braces.subn('', clean_text)

    return clean_text
|
|
|
|
|
|
def add_el(kb, nlp):
    """Add an entity linker configured with `kb` to `nlp` and run it on a sample text.

    The linker is appended as the last pipe. The sample text is then
    processed and, for every token and every entity of the resulting doc,
    the text, the entity type/label and the knowledge base ID are printed.
    """
    entity_linker = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(entity_linker, last=True)

    sample_text = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is called Arthur Dent."
    )
    doc = nlp(sample_text)

    # per-token view
    print()
    for tok in doc:
        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)

    # per-entity view
    print()
    for span in doc.ents:
        print("ent", span.text, span.label_, span.kb_id_)
|
|
|
|
|
|
if __name__ == "__main__":
    # The vocab of the loaded English model is what the knowledge base is built on.
    nlp = spacy.load('en_core_web_sm')
    my_kb = create_kb(nlp.vocab)
    # The entity linking step is not run yet:
    # add_el(my_kb, nlp)
|