# spaCy/examples/pipeline/wikidata_entity_linking.py

# coding: utf-8
from __future__ import unicode_literals

"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""
import re
import json
import spacy
import bz2
from spacy.kb import KnowledgeBase


def create_kb(vocab):
    """ Create a knowledge base for the given vocab and report its size.

    The Wikipedia reader is run for its printed output only: no entities or
    aliases are added to the knowledge base yet, the calls below are placeholders.

    vocab: passed to the KnowledgeBase constructor.
    Returns the newly created KnowledgeBase.
    """
    knowledge_base = KnowledgeBase(vocab=vocab)

    # the WikiData reader is currently switched off
    # _read_wikidata()
    _read_wikipedia()

    # placeholder: adding entities
    # knowledge_base.add_entity(entity=entity, prob=prob)

    # placeholder: adding aliases
    # knowledge_base.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])

    print()
    total = len(knowledge_base)
    entity_count = knowledge_base.get_size_entities()
    alias_count = knowledge_base.get_size_aliases()
    print("kb size:", total, entity_count, alias_count)

    return knowledge_base


def _read_wikidata(
    path='C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2',
    limit=10,
):
    """ Read the JSON wiki data and print a few selected fields per entity.

    The file is read line by line. A trailing comma is stripped from each line and
    lines of at most one character (such as the enclosing "[" and "]") are skipped;
    every other line is parsed as one JSON entity.

    Printed per entity: its ID and type, the target IDs of its P31 claims, its
    enwiki page title and its English/German labels, descriptions and aliases.
    Fields that are absent for an entity are skipped silently.

    path: location of the bz2-compressed dump (defaults to the original hardcoded location).
    limit: reading stops as soon as line number `limit` is reached, so at most
        `limit - 1` lines are read. None reads the whole file.
    """
    # tuples rather than sets, so that the output order is the same on every run
    languages = ('en', 'de')
    properties = ('P31',)
    sites = ('enwiki',)

    with bz2.open(path, mode='rb') as file:
        for cnt, line in enumerate(file, start=1):
            if limit is not None and cnt >= limit:
                break

            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) <= 1:
                continue

            obj = json.loads(clean_line)

            print("ID:", obj["id"])
            print("type:", obj["type"])

            # `or {}` also covers a missing key and an empty map serialised as []

            # TODO: filter on rank:  preferred, normal or deprecated
            claims = obj.get("claims") or {}
            for prop in properties:
                for cp in claims.get(prop) or []:
                    # a snak without a value carries no 'datavalue' entry
                    datavalue = cp['mainsnak'].get('datavalue')
                    if datavalue:
                        print(prop, datavalue['value']['id'])

            entry_sites = obj.get("sitelinks") or {}
            for site in sites:
                # not every entity has a page on every site
                site_value = entry_sites.get(site)
                if site_value:
                    print(site, ":", site_value['title'])

            labels = obj.get("labels") or {}
            for lang in languages:
                lang_label = labels.get(lang)
                if lang_label:
                    print("label (" + lang + "):", lang_label["value"])

            descriptions = obj.get("descriptions") or {}
            for lang in languages:
                lang_descr = descriptions.get(lang)
                if lang_descr:
                    print("description (" + lang + "):", lang_descr["value"])

            aliases = obj.get("aliases") or {}
            for lang in languages:
                for item in aliases.get(lang) or []:
                    print("alias (" + lang + "):", item["value"])

            print()


def _read_wikipedia(
    path='C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2',
    limit=10000,
):
    """ Read the XML wikipedia data and hand every completed page to _store_wp_article.

    The dump is scanned line by line without an XML parser: a line that is exactly
    "<page>" starts a new article, a line that is exactly "</page>" finishes it, and
    all lines from the one containing "<text" up to and including the one containing
    "</text" are joined with single spaces to form the article text.

    path: location of the bz2-compressed dump (defaults to the original hardcoded location).
    limit: reading stops as soon as line number `limit` is reached, so at most
        `limit - 1` lines are read. None reads the whole file.
    """
    id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
    title_regex = re.compile(r"(?<=<title>).*(?=</title>)")

    text_lines = []
    article_title = None
    article_id = None
    reading_text = False

    with bz2.open(path, mode='rb') as file:
        for cnt, line in enumerate(file, start=1):
            if limit is not None and cnt >= limit:
                break

            clean_line = line.strip().decode("utf-8")

            # Start reading new page: forget everything about the previous one
            if clean_line == "<page>":
                text_lines = []
                article_title = None
                article_id = None
                reading_text = False

            # finished reading this page
            elif clean_line == "</page>":
                if article_id:
                    _store_wp_article(article_id, article_title, " ".join(text_lines).strip())

            # start reading text within a page
            if "<text" in clean_line:
                reading_text = True

            if reading_text:
                text_lines.append(clean_line)

            # stop reading text within a page (checked last so a one-line <text>...</text> is kept)
            if "</text" in clean_line:
                reading_text = False

            # read the ID of this article: only the first <id> of a page is the page ID,
            # later ones (e.g. the revision's) must not overwrite it
            if article_id is None:
                ids = id_regex.findall(clean_line)
                if ids:
                    article_id = ids[0]

            # read the title of this article
            titles = title_regex.findall(clean_line)
            if titles:
                article_title = titles[0].strip()


def _store_wp_article(article_id, article_title, article_text):
    """ Print one article: a header line, the raw text, the cleaned text and a blank line.

    Nothing is persisted yet; the cleaned text comes from _get_clean_wp_text.
    """
    print("WP article", article_id, ":", article_title)
    print(article_text)
    # cleaning is done only after the raw text has been printed
    cleaned = _get_clean_wp_text(article_text)
    print(cleaned)
    print()


def _get_clean_wp_text(article_text):
    """ Remove category links and (nested) {...} markup from the text of an article.

    article_text: the raw article text as a single string.
    Returns the cleaned text; all other markup is left in place.
    """
    # remove category statements; non-greedy, so each match ends at its own closing ]]
    # and the text between two category links is kept
    clean_text = re.sub(r'\[\[Category:.*?\]\]', '', article_text)

    # remove nested {{info}} statements by removing the inner/smallest ones first and
    # iterating until a pass no longer removes anything
    removed = 1
    while removed:
        clean_text, removed = re.subn(r'\{[^{]*?\}', '', clean_text)  # non-greedy match

    return clean_text


def add_el(kb, nlp):
    """ Append an 'entity_linker' pipe backed by `kb` to `nlp` and print what it yields on a sample text.

    kb: handed to the new pipe through its config.
    nlp: the pipeline that is extended in place and then applied to the sample text.
    """
    entity_linker = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(entity_linker, last=True)

    sample = (
        "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, "
        "Douglas reminds us to always bring our towel. "
        "The main character in Doug's novel is called Arthur Dent."
    )
    parsed = nlp(sample)

    # per token: text, entity type and knowledge base ID
    print()
    for tok in parsed:
        print("token", tok.text, tok.ent_type_, tok.ent_kb_id_)

    # per entity: text, label and knowledge base ID
    print()
    for span in parsed.ents:
        print("ent", span.text, span.label_, span.kb_id_)


if __name__ == "__main__":
    # load the 'en_core_web_sm' pipeline; only its vocab is used to create the knowledge base
    nlp = spacy.load('en_core_web_sm')
    my_kb = create_kb(nlp.vocab)
    # running the entity linker on the sample text is currently switched off
    # add_el(my_kb, nlp)
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

			`"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.`
			`"""`
wikipedia dump parser and mediawiki format regex cleanup 2019-04-14 22:52:01 +03:00			`import re`
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00			`import json`
			`import spacy`
			`import bz2`
			`from spacy.kb import KnowledgeBase`


			`def create_kb(vocab):`
			`kb = KnowledgeBase(vocab=vocab)`
wikipedia dump parser and mediawiki format regex cleanup 2019-04-14 22:52:01 +03:00			`# _read_wikidata()`
			`_read_wikipedia()`
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00
			`# adding entities`
			`# kb.add_entity(entity=entity, prob=prob)`

			`# adding aliases`
			`# kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2])`

			`print()`
			`print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())`

			`return kb`


			`def _read_wikidata():`
			`""" Read the JSON wiki data """`
			`# TODO remove hardcoded path`

			`languages = {'en', 'de'}`
reading types, claims and sitelinks 2019-04-11 22:42:44 +03:00			`properties = {'P31'}`
			`sites = {'enwiki'}`
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00
			`with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file:`
			`line = file.readline()`
			`cnt = 1`
			`while line and cnt < 10:`
			`clean_line = line.strip()`
			`if clean_line.endswith(b","):`
			`clean_line = clean_line[:-1]`
			`if len(clean_line) > 1:`
			`obj = json.loads(clean_line)`
reading types, claims and sitelinks 2019-04-11 22:42:44 +03:00
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00			`unique_id = obj["id"]`
reading types, claims and sitelinks 2019-04-11 22:42:44 +03:00			`print("ID:", unique_id)`

			`entry_type = obj["type"]`
			`print("type:", entry_type)`

			`# TODO: filter on rank: preferred, normal or deprecated`
			`claims = obj["claims"]`
			`for prop in properties:`
			`claim_property = claims.get(prop, None)`
			`if claim_property:`
			`for cp in claim_property:`
			`print(prop, cp['mainsnak']['datavalue']['value']['id'])`

			`entry_sites = obj["sitelinks"]`
			`for site in sites:`
			`site_value = entry_sites.get(site, None)`
			`print(site, ":", site_value['title'])`
reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00
			`labels = obj["labels"]`
			`if labels:`
			`for lang in languages:`
			`lang_label = labels.get(lang, None)`
			`if lang_label:`
			`print("label (" + lang + "):", lang_label["value"])`

			`descriptions = obj["descriptions"]`
			`if descriptions:`
			`for lang in languages:`
			`lang_descr = descriptions.get(lang, None)`
			`if lang_descr:`
			`print("description (" + lang + "):", lang_descr["value"])`

			`aliases = obj["aliases"]`
			`if aliases:`
			`for lang in languages:`
			`lang_aliases = aliases.get(lang, None)`
			`if lang_aliases:`
			`for item in lang_aliases:`
			`print("alias (" + lang + "):", item["value"])`

			`print()`
			`line = file.readline()`
			`cnt += 1`


wikipedia dump parser and mediawiki format regex cleanup 2019-04-14 22:52:01 +03:00			`def _read_wikipedia():`
			`""" Read the XML wikipedia data """`
			`# TODO remove hardcoded path`

			`# with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file:`
			`with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file:`
			`line = file.readline()`
			`cnt = 1`
			`article_text = ""`
			`article_title = None`
			`article_id = None`
			`reading_text = False`
			`while line and cnt < 10000:`
			`clean_line = line.strip().decode("utf-8")`

			`# Start reading new page`
			`if clean_line == "<page>":`
			`article_text = ""`
			`article_title = None`
			`article_id = 342`

			`# finished reading this page`
			`elif clean_line == "</page>":`
			`if article_id:`
			`_store_wp_article(article_id, article_title, article_text.strip())`

			`# start reading text within a page`
			`if "<text" in clean_line:`
			`reading_text = True`

			`if reading_text:`
			`article_text += " " + clean_line`

			`# stop reading text within a page`
			`if "</text" in clean_line:`
			`reading_text = False`

			`# read the ID of this article`
			`ids = re.findall(r"(?<=<id>)\d*(?=</id>)", clean_line)`
			`if ids:`
			`article_id = ids[0]`

			`# read the title of this article`
			`titles = re.findall(r"(?<=<title>).*(?=</title>)", clean_line)`
			`if titles:`
			`article_title = titles[0].strip()`

			`line = file.readline()`
			`cnt += 1`


			`def _store_wp_article(article_id, article_title, article_text):`
			`print("WP article", article_id, ":", article_title)`
			`print(article_text)`
			`print(_get_clean_wp_text(article_text))`
			`print()`


			`def _get_clean_wp_text(article_text):`
			`# remove category statements`
			`clean_text = re.sub('\[\[Category:.*\]\]', '', article_text)`

			`# remove nested {{info}} statements by removing the inner/smallest ones first and iterating`
			`try_again = True`
			`previous_length = len(clean_text)`
			`while try_again:`
			`clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match`
			`print(clean_text)`
			`if len(clean_text) < previous_length:`
			`try_again = True`
			`else:`
			`try_again = False`
			`previous_length = len(clean_text)`

			`return clean_text`


reading wikidata descriptions and aliases 2019-04-11 22:08:22 +03:00			`def add_el(kb, nlp):`
			`el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})`
			`nlp.add_pipe(el_pipe, last=True)`

			`text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \`
			`"Douglas reminds us to always bring our towel. " \`
			`"The main character in Doug's novel is called Arthur Dent."`
			`doc = nlp(text)`

			`print()`
			`for token in doc:`
			`print("token", token.text, token.ent_type_, token.ent_kb_id_)`

			`print()`
			`for ent in doc.ents:`
			`print("ent", ent.text, ent.label_, ent.kb_id_)`


			`if __name__ == "__main__":`
			`nlp = spacy.load('en_core_web_sm')`
			`my_kb = create_kb(nlp.vocab)`
			`# add_el(my_kb, nlp)`