# spaCy/bin/wiki_entity_linking/wikidata_processor.py

# coding: utf-8
from __future__ import unicode_literals

import gzip
import json
import logging
import datetime

logger = logging.getLogger(__name__)


def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descriptions=True):
    """Read a gzip-compressed WikiData JSON dump and collect entity lookups.

    The dump is read line by line. Every line is expected to hold one JSON
    entity, optionally followed by a trailing comma; lines with at most one
    character left after stripping (e.g. a lone bracket) are skipped. Only
    entities whose "type" is "item" are processed.

    Dumps are published at https://dumps.wikimedia.org/wikidatawiki/entities/
    NOTE: the file is opened with gzip, so a gzip-compressed dump is required
    (the comment this docstring replaces pointed at latest-all.json.bz2).
    The original authors reported roughly 7h30 to parse 55M lines.

    wikidata_file: location of the gzip-compressed dump, handed to gzip.open.
    limit: stop once this many lines have been read. A falsy value
        (None or 0) reads the whole file.
    to_print: if True, print the parsed fields of every kept entity.
    lang: language code. Selects the "<lang>wiki" sitelink and the language
        of the descriptions.
    parse_descriptions: if True, collect descriptions of linked entities.
    RETURNS: a (title_to_id, id_to_descr) tuple. title_to_id maps the title of
        the "<lang>wiki" sitelink to the entity ID. id_to_descr maps the
        entity ID to its description in `lang`, and is only filled for
        entities that have such a sitelink.
    """
    site_filter = '{}wiki'.format(lang)

    # properties filter (currently disabled to get ALL data)
    # example: {'P31': {'Q5', 'Q15632617'}} - defined as OR: one property suffices to be selected
    prop_filter = dict()

    title_to_id = dict()
    id_to_descr = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_aliases = False
    parse_claims = False

    with gzip.open(wikidata_file, mode='rb') as file:
        for cnt, line in enumerate(file):
            if limit and cnt >= limit:
                break
            if cnt % 500000 == 0:
                logger.info("processed %s lines of WikiData dump", cnt)

            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) <= 1:
                # nothing that could be a JSON entity is left on this line
                continue

            obj = json.loads(clean_line)
            entry_type = obj["type"]
            if entry_type != "item":
                continue

            # filtering records on their properties (currently disabled to get ALL data):
            # keep starts out True, so the claims filter below can not reject anything.
            keep = True

            # Optional sections are read leniently: a missing key or an empty
            # non-dict placeholder (e.g. []) is treated as "no data" instead of
            # aborting the whole run.
            claims = obj.get("claims") or {}
            if parse_claims:
                for prop, value_set in prop_filter.items():
                    for cp in claims.get(prop) or ():
                        cp_value = cp["mainsnak"].get("datavalue", {}).get("value")
                        # only dict-valued claims can carry an entity "id"
                        cp_id = cp_value.get("id") if isinstance(cp_value, dict) else None
                        if cp["rank"] != "deprecated" and cp_id in value_set:
                            keep = True

            if not keep:
                continue

            unique_id = obj["id"]

            if to_print:
                print("ID:", unique_id)
                print("type:", entry_type)

            # parsing all properties that refer to other entities
            if parse_properties:
                for prop, claim_property in claims.items():
                    cp_dicts = [
                        cp["mainsnak"]["datavalue"].get("value")
                        for cp in claim_property
                        if cp["mainsnak"].get("datavalue")
                    ]
                    cp_values = [
                        cp_dict.get("id")
                        for cp_dict in cp_dicts
                        if isinstance(cp_dict, dict)
                        if cp_dict.get("id") is not None
                    ]
                    if cp_values and to_print:
                        print("prop:", prop, cp_values)

            found_link = False
            if parse_sitelinks:
                site_value = (obj.get("sitelinks") or {}).get(site_filter)
                if site_value:
                    site = site_value["title"]
                    if to_print:
                        print(site_filter, ":", site)
                    title_to_id[site] = unique_id
                    found_link = True

            if parse_labels:
                lang_label = (obj.get("labels") or {}).get(lang)
                if lang_label and to_print:
                    print("label (" + lang + "):", lang_label["value"])

            # descriptions are only kept for entities that were linked above
            if found_link and parse_descriptions:
                lang_descr = (obj.get("descriptions") or {}).get(lang)
                if lang_descr:
                    if to_print:
                        print(
                            "description (" + lang + "):",
                            lang_descr["value"],
                        )
                    id_to_descr[unique_id] = lang_descr["value"]

            if parse_aliases:
                for alias in (obj.get("aliases") or {}).get(lang) or ():
                    if to_print:
                        print("alias (" + lang + "):", alias["value"])

            if to_print:
                print()

    return title_to_id, id_to_descr


def write_entity_files(entity_def_output, title_to_id):
    """Dump the title-to-ID mapping as a pipe-delimited text file.

    The first line is the header "WP_title|WD_id"; every following line holds
    one key of `title_to_id` and the string form of its value.

    entity_def_output: object offering open(mode, encoding=...), e.g. a
        pathlib.Path; it is opened for writing as utf8.
    title_to_id: dict whose keys are written as-is and whose values are
        converted with str().
    """
    sep = "|"
    with entity_def_output.open("w", encoding="utf8") as out:
        out.write(sep.join(("WP_title", "WD_id")) + "\n")
        out.writelines(
            sep.join((wp_title, str(wd_id))) + "\n"
            for wp_title, wd_id in title_to_id.items()
        )


def write_entity_description_files(entity_descr_output, id_to_descr):
    """Dump the ID-to-description mapping as a pipe-delimited text file.

    The first line is the header "WD_id|description"; every following line
    holds the string form of one key of `id_to_descr` and its value.

    entity_descr_output: object offering open(mode, encoding=...), e.g. a
        pathlib.Path; it is opened for writing as utf8.
    id_to_descr: dict whose keys are converted with str() and whose values
        are written as-is.
    """
    header = "|".join(("WD_id", "description")) + "\n"
    with entity_descr_output.open("w", encoding="utf8") as sink:
        emit = sink.write
        emit(header)
        for wd_id, text in id_to_descr.items():
            emit("|".join((str(wd_id), text)) + "\n")
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00			`import gzip`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`import json`
Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00			`import logging`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`import datetime`

Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00			`logger = logging.getLogger(__name__)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00
			`def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descriptions=True):`
small tweaks and documentation 2019-06-18 19:38:09 +03:00			`# Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines.`
further code cleanup 2019-06-19 10:15:43 +03:00			`# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00			`site_filter = '{}wiki'.format(lang)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
further code cleanup 2019-06-19 10:15:43 +03:00			`# properties filter (currently disabled to get ALL data)`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`prop_filter = dict()`
			`# prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected`

refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`title_to_id = dict()`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`id_to_descr = dict()`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`# parse appropriate fields - depending on what we need in the KB`
			`parse_properties = False`
			`parse_sitelinks = True`
			`parse_labels = False`
			`parse_aliases = False`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`parse_claims = False`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00			`with gzip.open(wikidata_file, mode='rb') as file:`
			`for cnt, line in enumerate(file):`
			`if limit and cnt >= limit:`
			`break`
			`if cnt % 500000 == 0:`
			`logger.info("processed {} lines of WikiData dump".format(cnt))`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`clean_line = line.strip()`
			`if clean_line.endswith(b","):`
			`clean_line = clean_line[:-1]`
			`if len(clean_line) > 1:`
			`obj = json.loads(clean_line)`
			`entry_type = obj["type"]`

			`if entry_type == "item":`
performance per entity type 2019-06-14 20:55:46 +03:00			`# filtering records on their properties (currently disabled to get ALL data)`
			`# keep = False`
			`keep = True`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`claims = obj["claims"]`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`if parse_claims:`
			`for prop, value_set in prop_filter.items():`
			`claim_property = claims.get(prop, None)`
			`if claim_property:`
			`for cp in claim_property:`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`cp_id = (`
			`cp["mainsnak"]`
			`.get("datavalue", {})`
			`.get("value", {})`
			`.get("id")`
			`)`
			`cp_rank = cp["rank"]`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`if cp_rank != "deprecated" and cp_id in value_set:`
			`keep = True`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`if keep:`
			`unique_id = obj["id"]`

			`if to_print:`
			`print("ID:", unique_id)`
			`print("type:", entry_type)`

			`# parsing all properties that refer to other entities`
			`if parse_properties:`
			`for prop, claim_property in claims.items():`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`cp_dicts = [`
			`cp["mainsnak"]["datavalue"].get("value")`
			`for cp in claim_property`
			`if cp["mainsnak"].get("datavalue")`
			`]`
			`cp_values = [`
			`cp_dict.get("id")`
			`for cp_dict in cp_dicts`
			`if isinstance(cp_dict, dict)`
			`if cp_dict.get("id") is not None`
			`]`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`if cp_values:`
			`if to_print:`
			`print("prop:", prop, cp_values)`

performance per entity type 2019-06-14 20:55:46 +03:00			`found_link = False`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`if parse_sitelinks:`
			`site_value = obj["sitelinks"].get(site_filter, None)`
			`if site_value:`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`site = site_value["title"]`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`if to_print:`
			`print(site_filter, ":", site)`
			`title_to_id[site] = unique_id`
performance per entity type 2019-06-14 20:55:46 +03:00			`found_link = True`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`if parse_labels:`
			`labels = obj["labels"]`
			`if labels:`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`lang_label = labels.get(lang, None)`
			`if lang_label:`
			`if to_print:`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`print(`
			`"label (" + lang + "):", lang_label["value"]`
			`)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
performance per entity type 2019-06-14 20:55:46 +03:00			`if found_link and parse_descriptions:`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`descriptions = obj["descriptions"]`
			`if descriptions:`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`lang_descr = descriptions.get(lang, None)`
			`if lang_descr:`
			`if to_print:`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`print(`
			`"description (" + lang + "):",`
			`lang_descr["value"],`
			`)`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`id_to_descr[unique_id] = lang_descr["value"]`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`if parse_aliases:`
			`aliases = obj["aliases"]`
			`if aliases:`
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`lang_aliases = aliases.get(lang, None)`
			`if lang_aliases:`
			`for item in lang_aliases:`
			`if to_print:`
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script 2019-08-13 16:38:59 +03:00			`print(`
			`"alias (" + lang + "):", item["value"]`
			`)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`if to_print:`
			`print()`

using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00			`return title_to_id, id_to_descr`
Changes to wiki_entity_linker (#4235) * Changes to wiki_entity_linker * No more f-strings * Make some requested changes * Add back option to get descriptions from wd not wp * Fix logs * Address comments and clean evaluation * Remove type hints * Refactor evaluation, add back metrics by label * Address comments * Log training performance as well as dev 2019-09-13 18:03:57 +03:00

			`def write_entity_files(entity_def_output, title_to_id):`
			`with entity_def_output.open("w", encoding="utf8") as id_file:`
			`id_file.write("WP_title" + "\|" + "WD_id" + "\n")`
			`for title, qid in title_to_id.items():`
			`id_file.write(title + "\|" + str(qid) + "\n")`


			`def write_entity_description_files(entity_descr_output, id_to_descr):`
			`with entity_descr_output.open("w", encoding="utf8") as descr_file:`
			`descr_file.write("WD_id" + "\|" + "description" + "\n")`
			`for qid, descr in id_to_descr.items():`
			`descr_file.write(str(qid) + "\|" + descr + "\n")`