# coding: utf-8
from __future__ import unicode_literals
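
# Helper functions for building a spaCy KnowledgeBase from Wikidata entity
# definitions and descriptions, together with Wikipedia-derived entity
# frequencies and alias prior probabilities.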

import csv
import datetime

from spacy import Errors
from spacy.kb import KnowledgeBase

from bin.wiki_entity_linking.train_descriptions import EntityEncoder
from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp


def create_kb(
    nlp,
    max_entities_per_alias,
    min_entity_freq,
    min_occ,
    entity_def_output,
    entity_descr_output,
    count_input,
    prior_prob_input,
    wikidata_input,
    entity_vector_length,
    limit=None,
    read_raw_data=True,
):
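    """Build the knowledge base: read (or reuse preprocessed) Wikidata entity
    definitions and descriptions, filter entities by frequency, train an entity
    encoder on the descriptions, and add the entities and their aliases with
    prior probabilities to the KB."""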
    # Create the knowledge base from Wikidata entries
    kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)

    # check the length of the nlp vectors
    if "vectors" in nlp.meta and nlp.vocab.vectors.size:
        input_dim = nlp.vocab.vectors_length
        print("Loaded pre-trained vectors of size %s" % input_dim)
    else:
        raise ValueError(Errors.E155)

    # disable this part of the pipeline when rerunning the KB generation from preprocessed files
    if read_raw_data:
        print()
        print(now(), " * read wikidata entities:")
        title_to_id, id_to_descr = wd.read_wikidata_entities_json(
            wikidata_input, limit=limit
        )

        # write the title-ID and ID-description mappings to file
        _write_entity_files(
            entity_def_output, entity_descr_output, title_to_id, id_to_descr
        )
    else:
        # read the mappings from file
        title_to_id = get_entity_to_id(entity_def_output)
        id_to_descr = get_id_to_description(entity_descr_output)

    print()
    print(now(), " * get entity frequencies:")
    print()
    entity_frequencies = wp.get_all_frequencies(count_input=count_input)

    # filter the entities in the KB by frequency, because there's just too much data (8M entities) otherwise
    filtered_title_to_id = dict()
    entity_list = []
    description_list = []
    frequency_list = []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq > min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity

    print(len(title_to_id.keys()), "original titles")
    kept_nr = len(filtered_title_to_id.keys())
    print("kept", kept_nr, "entities with min. frequency", min_entity_freq)

    print()
    print(now(), " * train entity encoder:")
    print()
    encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
    encoder.train(description_list=description_list, to_print=True)

    print()
    print(now(), " * get entity embeddings:")
    print()
    embeddings = encoder.apply_encoder(description_list)

    print(now(), " * adding", len(entity_list), "entities")
    kb.set_entities(
        entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
    )

    alias_cnt = _add_aliases(
        kb,
        title_to_id=filtered_title_to_id,
        max_entities_per_alias=max_entities_per_alias,
        min_occ=min_occ,
        prior_prob_input=prior_prob_input,
    )
    print()
    print(now(), " * adding", alias_cnt, "aliases")
    print()

    print()
    print("# of entities in kb:", kb.get_size_entities())
    print("# of aliases in kb:", kb.get_size_aliases())

    print(now(), "Done with kb")
    return kb


def _write_entity_files(
    entity_def_output, entity_descr_output, title_to_id, id_to_descr
):
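    """Write the Wikipedia title to Wikidata ID mapping and the ID to
    description mapping to pipe-separated files."""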
    with entity_def_output.open("w", encoding="utf8") as id_file:
        id_file.write("WP_title" + "|" + "WD_id" + "\n")
        for title, qid in title_to_id.items():
            id_file.write(title + "|" + str(qid) + "\n")

    with entity_descr_output.open("w", encoding="utf8") as descr_file:
        descr_file.write("WD_id" + "|" + "description" + "\n")
        for qid, descr in id_to_descr.items():
            descr_file.write(str(qid) + "|" + descr + "\n")


def get_entity_to_id(entity_def_output):
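    """Read the Wikipedia title to Wikidata ID mapping back from file."""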
    entity_to_id = dict()
    with entity_def_output.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_id[row[0]] = row[1]
    return entity_to_id


def get_id_to_description(entity_descr_output):
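    """Read the Wikidata ID to description mapping back from file."""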
    id_to_desc = dict()
    with entity_descr_output.open("r", encoding="utf8") as csvfile:
        csvreader = csv.reader(csvfile, delimiter="|")
        # skip header
        next(csvreader)
        for row in csvreader:
            id_to_desc[row[0]] = row[1]
    return id_to_desc


def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input):
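    """Add aliases and their prior probabilities to the KB, keeping at most
    max_entities_per_alias candidates per alias and ignoring entity counts
    below min_occ. Returns the number of aliases added."""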
    wp_titles = title_to_id.keys()
    cnt = 0

    # adding aliases with prior probabilities
    # we can read this file sequentially, it's sorted by alias, and then by count
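    # each line is pipe-separated: alias|count|entity title (header line first);
    # these column names are only descriptive, the parsing below relies purely
    # on the column positions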
    with prior_prob_input.open("r", encoding="utf8") as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        previous_alias = None
        total_count = 0
        counts = []
        entities = []
        while line:
            splits = line.replace("\n", "").split(sep="|")
            new_alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            if new_alias != previous_alias and previous_alias:
                # done reading the previous alias --> output
                if len(entities) > 0:
                    selected_entities = []
                    prior_probs = []
                    for ent_count, ent_string in zip(counts, entities):
                        if ent_string in wp_titles:
                            wd_id = title_to_id[ent_string]
                            p_entity_given_alias = ent_count / total_count
                            selected_entities.append(wd_id)
                            prior_probs.append(p_entity_given_alias)

                    if selected_entities:
                        try:
                            kb.add_alias(
                                alias=previous_alias,
                                entities=selected_entities,
                                probabilities=prior_probs,
                            )
                            cnt += 1
                        except ValueError as e:
                            print(e)
                total_count = 0
                counts = []
                entities = []

            total_count += count

            if len(entities) < max_entities_per_alias and count >= min_occ:
                counts.append(count)
                entities.append(entity)
            previous_alias = new_alias

            line = prior_file.readline()
    return cnt


def now():
    return datetime.datetime.now()
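

# Illustrative invocation (all values and paths below are hypothetical and would
# normally be supplied by the command-line entry point that drives this module):
#
#     from pathlib import Path
#     import spacy
#
#     nlp = spacy.load("en_core_web_lg")  # any model with pre-trained vectors
#     kb = create_kb(
#         nlp,
#         max_entities_per_alias=10,
#         min_entity_freq=20,
#         min_occ=5,
#         entity_def_output=Path("entity_defs.csv"),
#         entity_descr_output=Path("entity_descriptions.csv"),
#         count_input=Path("entity_freq.csv"),
#         prior_prob_input=Path("prior_prob.csv"),
#         wikidata_input=Path("wikidata-latest-all.json.bz2"),
#         entity_vector_length=64,
#         limit=None,
#         read_raw_data=True,
#     )
#     kb.dump(Path("kb"))  # persist the KB to disk with the spaCy v2 KnowledgeBase API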