fixes for prior prob and linking wikidata IDs with wikipedia titles

This commit is contained in:
svlandeg 2019-04-18 16:14:25 +02:00
parent 10ee8dfea2
commit 9f308eb5dc

View File

@ -38,10 +38,13 @@ map_alias_to_link = dict()
def create_kb(vocab, max_entities_per_alias, min_occ): def create_kb(vocab, max_entities_per_alias, min_occ):
kb = KnowledgeBase(vocab=vocab) kb = KnowledgeBase(vocab=vocab)
_add_entities(kb) id_to_title = _read_wikidata(limit=100, to_print=False)
_add_aliases(kb, max_entities_per_alias, min_occ) title_to_id = {v:k for k,v in id_to_title.items()}
# _read_wikidata() _add_entities(kb, entities=id_to_title.keys(), probs=[0.4 for x in id_to_title.keys()])
_add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ)
# TODO: read wikipedia texts for entity context
# _read_wikipedia() # _read_wikipedia()
print() print()
@ -50,20 +53,17 @@ def create_kb(vocab, max_entities_per_alias, min_occ):
return kb return kb
def _add_entities(kb): def _add_entities(kb, entities, probs):
for entity, prob in zip(entities, probs):
kb.add_entity(entity="Earthquake", prob=0.342) kb.add_entity(entity=entity, prob=prob)
kb.add_entity(entity="2010 haiti earthquake", prob=0.1)
kb.add_entity(entity="1906 san francisco earthquake", prob=0.1)
kb.add_entity(entity="2011 christchurch earthquak", prob=0.1)
kb.add_entity(entity="Soft drink", prob=0.342)
print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings())
def _add_aliases(kb, max_entities_per_alias, min_occ): def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ):
all_entities = kb.get_entity_strings() wp_titles = title_to_id.keys()
print("wp titles", wp_titles)
# adding aliases with prior probabilities # adding aliases with prior probabilities
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
# skip header # skip header
@ -86,13 +86,17 @@ def _add_aliases(kb, max_entities_per_alias, min_occ):
selected_entities = list() selected_entities = list()
prior_probs = list() prior_probs = list()
for ent_count, ent_string in zip(counts, entities): for ent_count, ent_string in zip(counts, entities):
if ent_string in all_entities: if ent_string in wp_titles:
wd_id = title_to_id[ent_string]
p_entity_givenalias = ent_count / total_count p_entity_givenalias = ent_count / total_count
selected_entities.append(ent_string) selected_entities.append(wd_id)
prior_probs.append(p_entity_givenalias) prior_probs.append(p_entity_givenalias)
if selected_entities: if selected_entities:
kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
print("analysed", previous_alias, "with entities", entities, "and counts", counts)
print("added", previous_alias, "with selected entities", selected_entities, "and probs", prior_probs)
print()
total_count = 0 total_count = 0
counts = list() counts = list()
entities = list() entities = list()
@ -110,48 +114,68 @@ def _add_aliases(kb, max_entities_per_alias, min_occ):
print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
def _read_wikidata(): def _read_wikidata(limit=None, to_print=False):
""" Read the JSON wiki data """ """ Read the JSON wiki data """
# TODO remove hardcoded path
languages = {'en', 'de'} languages = {'en', 'de'}
properties = {'P31'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
sites = {'enwiki'} sites = {'enwiki'}
entity_dict = dict()
with bz2.open(WIKIDATA_JSON, mode='rb') as file: with bz2.open(WIKIDATA_JSON, mode='rb') as file:
line = file.readline() line = file.readline()
cnt = 1 cnt = 1
while line and cnt < 100000: while line and (not limit or cnt < limit):
clean_line = line.strip() clean_line = line.strip()
if clean_line.endswith(b","): if clean_line.endswith(b","):
clean_line = clean_line[:-1] clean_line = clean_line[:-1]
if len(clean_line) > 1: if len(clean_line) > 1:
obj = json.loads(clean_line) obj = json.loads(clean_line)
keep = False
unique_id = obj["id"] # filtering records on their properties
print("ID:", unique_id)
entry_type = obj["type"]
print("type:", entry_type)
# TODO: filter on rank: preferred, normal or deprecated # TODO: filter on rank: preferred, normal or deprecated
claims = obj["claims"] claims = obj["claims"]
for prop in properties: for prop, value_set in prop_filter.items():
claim_property = claims.get(prop, None) claim_property = claims.get(prop, None)
if claim_property: if claim_property:
for cp in claim_property: for cp in claim_property:
print(prop, cp['mainsnak']['datavalue']['value']['id']) cp_id = cp['mainsnak']['datavalue']['value']['id']
if cp_id in value_set:
keep = True
if keep:
unique_id = obj["id"]
entry_type = obj["type"]
if to_print:
print("ID:", unique_id)
print("type:", entry_type)
# parsing all properties that refer to other entities
for prop, claim_property in claims.items():
cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
if cp_values:
if to_print:
print("prop:", prop, cp_values)
entry_sites = obj["sitelinks"] entry_sites = obj["sitelinks"]
for site in sites: for site in sites:
site_value = entry_sites.get(site, None) site_value = entry_sites.get(site, None)
if site_value:
if to_print:
print(site, ":", site_value['title']) print(site, ":", site_value['title'])
if site == "enwiki":
entity_dict[unique_id] = site_value['title']
labels = obj["labels"] labels = obj["labels"]
if labels: if labels:
for lang in languages: for lang in languages:
lang_label = labels.get(lang, None) lang_label = labels.get(lang, None)
if lang_label: if lang_label:
if to_print:
print("label (" + lang + "):", lang_label["value"]) print("label (" + lang + "):", lang_label["value"])
descriptions = obj["descriptions"] descriptions = obj["descriptions"]
@ -159,6 +183,7 @@ def _read_wikidata():
for lang in languages: for lang in languages:
lang_descr = descriptions.get(lang, None) lang_descr = descriptions.get(lang, None)
if lang_descr: if lang_descr:
if to_print:
print("description (" + lang + "):", lang_descr["value"]) print("description (" + lang + "):", lang_descr["value"])
aliases = obj["aliases"] aliases = obj["aliases"]
@ -167,12 +192,16 @@ def _read_wikidata():
lang_aliases = aliases.get(lang, None) lang_aliases = aliases.get(lang, None)
if lang_aliases: if lang_aliases:
for item in lang_aliases: for item in lang_aliases:
if to_print:
print("alias (" + lang + "):", item["value"]) print("alias (" + lang + "):", item["value"])
if to_print:
print() print()
line = file.readline() line = file.readline()
cnt += 1 cnt += 1
return entity_dict
def _read_wikipedia_prior_probs(): def _read_wikipedia_prior_probs():
""" Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """
@ -206,7 +235,7 @@ def _read_wikipedia_prior_probs():
# this is a simple link, with the alias the same as the mention # this is a simple link, with the alias the same as the mention
elif "|" not in match: elif "|" not in match:
_store_alias(match, match) _store_alias(match, match, normalize_alias=True, normalize_entity=True)
# in wiki format, the link is written as [[entity|alias]] # in wiki format, the link is written as [[entity|alias]]
else: else:
@ -216,9 +245,9 @@ def _read_wikipedia_prior_probs():
# specific wiki format [[alias (specification)|]] # specific wiki format [[alias (specification)|]]
if len(alias) == 0 and "(" in entity: if len(alias) == 0 and "(" in entity:
alias = entity.split("(")[0] alias = entity.split("(")[0]
_store_alias(alias, entity) _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
else: else:
_store_alias(alias, entity) _store_alias(alias, entity, normalize_alias=False, normalize_entity=True)
line = file.readline() line = file.readline()
cnt += 1 cnt += 1
@ -231,17 +260,20 @@ def _read_wikipedia_prior_probs():
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
def _store_alias(alias, entity): def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
alias = alias.strip() alias = alias.strip()
entity = entity.strip() entity = entity.strip()
# remove everything after # as this is not part of the title but refers to a specific paragraph # remove everything after # as this is not part of the title but refers to a specific paragraph
clean_entity = entity.split("#")[0].capitalize() if normalize_entity:
entity = capitalize_first(entity.split("#")[0])
if normalize_alias:
alias = capitalize_first(alias.split("#")[0])
if len(alias) > 0 and len(clean_entity) > 0: if alias and entity:
alias_dict = map_alias_to_link.get(alias, dict()) alias_dict = map_alias_to_link.get(alias, dict())
entity_count = alias_dict.get(clean_entity, 0) entity_count = alias_dict.get(entity, 0)
alias_dict[clean_entity] = entity_count + 1 alias_dict[entity] = entity_count + 1
map_alias_to_link[alias] = alias_dict map_alias_to_link[alias] = alias_dict
@ -360,14 +392,22 @@ def add_el(kb, nlp):
print("ent", ent.text, ent.label_, ent.kb_id_) print("ent", ent.text, ent.label_, ent.kb_id_)
def capitalize_first(text):
    """Return ``text`` with its first character capitalized.

    Unlike ``str.capitalize``, the remainder of the string is left exactly
    as it was (``str.capitalize`` would lowercase it), so mixed-case text
    such as "san Francisco" becomes "San Francisco".

    Returns ``None`` when ``text`` is empty or ``None``; callers are expected
    to treat that result as falsy.
    """
    if not text:
        return None
    # Only the first character is transformed; the tail is appended verbatim.
    return text[0].capitalize() + text[1:]
if __name__ == "__main__": if __name__ == "__main__":
# STEP 1 : create prior probabilities from WP # STEP 1 : create prior probabilities from WP
# run only once ! # run only once !
# _read_wikipedia_prior_probs() _read_wikipedia_prior_probs()
# STEP 2 : create KB # STEP 2 : create KB
nlp = spacy.load('en_core_web_sm') # nlp = spacy.load('en_core_web_sm')
my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5)
# add_el(my_kb, nlp) # add_el(my_kb, nlp)
# clean_text = "[[File:smomething]] jhk" # clean_text = "[[File:smomething]] jhk"