mirror of https://github.com/explosion/spaCy.git, synced 2025-01-26 09:14:32 +03:00
deduce entity freq from WP corpus and serialize vocab in WP test
This commit is contained in:
parent 387263d618
commit 19e8f339cb
First changed file (the WikiData/Wikipedia entity linking example script):

@@ -1,7 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
+from spacy.vocab import Vocab
+
+"""
+Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
 """
 import re
 import json
@@ -17,6 +20,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar
 PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
 
 KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
+VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
 
 
 # these will/should be matched ignoring case
@@ -40,12 +44,16 @@ map_alias_to_link = dict()
 def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     kb = KnowledgeBase(vocab=vocab)
 
-    id_to_title = _read_wikidata(limit=1000)
-    title_to_id = {v:k for k,v in id_to_title.items()}
+    id_to_title = _read_wikidata_entities(limit=None)
+    title_to_id = {v: k for k, v in id_to_title.items()}
 
+    entity_list = list(id_to_title.keys())
+    title_list = [id_to_title[x] for x in entity_list]
+    entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False)
+
     _add_entities(kb,
-                  entities=id_to_title.keys(),
-                  probs=[0.4 for x in id_to_title.keys()],
+                  entities=entity_list,
+                  probs=entity_frequencies,
                   to_print=to_print)
 
     _add_aliases(kb,
@@ -64,6 +72,38 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     return kb
 
 
+def _get_entity_frequencies(entities, to_print=False):
+    count_entities = [0 for _ in entities]
+    total_count = 0
+
+    with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
+        # skip header
+        prior_file.readline()
+        line = prior_file.readline()
+        # we can read this file sequentially, it's sorted by alias, and then by count
+
+        while line:
+            splits = line.replace('\n', "").split(sep='|')
+            # alias = splits[0]
+            count = int(splits[1])
+            entity = splits[2]
+
+            if entity in entities:
+                index = entities.index(entity)
+                count_entities[index] = count_entities[index] + count
+
+            total_count += count
+
+            line = prior_file.readline()
+
+    if to_print:
+        for entity, count in zip(entities, count_entities):
+            print("Entity count:", entity, count)
+        print("Total count:", total_count)
+
+    return [x*100 / total_count for x in count_entities]
+
+
 def _add_entities(kb, entities, probs, to_print=False):
     for entity, prob in zip(entities, probs):
         kb.add_entity(entity=entity, prob=prob)
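The new _get_entity_frequencies helper reads the pipe-separated rows of prior_prob.csv (alias|count|entity), sums the counts per Wikipedia title, and normalises them against the total count of the file. A minimal sketch of that normalisation on hypothetical toy counts (the names and numbers below are made up; only the formula comes from the diff):

    # Sketch, not part of the diff: how the frequency normalisation behaves on toy data.
    entities = ["Douglas Adams", "Arthur Dent"]   # hypothetical entity titles
    count_entities = [30, 10]                     # counts summed over all alias rows naming each entity
    total_count = 50                              # total over the whole file, including other entities
    frequencies = [x * 100 / total_count for x in count_entities]
    print(frequencies)                            # [60.0, 20.0] -- these become the probs passed to kb.add_entity()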
@@ -76,7 +116,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals
     wp_titles = title_to_id.keys()
 
     if to_print:
-        print("wp titles", wp_titles)
+        print("wp titles:", wp_titles)
 
     # adding aliases with prior probabilities
     with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
@@ -125,89 +165,100 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
 
 
-def _read_wikidata(limit=None, to_print=False):
-    """ Read the JSON wiki data """
+def _read_wikidata_entities(limit=None, to_print=False):
+    """ Read the JSON wiki data and parse out the entities"""
 
     languages = {'en', 'de'}
     prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
-    sites = {'enwiki'}
+    site_filter = 'enwiki'
 
     entity_dict = dict()
 
+    # parse appropriate fields - depending on what we need in the KB
+    parse_properties = False
+    parse_sitelinks = True
+    parse_labels = False
+    parse_descriptions = False
+    parse_aliases = False
+
     with bz2.open(WIKIDATA_JSON, mode='rb') as file:
         line = file.readline()
-        cnt = 1
+        cnt = 0
         while line and (not limit or cnt < limit):
+            if cnt % 100000 == 0:
+                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
             clean_line = line.strip()
             if clean_line.endswith(b","):
                 clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                 obj = json.loads(clean_line)
-                keep = False
+                unique_id = obj["id"]
+                entry_type = obj["type"]
 
-                # filtering records on their properties
-                # TODO: filter on rank: preferred, normal or deprecated
-                claims = obj["claims"]
-                for prop, value_set in prop_filter.items():
-                    claim_property = claims.get(prop, None)
-                    if claim_property:
-                        for cp in claim_property:
-                            cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
-                            if cp_id in value_set:
-                                keep = True
+                if unique_id[0] == 'Q' and entry_type == "item":
+                    # filtering records on their properties
+                    keep = False
+                    claims = obj["claims"]
+                    for prop, value_set in prop_filter.items():
+                        claim_property = claims.get(prop, None)
+                        if claim_property:
+                            for cp in claim_property:
+                                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
+                                cp_rank = cp['rank']
+                                if cp_rank != "deprecated" and cp_id in value_set:
+                                    keep = True
 
-                if keep:
-                    unique_id = obj["id"]
-                    entry_type = obj["type"]
+                    if keep:
+                        if to_print:
+                            print("ID:", unique_id)
+                            print("type:", entry_type)
 
-                    if to_print:
-                        print("ID:", unique_id)
-                        print("type:", entry_type)
+                        # parsing all properties that refer to other entities
+                        if parse_properties:
+                            for prop, claim_property in claims.items():
+                                cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
+                                cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
+                                if cp_values:
+                                    if to_print:
+                                        print("prop:", prop, cp_values)
 
-                    # parsing all properties that refer to other entities
-                    for prop, claim_property in claims.items():
-                        cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
-                        cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
-                        if cp_values:
-                            if to_print:
-                                print("prop:", prop, cp_values)
-
-                    entry_sites = obj["sitelinks"]
-                    for site in sites:
-                        site_value = entry_sites.get(site, None)
-                        if site_value:
-                            if to_print:
-                                print(site, ":", site_value['title'])
-                            if site == "enwiki":
+                        if parse_sitelinks:
+                            site_value = obj["sitelinks"].get(site_filter, None)
+                            if site_value:
+                                if to_print:
+                                    print(site_filter, ":", site_value['title'])
                                 entity_dict[unique_id] = site_value['title']
 
-                    labels = obj["labels"]
-                    if labels:
-                        for lang in languages:
-                            lang_label = labels.get(lang, None)
-                            if lang_label:
-                                if to_print:
-                                    print("label (" + lang + "):", lang_label["value"])
+                        if parse_labels:
+                            labels = obj["labels"]
+                            if labels:
+                                for lang in languages:
+                                    lang_label = labels.get(lang, None)
+                                    if lang_label:
+                                        if to_print:
+                                            print("label (" + lang + "):", lang_label["value"])
 
-                    descriptions = obj["descriptions"]
-                    if descriptions:
-                        for lang in languages:
-                            lang_descr = descriptions.get(lang, None)
-                            if lang_descr:
-                                if to_print:
-                                    print("description (" + lang + "):", lang_descr["value"])
+                        if parse_descriptions:
+                            descriptions = obj["descriptions"]
+                            if descriptions:
+                                for lang in languages:
+                                    lang_descr = descriptions.get(lang, None)
+                                    if lang_descr:
+                                        if to_print:
+                                            print("description (" + lang + "):", lang_descr["value"])
 
-                    aliases = obj["aliases"]
-                    if aliases:
-                        for lang in languages:
-                            lang_aliases = aliases.get(lang, None)
-                            if lang_aliases:
-                                for item in lang_aliases:
-                                    if to_print:
-                                        print("alias (" + lang + "):", item["value"])
+                        if parse_aliases:
+                            aliases = obj["aliases"]
+                            if aliases:
+                                for lang in languages:
+                                    lang_aliases = aliases.get(lang, None)
+                                    if lang_aliases:
+                                        for item in lang_aliases:
+                                            if to_print:
+                                                print("alias (" + lang + "):", item["value"])
 
-                    if to_print:
-                        print()
+                        if to_print:
+                            print()
             line = file.readline()
             cnt += 1
 
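The restructured _read_wikidata_entities keeps only records whose id starts with 'Q' and whose type is "item", requires a non-deprecated P31 claim matching the filter (Q5 or Q15632617), and, with parse_sitelinks enabled, stores the enwiki title per id. A sketch of the same filter applied to one hypothetical, already-parsed record (the record is made up; the filtering logic mirrors the diff):

    # Sketch, not part of the diff: the P31/rank/sitelink filter on a single toy record.
    obj = {
        "id": "Q42",
        "type": "item",
        "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}}, "rank": "normal"}]},
        "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
    }
    prop_filter = {'P31': {'Q5', 'Q15632617'}}

    keep = False
    if obj["id"][0] == 'Q' and obj["type"] == "item":
        for prop, value_set in prop_filter.items():
            for cp in obj["claims"].get(prop, []):
                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
                if cp['rank'] != "deprecated" and cp_id in value_set:
                    keep = True

    if keep:
        site_value = obj["sitelinks"].get('enwiki', None)
        if site_value:
            print(obj["id"], "->", site_value['title'])   # Q42 -> Douglas Adams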
@@ -236,7 +287,7 @@ def _read_wikipedia_prior_probs():
         cnt = 0
         while line:
             if cnt % 5000000 == 0:
-                print(datetime.datetime.now(), "processed", cnt, "lines")
+                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
             clean_line = line.strip().decode("utf-8")
 
             matches = link_regex.findall(clean_line)
@@ -394,7 +445,8 @@ def add_el(kb, nlp):
 
     text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
            "Douglas reminds us to always bring our towel. " \
-           "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington."
+           "The main character in Doug's novel is the man Arthur Dent, " \
+           "but Douglas doesn't write about George Washington or Homer Simpson."
     doc = nlp(text)
 
     print()
@@ -414,48 +466,46 @@ def capitalize_first(text):
     result += text[1:]
     return result
 
 
 if __name__ == "__main__":
+    to_create_prior_probs = False
+    to_create_kb = True
+    to_read_kb = False
+
     # STEP 1 : create prior probabilities from WP
     # run only once !
-    # _read_wikipedia_prior_probs()
+    if to_create_prior_probs:
+        _read_wikipedia_prior_probs()
 
-    # STEP 2 : create KB
-    # nlp = spacy.load('en_core_web_sm')
-    # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True)
+    if to_create_kb:
+        # STEP 2 : create KB
+        my_nlp = spacy.load('en_core_web_sm')
+        my_vocab = my_nlp.vocab
+        my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
+        print("kb entities:", my_kb.get_size_entities())
+        print("kb aliases:", my_kb.get_size_aliases())
 
-    # STEP 3 : write KB to file
-    nlp1 = spacy.load('en_core_web_sm')
-    my_vocab = nlp1.vocab
-    kb1 = KnowledgeBase(vocab=my_vocab)
-    kb1.add_entity(entity="Q53", prob=0.33)
-    kb1.add_entity(entity="Q17", prob=0.1)
-    kb1.add_entity(entity="Q007", prob=0.7)
-    kb1.add_entity(entity="Q44", prob=0.4)
-    kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1])
-    kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
-    kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
+        # STEP 3 : write KB to file
+        my_kb.dump(KB_FILE)
+        my_vocab.to_disk(VOCAB_DIR)
+
+    if to_read_kb:
+        # STEP 4 : read KB back in from file
+        my_vocab = Vocab()
+        my_vocab.from_disk(VOCAB_DIR)
+        my_kb = KnowledgeBase(vocab=my_vocab)
+        my_kb.load_bulk(KB_FILE)
+        print("kb entities:", my_kb.get_size_entities())
+        print("kb aliases:", my_kb.get_size_aliases())
 
-    print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases())
-    print("kb1 entities:", kb1.get_entity_strings())
-    print("kb1 aliases:", kb1.get_alias_strings())
+        # test KB
+        candidates = my_kb.get_candidates("Bush")
+        for c in candidates:
+            print()
+            print("entity:", c.entity_)
+            print("entity freq:", c.entity_freq)
+            print("alias:", c.alias_)
+            print("prior prob:", c.prior_prob)
 
-    print()
-    print("dumping kb1")
-    print(KB_FILE, type(KB_FILE))
-    kb1.dump(KB_FILE)
-
-    # STEP 4 : read KB back in from file
-
-    kb3 = KnowledgeBase(vocab=my_vocab)
-
-    print("loading kb3")
-    kb3.load_bulk(KB_FILE)
-
-    print()
-    print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases())
-    print("kb3 entities:", kb3.get_entity_strings())
-    print("kb3 aliases:", kb3.get_alias_strings())
-
-    # STEP 5 : actually use the EL functionality
+    # STEP 5: add KB to NLP pipeline
     # add_el(my_kb, nlp)
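Taken together, the new __main__ block splits the script into switchable steps: build the KB from a loaded pipeline's vocab, dump both the KB and the vocab, then later reload them into a fresh Vocab before querying candidates. A condensed sketch of that round trip using the same calls the diff exercises (kb.dump, Vocab.to_disk/from_disk, load_bulk, get_candidates); the 'kb' and 'vocab' paths below are placeholders standing in for KB_FILE and VOCAB_DIR:

    # Sketch, not part of the diff: the dump/load round trip in one place.
    import spacy
    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    nlp = spacy.load('en_core_web_sm')
    kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=False)  # create_kb is defined in this script

    kb.dump('kb')                  # entities, aliases and prior probabilities
    nlp.vocab.to_disk('vocab')     # the vocab the KB's strings depend on

    vocab2 = Vocab()
    vocab2.from_disk('vocab')      # reload into a fresh Vocab
    kb2 = KnowledgeBase(vocab=vocab2)
    kb2.load_bulk('kb')

    for c in kb2.get_candidates("Bush"):
        print(c.entity_, c.entity_freq, c.alias_, c.prior_prob)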
Second changed file (the knowledge-base serialization test):

@@ -1,3 +1,5 @@
+import spacy
+from spacy.lang.en import English
 from ..util import make_tempdir
 from ...util import ensure_path
 
|
||||||
|
|
||||||
|
|
||||||
def test_serialize_kb_disk(en_vocab):
|
def test_serialize_kb_disk(en_vocab):
|
||||||
kb1 = KnowledgeBase(vocab=en_vocab)
|
|
||||||
|
|
||||||
kb1.add_entity(entity="Q53", prob=0.33)
|
|
||||||
kb1.add_entity(entity="Q17", prob=0.2)
|
|
||||||
kb1.add_entity(entity="Q007", prob=0.7)
|
|
||||||
kb1.add_entity(entity="Q44", prob=0.4)
|
|
||||||
kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
|
|
||||||
kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
|
|
||||||
kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
|
|
||||||
|
|
||||||
# baseline assertions
|
# baseline assertions
|
||||||
|
kb1 = _get_dummy_kb(en_vocab)
|
||||||
_check_kb(kb1)
|
_check_kb(kb1)
|
||||||
|
|
||||||
# dumping to file & loading back in
|
# dumping to file & loading back in
|
||||||
|
@@ -34,6 +27,20 @@ def test_serialize_kb_disk(en_vocab):
     _check_kb(kb2)
 
 
+def _get_dummy_kb(vocab):
+    kb = KnowledgeBase(vocab=vocab)
+
+    kb.add_entity(entity="Q53", prob=0.33)
+    kb.add_entity(entity="Q17", prob=0.2)
+    kb.add_entity(entity="Q007", prob=0.7)
+    kb.add_entity(entity="Q44", prob=0.4)
+    kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9])
+    kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1])
+    kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0])
+
+    return kb
+
+
 def _check_kb(kb):
     # check entities
     assert kb.get_size_entities() == 4
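The test refactor pulls the KB construction into _get_dummy_kb so that the baseline KB and the reloaded KB can be checked with the same _check_kb assertions. The dump/load middle of test_serialize_kb_disk sits outside the shown hunks, so the tempdir round trip in this sketch is an assumption about how the pieces fit together, not part of the diff:

    # Sketch, not part of the diff: assumed shape of the full serialization test.
    def test_serialize_kb_disk(en_vocab):
        # baseline assertions on a freshly built dummy KB
        kb1 = _get_dummy_kb(en_vocab)
        _check_kb(kb1)

        # assumed middle part: dump to a temp dir and load back in
        with make_tempdir() as d:
            file_path = ensure_path(d) / "kb"
            kb1.dump(str(file_path))

            kb2 = KnowledgeBase(vocab=en_vocab)
            kb2.load_bulk(str(file_path))

        # the same assertions must hold for the reloaded KB
        _check_kb(kb2)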