Bulk entity writing, and an experiment with a regex-based Wikidata reader to speed up processing

svlandeg 2019-05-01 00:00:38 +02:00
parent 653b7d9c87
commit 60b54ae8ce
3 changed files with 135 additions and 15 deletions


@@ -49,7 +49,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     print()
     print("1. _read_wikidata_entities", datetime.datetime.now())
     print()
-    title_to_id = _read_wikidata_entities(limit=100000)
+    # title_to_id = _read_wikidata_entities_regex(limit=1000)
+    title_to_id = _read_wikidata_entities_json(limit=1000)

     title_list = list(title_to_id.keys())
     entity_list = [title_to_id[x] for x in title_list]
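
The reader above builds a dictionary mapping Wikipedia page titles to Wikidata QIDs, from which the parallel title and entity lists are derived. A minimal sketch of the expected shapes (the two entries are illustrative examples, not output of this code):

    # Hypothetical sample of what _read_wikidata_entities_json(limit=...) returns.
    title_to_id = {
        "Douglas Adams": "Q42",   # Wikipedia title -> Wikidata QID
        "Berlin": "Q64",
    }
    title_list = list(title_to_id.keys())               # titles, order preserved
    entity_list = [title_to_id[x] for x in title_list]  # QIDs, parallel to title_list
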
@@ -62,19 +63,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     print()
     print("3. _add_entities", datetime.datetime.now())
     print()
-    _add_entities(kb,
-                  entities=entity_list,
-                  probs=entity_frequencies,
-                  to_print=to_print)
+    kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None)
+    # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print)

     print()
     print("4. _add_aliases", datetime.datetime.now())
     print()
-    _add_aliases(kb,
-                 title_to_id=title_to_id,
-                 max_entities_per_alias=max_entities_per_alias,
-                 min_occ=min_occ,
-                 to_print=to_print)
+    _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,)

     # TODO: read wikipedia texts for entity context
     # _read_wikipedia()
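
The hunk above replaces the per-entity helper with a single bulk call. A minimal sketch of the difference, assuming a KnowledgeBase instance kb and the parallel lists built earlier in create_kb (vectors and features are not wired up yet in this commit, hence None):

    # Old path (kept below as _add_entities_depr): one call per entity.
    for entity, prob in zip(entity_list, entity_frequencies):
        kb.add_entity(entity=entity, prob=prob)

    # New path: hand all parallel lists to the KB in one bulk call.
    kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies,
                    vector_list=None, feature_list=None)
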
@@ -83,6 +78,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False):
     print()
     print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
+    print("done with kb", datetime.datetime.now())
+
     return kb
@@ -131,8 +128,7 @@ def _write_entity_counts(to_print=False):
         print("Total count:", total_count)


-def _add_entities(kb, entities, probs, to_print=False):
-    # TODO: this should be a bulk method
+def _add_entities_depr(kb, entities, probs, to_print=False):
     for entity, prob in zip(entities, probs):
         kb.add_entity(entity=entity, prob=prob)
@@ -193,7 +189,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
         print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())


-def _read_wikidata_entities(limit=None, to_print=False):
+def _read_wikidata_entities_json(limit=None, to_print=False):
     """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """
     languages = {'en', 'de'}
@@ -259,6 +255,7 @@
                     if to_print:
                         print(site_filter, ":", site)
                     title_to_id[site] = unique_id
+                    # print(site, "for", unique_id)

             if parse_labels:
                 labels = obj["labels"]
@@ -296,6 +293,56 @@
     return title_to_id


+def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
+    """ Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. """
+
+    regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
+    regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
+    regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
+    regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
+
+    title_to_id = dict()
+
+    with bz2.open(WIKIDATA_JSON, mode='rb') as file:
+        line = file.readline()
+        cnt = 0
+        while line and (not limit or cnt < limit):
+            if cnt % 100000 == 0:
+                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
+            clean_line = line.strip()
+            if clean_line.endswith(b","):
+                clean_line = clean_line[:-1]
+
+            if len(clean_line) > 1:
+                clean_line = line.strip().decode("utf-8")
+                keep = False
+
+                p31_matches = regex_p31.findall(clean_line)
+                if p31_matches:
+                    for p31_match in p31_matches:
+                        id_matches = regex_id.findall(p31_match)
+                        for id_match in id_matches:
+                            id_match = id_match[6:][:-1]
+                            if id_match == "Q5" or id_match == "Q15632617":
+                                keep = True
+
+                if keep:
+                    id_match = regex_id.search(clean_line).group(0)
+                    id_match = id_match[6:][:-1]
+
+                    enwiki_matches = regex_enwiki.findall(clean_line)
+                    if enwiki_matches:
+                        for enwiki_match in enwiki_matches:
+                            title_match = regex_title.search(enwiki_match).group(0)
+                            title = title_match[9:][:-1]
+                            title_to_id[title] = id_match
+                            # print(title, "for", id_match)
+
+            line = file.readline()
+            cnt += 1
+
+    return title_to_id
+
+
 def _read_wikipedia_prior_probs():
     """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
     The full file takes about 2h to parse 1100M lines (update printed every 5M lines)
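
To make the regex shortcut concrete, the following standalone sketch runs the same patterns and slicing over one hypothetical, heavily simplified dump line (the sample line and values are illustrative, not the exact Wikidata layout):

    import re

    line = ('{"id":"Q42","claims":{"P31":[{"mainsnak":{"property":"P31",'
            '"datavalue":{"value":{"id":"Q5"}}}}]},'
            '"sitelinks":{"enwiki":{"site":"enwiki","title":"Douglas Adams"}}}')

    regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
    regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
    regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
    regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)

    # Keep the record if any P31 ("instance of") value is Q5 (human) or Q15632617 (fictional human).
    keep = any(m[6:-1] in ("Q5", "Q15632617")
               for p31 in regex_p31.findall(line)
               for m in regex_id.findall(p31))

    if keep:
        qid = regex_id.search(line).group(0)[6:-1]          # "Q42" (first "id" in the line)
        enwiki = regex_enwiki.findall(line)[0]
        title = regex_title.search(enwiki).group(0)[9:-1]   # "Douglas Adams"
        print({title: qid})                                 # {'Douglas Adams': 'Q42'}
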
@@ -499,50 +546,65 @@ def capitalize_first(text):

 if __name__ == "__main__":
+    print("START", datetime.datetime.now())
+
     to_create_prior_probs = False
     to_create_entity_counts = False
     to_create_kb = True
-    to_read_kb = False
+    to_read_kb = True

     # STEP 1 : create prior probabilities from WP
     # run only once !
     if to_create_prior_probs:
+        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
         _read_wikipedia_prior_probs()
+        print()

     # STEP 2 : deduce entity frequencies from WP
     # run only once !
     if to_create_entity_counts:
+        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
         _write_entity_counts()
+        print()

     if to_create_kb:
         # STEP 3 : create KB
+        print("STEP 3: to_create_kb", datetime.datetime.now())
         my_nlp = spacy.load('en_core_web_sm')
         my_vocab = my_nlp.vocab
         my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
         print("kb entities:", my_kb.get_size_entities())
         print("kb aliases:", my_kb.get_size_aliases())
+        print()

         # STEP 4 : write KB to file
+        print("STEP 4: write KB", datetime.datetime.now())
         my_kb.dump(KB_FILE)
         my_vocab.to_disk(VOCAB_DIR)
+        print()

     if to_read_kb:
         # STEP 5 : read KB back in from file
+        print("STEP 5: to_read_kb", datetime.datetime.now())
         my_vocab = Vocab()
         my_vocab.from_disk(VOCAB_DIR)
         my_kb = KnowledgeBase(vocab=my_vocab)
         my_kb.load_bulk(KB_FILE)
         print("kb entities:", my_kb.get_size_entities())
         print("kb aliases:", my_kb.get_size_aliases())
+        print()

         # test KB
         candidates = my_kb.get_candidates("Bush")
         for c in candidates:
-            print()
             print("entity:", c.entity_)
             print("entity freq:", c.entity_freq)
             print("alias:", c.alias_)
             print("prior prob:", c.prior_prob)
+            print()

     # STEP 6: add KB to NLP pipeline
+    # print("STEP 6: use KB", datetime.datetime.now())
     # add_el(my_kb, nlp)
+
+    print("STOP", datetime.datetime.now())


@@ -131,6 +131,8 @@ cdef class KnowledgeBase:
         self._aliases_table.push_back(alias)

     cpdef load_bulk(self, loc)
+    cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list)
+    cpdef set_aliases(self, alias_list, entities_list, probabilities_list)


 cdef class Writer:
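
A minimal usage sketch of the two new bulk setters, assuming the list shapes implied by the kb.pyx implementation below and the import paths used by the example script: one probability per entity, and per-alias inner lists of candidate entities with their prior probabilities (all values are made up; the commit itself still marks set_aliases as untested):

    from spacy.vocab import Vocab
    from spacy.kb import KnowledgeBase

    kb = KnowledgeBase(vocab=Vocab())

    # Entities: parallel lists; vectors and features are not used yet in this commit.
    kb.set_entities(entity_list=["Q42", "Q64"],
                    prob_list=[0.7, 0.3],
                    vector_list=None, feature_list=None)

    # Aliases: one entry per alias, each with its candidate entities and prior probabilities.
    kb.set_aliases(alias_list=["Adams", "Berlin"],
                   entities_list=[["Q42"], ["Q64"]],
                   probabilities_list=[[1.0], [1.0]])
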


@@ -111,6 +111,62 @@ cdef class KnowledgeBase:
         return entity_hash

+    cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list):
+        nr_entities = len(entity_list)
+        self._entry_index = PreshMap(nr_entities+1)
+        self._entries = entry_vec(nr_entities+1)
+
+        i = 0
+        cdef EntryC entry
+        cdef int32_t dummy_value = 342
+        while i < nr_entities:
+            # TODO features and vectors
+            entity_hash = self.vocab.strings.add(entity_list[i])
+            entry.entity_hash = entity_hash
+            entry.prob = prob_list[i]
+            entry.vector_rows = &dummy_value
+            entry.feats_row = dummy_value
+
+            self._entries[i+1] = entry
+            self._entry_index[entity_hash] = i+1
+
+            i += 1
+
+    # TODO: this method is untested
+    cpdef set_aliases(self, alias_list, entities_list, probabilities_list):
+        nr_aliases = len(alias_list)
+        self._alias_index = PreshMap(nr_aliases+1)
+        self._aliases_table = alias_vec(nr_aliases+1)
+
+        i = 0
+        cdef AliasC alias
+        cdef int32_t dummy_value = 342
+        while i <= nr_aliases:
+            alias_hash = self.vocab.strings.add(alias_list[i])
+            entities = entities_list[i]
+            probabilities = probabilities_list[i]
+
+            nr_candidates = len(entities)
+            entry_indices = vector[int64_t](nr_candidates)
+            probs = vector[float](nr_candidates)
+
+            for j in range(0, nr_candidates):
+                entity = entities[j]
+                entity_hash = self.vocab.strings[entity]
+                if not entity_hash in self._entry_index:
+                    raise ValueError(Errors.E134.format(alias=alias, entity=entity))
+
+                entry_index = <int64_t>self._entry_index.get(entity_hash)
+                entry_indices[j] = entry_index
+
+            alias.entry_indices = entry_indices
+            alias.probs = probs
+
+            self._aliases_table[i] = alias
+            self._alias_index[alias_hash] = i
+
+            i += 1
+
     def add_alias(self, unicode alias, entities, probabilities):
         """
         For a given alias, add its potential entities and prior probabilies to the KB.
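
One implementation detail worth noting: set_entities writes entry i into slot i+1 and stores i+1 in the hash index, leaving slot 0 unused; a plausible reason (not stated in this commit) is that a PreshMap lookup returns 0 for a missing key, so index 0 can double as "not found". A dict-based sketch of that scheme, illustrative only:

    entity_list = ["Q42", "Q64"]
    entries = [None] * (len(entity_list) + 1)   # slot 0 is reserved, never written
    entry_index = {}                            # entity hash -> position in `entries`

    for i, entity in enumerate(entity_list):
        entries[i + 1] = {"entity": entity}
        entry_index[hash(entity)] = i + 1

    # A miss can then be reported as 0, which is never a valid slot.
    assert entry_index.get(hash("Q1"), 0) == 0
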