Mirror of https://github.com/explosion/spaCy.git, synced 2024-11-10 19:57:17 +03:00
refactor code to separate functionality into different files
This commit is contained in:
parent f5190267e7
commit 6961215578
@@ -66,6 +66,6 @@ def add_el(kb, nlp):

 if __name__ == "__main__":
-    nlp = spacy.load('en_core_web_sm')
-    my_kb = create_kb(nlp.vocab)
-    add_el(my_kb, nlp)
+    my_nlp = spacy.load('en_core_web_sm')
+    my_kb = create_kb(my_nlp.vocab)
+    add_el(my_kb, my_nlp)
0 examples/pipeline/wiki_entity_linking/__init__.py Normal file
137 examples/pipeline/wiki_entity_linking/kb_creator.py Normal file
@@ -0,0 +1,137 @@
# coding: utf-8
from __future__ import unicode_literals

import spacy
from spacy.kb import KnowledgeBase

import datetime

from . import wikipedia_processor as wp
from . import wikidata_processor as wd


def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input,
              to_print=False, write_entity_defs=True):
    """ Create the knowledge base from Wikidata entries """
    kb = KnowledgeBase(vocab=vocab)

    print()
    print("1. _read_wikidata_entities", datetime.datetime.now())
    print()
    # title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
    title_to_id = wd.read_wikidata_entities_json(limit=None)

    # write the title-ID mapping to file
    if write_entity_defs:
        with open(entity_output, mode='w', encoding='utf8') as entity_file:
            entity_file.write("WP_title" + "|" + "WD_id" + "\n")
            for title, qid in title_to_id.items():
                entity_file.write(title + "|" + str(qid) + "\n")

    title_list = list(title_to_id.keys())
    entity_list = [title_to_id[x] for x in title_list]

    print()
    print("2. _get_entity_frequencies", datetime.datetime.now())
    print()
    entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list)

    print()
    print("3. adding", len(entity_list), "entities", datetime.datetime.now())
    print()
    kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None)

    print()
    print("4. adding aliases", datetime.datetime.now())
    print()
    _add_aliases(kb, title_to_id=title_to_id,
                 max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,
                 prior_prob_input=prior_prob_input)

    if to_print:
        print()
        print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())

    print("done with kb", datetime.datetime.now())

    return kb


def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False):
    wp_titles = title_to_id.keys()

    if to_print:
        print("wp titles:", wp_titles)

    # adding aliases with prior probabilities
    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()
        # we can read this file sequentially, it's sorted by alias, and then by count
        previous_alias = None
        total_count = 0
        counts = list()
        entities = list()
        while line:
            splits = line.replace('\n', "").split(sep='|')
            new_alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            if new_alias != previous_alias and previous_alias:
                # done reading the previous alias --> output
                if len(entities) > 0:
                    selected_entities = list()
                    prior_probs = list()
                    for ent_count, ent_string in zip(counts, entities):
                        if ent_string in wp_titles:
                            wd_id = title_to_id[ent_string]
                            p_entity_givenalias = ent_count / total_count
                            selected_entities.append(wd_id)
                            prior_probs.append(p_entity_givenalias)

                    if selected_entities:
                        try:
                            kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
                        except ValueError as e:
                            print(e)
                total_count = 0
                counts = list()
                entities = list()

            total_count += count

            if len(entities) < max_entities_per_alias and count >= min_occ:
                counts.append(count)
                entities.append(entity)
            previous_alias = new_alias

            line = prior_file.readline()

    if to_print:
        print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())


def test_kb(kb):
    # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
    nlp = spacy.load('en_core_web_sm')

    el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)

    candidates = kb.get_candidates("Bush")

    print("generating candidates for 'Bush' :")
    for c in candidates:
        print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
    print()

    text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
           "Douglas reminds us to always bring our towel. " \
           "The main character in Doug's novel is the man Arthur Dent, " \
           "but Douglas doesn't write about George Washington or Homer Simpson."
    doc = nlp(text)

    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)
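The _add_aliases loop above assumes a pipe-delimited prior-probability file ("alias|count|entity"), sorted by alias and then by descending count. A minimal sketch, with made-up counts, of the prior probabilities it derives per alias:

# Illustrative only: toy rows in the "alias|count|entity" format consumed by _add_aliases.
rows = [("Bush", 920, "George W. Bush"), ("Bush", 80, "Bush (band)")]
total = sum(count for _, count, _ in rows)        # 1000
priors = [count / total for _, count, _ in rows]  # [0.92, 0.08]
# These ratios are what kb.add_alias() receives as `probabilities`.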
36 examples/pipeline/wiki_entity_linking/run_el.py Normal file
@@ -0,0 +1,36 @@
# coding: utf-8
from __future__ import unicode_literals

import spacy

# requires: pip install neuralcoref --no-binary neuralcoref
# import neuralcoref


# TODO
def add_coref():
    """ Add coreference resolution to our model """
    nlp = spacy.load('en_core_web_sm')
    # nlp = spacy.load('en')

    # TODO: this doesn't work yet
    # neuralcoref.add_to_pipe(nlp)
    print("done adding to pipe")

    doc = nlp(u'My sister has a dog. She loves him.')
    print("done doc")

    print(doc._.has_coref)
    print(doc._.coref_clusters)


# TODO
def _run_ner_depr(nlp, clean_text, article_dict):
    doc = nlp(clean_text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":  # TODO: expand to non-persons
            ent_id = article_dict.get(ent.text)
            if ent_id:
                print(" -", ent.text, ent.label_, ent_id)
            else:
                print(" -", ent.text, ent.label_, '???')  # TODO: investigate these cases
276 examples/pipeline/wiki_entity_linking/training_set_creator.py Normal file
@@ -0,0 +1,276 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import csv
import bz2
import datetime

from . import wikipedia_processor as wp

"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
"""


def create_training(kb, entity_input, training_output):
    if not kb:
        raise ValueError("kb should be defined")
    # nlp = spacy.load('en_core_web_sm')
    wp_to_id = _get_entity_to_id(entity_input)
    _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000)  # TODO: full dataset


def _get_entity_to_id(entity_input):
    entity_to_id = dict()
    with open(entity_input, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_id[row[0]] = row[1]

    return entity_to_id


def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive and negative instances
    """

    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')

    read_ids = set()

    entityfile_loc = training_output + "/" + "gold_entities.csv"
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,
                               article_id="article_id",
                               alias="alias",
                               entity="entity",
                               correct="correct")

        with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file:
            line = file.readline()
            cnt = 0
            article_text = ""
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False
            while line and (not limit or cnt < limit):
                if cnt % 1000000 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")
                # print(clean_line)

                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                # Start reading new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None

                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        try:
                            _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output)
                        # on a previous run, an error occurred after 46M lines and 2h
                        except Exception as e:
                            print("Error processing article", article_id, article_title, e)
                    else:
                        print("Done processing a page, but couldn't find an article_id ?")
                        print(article_title)
                        print(article_text)
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True

                if reading_text:
                    article_text += " " + clean_line

                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # read the ID of this article (outside the revision portion of the document)
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]
                        if article_id in read_ids:
                            print("Found duplicate article ID", article_id, clean_line)  # This should never happen ...
                        read_ids.add(article_id)

                # read the title of this article (outside the revision portion of the document)
                if not reading_revision:
                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()

                line = file.readline()
                cnt += 1


text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')


def _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text, training_output):
    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # print("WP article", article_id, ":", article_title)
    # print()
    # print(text)

    # get the raw text without markup etc
    clean_text = _get_clean_wp_text(text)
    # print()
    # print(clean_text)

    article_dict = dict()
    ambiguous_aliases = set()
    aliases, entities, normalizations = wp.get_wp_links(text)
    for alias, entity, norm in zip(aliases, entities, normalizations):
        if alias not in ambiguous_aliases:
            entity_id = wp_to_id.get(entity)
            if entity_id:
                # TODO: take care of these conflicts ! Currently they are being removed from the dataset
                if article_dict.get(alias) and article_dict[alias] != entity_id:
                    ambiguous_aliases.add(alias)
                    article_dict.pop(alias)
                    # print("Found conflicting alias", alias, "in article", article_id, article_title)
                else:
                    article_dict[alias] = entity_id

    # print("found entities:")
    for alias, entity in article_dict.items():
        # print(alias, "-->", entity)
        candidates = kb.get_candidates(alias)

        # as training data, we only store entities that are sufficiently ambiguous
        if len(candidates) > 1:
            _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output)
            # print("alias", alias)

            # print all incorrect candidates
            for c in candidates:
                if entity != c.entity_:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=alias,
                                           entity=c.entity_,
                                           correct="0")

            # print the one correct candidate
            _write_training_entity(outputfile=entityfile,
                                   article_id=article_id,
                                   alias=alias,
                                   entity=entity,
                                   correct="1")

            # print("gold entity", entity)
    # print()

    # _run_ner_depr(nlp, clean_text, article_dict)
    # print()


info_regex = re.compile(r'{[^{]*?}')
interwiki_regex = re.compile(r'\[\[([^|]*?)]]')
interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')
htlm_regex = re.compile(r'&lt;!--[^!]*--&gt;')
category_regex = re.compile(r'\[\[Category:[^\[]*]]')
file_regex = re.compile(r'\[\[File:[^[\]]+]]')
ref_regex = re.compile(r'&lt;ref.*?&gt;')  # non-greedy
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy


def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove simple interwiki links (no alternative name)
    clean_text = interwiki_regex.sub(r'\1', clean_text)

    # remove simple interwiki links by picking the alternative name
    clean_text = interwiki_2_regex.sub(r'\1', clean_text)

    # remove HTML comments
    clean_text = htlm_regex.sub('', clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)

    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')

    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')

    return clean_text.strip()


def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output + "/" + str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")
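For reference, _write_training_entity above emits one pipe-delimited row per candidate of an ambiguous mention. A sketch with hypothetical article IDs and QIDs of what gold_entities.csv would contain:

# Illustrative only: values below are made up.
header = "article_id|alias|entity|correct"
rows = ["12345|Bush|Q1124|0",   # one row per incorrect candidate
        "12345|Bush|Q207|1"]    # exactly one row for the gold entity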
103 examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py Normal file
@@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals

from . import wikipedia_processor as wp, kb_creator, training_set_creator

import spacy
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase
import datetime

"""
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
"""

PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'

KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'

TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'


if __name__ == "__main__":
    print("START", datetime.datetime.now())
    print()
    my_kb = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    create_wp_training = False

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB)
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False)
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_nlp = spacy.load('en_core_web_sm')
        my_vocab = my_nlp.vocab
        my_kb = kb_creator.create_kb(my_vocab,
                                     max_entities_per_alias=10,
                                     min_occ=5,
                                     entity_output=ENTITY_DEFS,
                                     count_input=ENTITY_COUNTS,
                                     prior_prob_input=PRIOR_PROB,
                                     to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        my_vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab)
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            kb_creator.test_kb(my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR)

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
166 examples/pipeline/wiki_entity_linking/wikidata_processor.py Normal file
@@ -0,0 +1,166 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import bz2
import json
import datetime

# TODO: remove hardcoded paths
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'


def read_wikidata_entities_json(limit=None, to_print=False):
    """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """

    languages = {'en', 'de'}
    prop_filter = {'P31': {'Q5', 'Q15632617'}}  # currently defined as OR: one property suffices to be selected
    site_filter = 'enwiki'

    title_to_id = dict()

    # parse appropriate fields - depending on what we need in the KB
    parse_properties = False
    parse_sitelinks = True
    parse_labels = False
    parse_descriptions = False
    parse_aliases = False

    with bz2.open(WIKIDATA_JSON, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line and (not limit or cnt < limit):
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                obj = json.loads(clean_line)
                entry_type = obj["type"]

                if entry_type == "item":
                    # filtering records on their properties
                    keep = False

                    claims = obj["claims"]
                    for prop, value_set in prop_filter.items():
                        claim_property = claims.get(prop, None)
                        if claim_property:
                            for cp in claim_property:
                                cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
                                cp_rank = cp['rank']
                                if cp_rank != "deprecated" and cp_id in value_set:
                                    keep = True

                    if keep:
                        unique_id = obj["id"]

                        if to_print:
                            print("ID:", unique_id)
                            print("type:", entry_type)

                        # parsing all properties that refer to other entities
                        if parse_properties:
                            for prop, claim_property in claims.items():
                                cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
                                cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
                                if cp_values:
                                    if to_print:
                                        print("prop:", prop, cp_values)

                        if parse_sitelinks:
                            site_value = obj["sitelinks"].get(site_filter, None)
                            if site_value:
                                site = site_value['title']
                                if to_print:
                                    print(site_filter, ":", site)
                                title_to_id[site] = unique_id
                                # print(site, "for", unique_id)

                        if parse_labels:
                            labels = obj["labels"]
                            if labels:
                                for lang in languages:
                                    lang_label = labels.get(lang, None)
                                    if lang_label:
                                        if to_print:
                                            print("label (" + lang + "):", lang_label["value"])

                        if parse_descriptions:
                            descriptions = obj["descriptions"]
                            if descriptions:
                                for lang in languages:
                                    lang_descr = descriptions.get(lang, None)
                                    if lang_descr:
                                        if to_print:
                                            print("description (" + lang + "):", lang_descr["value"])

                        if parse_aliases:
                            aliases = obj["aliases"]
                            if aliases:
                                for lang in languages:
                                    lang_aliases = aliases.get(lang, None)
                                    if lang_aliases:
                                        for item in lang_aliases:
                                            if to_print:
                                                print("alias (" + lang + "):", item["value"])

                        if to_print:
                            print()
            line = file.readline()
            cnt += 1

    return title_to_id


def _read_wikidata_entities_regex_depr(limit=None):
    """
    Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines.
    TODO: doesn't work yet. may be deleted ?
    """

    regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
    regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
    regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
    regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)

    title_to_id = dict()

    with bz2.open(WIKIDATA_JSON, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line and (not limit or cnt < limit):
            if cnt % 500000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
            clean_line = line.strip()
            if clean_line.endswith(b","):
                clean_line = clean_line[:-1]
            if len(clean_line) > 1:
                clean_line = line.strip().decode("utf-8")
                keep = False

                p31_matches = regex_p31.findall(clean_line)
                if p31_matches:
                    for p31_match in p31_matches:
                        id_matches = regex_id.findall(p31_match)
                        for id_match in id_matches:
                            id_match = id_match[6:][:-1]
                            if id_match == "Q5" or id_match == "Q15632617":
                                keep = True

                if keep:
                    id_match = regex_id.search(clean_line).group(0)
                    id_match = id_match[6:][:-1]

                    enwiki_matches = regex_enwiki.findall(clean_line)
                    if enwiki_matches:
                        for enwiki_match in enwiki_matches:
                            title_match = regex_title.search(enwiki_match).group(0)
                            title = title_match[9:][:-1]
                            title_to_id[title] = id_match

            line = file.readline()
            cnt += 1

    return title_to_id
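A minimal sketch, using a made-up and heavily truncated record, of the fields read_wikidata_entities_json above inspects when deciding whether to keep an entity (the real code additionally guards against missing keys and deprecated ranks):

# Illustrative only: a fabricated, stripped-down WikiData line.
import json

line = json.dumps({
    "type": "item",
    "id": "Q42",
    "claims": {"P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q5"}}},
                        "rank": "normal"}]},
    "sitelinks": {"enwiki": {"title": "Douglas Adams"}},
})
obj = json.loads(line)
keep = any(cp["mainsnak"]["datavalue"]["value"]["id"] in {"Q5", "Q15632617"}
           for cp in obj["claims"]["P31"])
if keep:
    print(obj["sitelinks"]["enwiki"]["title"], "->", obj["id"])  # Douglas Adams -> Q42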
187 examples/pipeline/wiki_entity_linking/wikipedia_processor.py Normal file
@@ -0,0 +1,187 @@
# coding: utf-8
from __future__ import unicode_literals

import re
import bz2
import csv
import datetime

"""
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
"""

# TODO: remove hardcoded paths
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'

map_alias_to_link = dict()

# these will/should be matched ignoring case
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
                   "d", "dbdump", "download", "Draft", "Education", "Foundation",
                   "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
                   "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
                   "MediaZilla", "Meta", "Metawikipedia", "Module",
                   "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
                   "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
                   "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
                   "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki",
                   "User", "User talk", "v", "voy",
                   "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
                   "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
                   "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]

# find the links
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')

# match on interwiki links, e.g. `en:` or `:fr:`
ns_regex = r":?" + "[a-z][a-z]" + ":"

# match on Namespace: optionally preceded by a :
for ns in wiki_namespaces:
    ns_regex += "|" + ":?" + ns + ":"

ns_regex = re.compile(ns_regex, re.IGNORECASE)


def read_wikipedia_prior_probs(prior_prob_output):
    """
    Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
    The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
    It works relatively fast because we don't care about which article we parsed the interwiki from,
    we just process line by line.
    """

    with bz2.open(ENWIKI_DUMP, mode='rb') as file:
        line = file.readline()
        cnt = 0
        while line:
            if cnt % 5000000 == 0:
                print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
            clean_line = line.strip().decode("utf-8")

            aliases, entities, normalizations = get_wp_links(clean_line)
            for alias, entity, norm in zip(aliases, entities, normalizations):
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
                _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)

            line = file.readline()
            cnt += 1

    # write all aliases and their entities and occurrences to file
    with open(prior_prob_output, mode='w', encoding='utf8') as outputfile:
        outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
        for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
            for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
                outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")


def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
    alias = alias.strip()
    entity = entity.strip()

    # remove everything after # as this is not part of the title but refers to a specific paragraph
    if normalize_entity:
        # wikipedia titles are always capitalized
        entity = _capitalize_first(entity.split("#")[0])
    if normalize_alias:
        alias = alias.split("#")[0]

    if alias and entity:
        alias_dict = map_alias_to_link.get(alias, dict())
        entity_count = alias_dict.get(entity, 0)
        alias_dict[entity] = entity_count + 1
        map_alias_to_link[alias] = alias_dict


def get_wp_links(text):
    aliases = []
    entities = []
    normalizations = []

    matches = link_regex.findall(text)
    for match in matches:
        match = match[2:][:-2].replace("_", " ").strip()

        if ns_regex.match(match):
            pass  # ignore namespaces at the beginning of the string

        # this is a simple link, with the alias the same as the mention
        elif "|" not in match:
            aliases.append(match)
            entities.append(match)
            normalizations.append(True)

        # in wiki format, the link is written as [[entity|alias]]
        else:
            splits = match.split("|")
            entity = splits[0].strip()
            alias = splits[1].strip()
            # specific wiki format [[alias (specification)|]]
            if len(alias) == 0 and "(" in entity:
                alias = entity.split("(")[0]
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)
            else:
                aliases.append(alias)
                entities.append(entity)
                normalizations.append(False)

    return aliases, entities, normalizations


def _capitalize_first(text):
    if not text:
        return None
    result = text[0].capitalize()
    if len(result) > 0:
        result += text[1:]
    return result


def write_entity_counts(prior_prob_input, count_output, to_print=False):
    """ Write entity counts for quick access later """
    entity_to_count = dict()
    total_count = 0

    with open(prior_prob_input, mode='r', encoding='utf8') as prior_file:
        # skip header
        prior_file.readline()
        line = prior_file.readline()

        while line:
            splits = line.replace('\n', "").split(sep='|')
            # alias = splits[0]
            count = int(splits[1])
            entity = splits[2]

            current_count = entity_to_count.get(entity, 0)
            entity_to_count[entity] = current_count + count

            total_count += count

            line = prior_file.readline()

    with open(count_output, mode='w', encoding='utf8') as entity_file:
        entity_file.write("entity" + "|" + "count" + "\n")
        for entity, count in entity_to_count.items():
            entity_file.write(entity + "|" + str(count) + "\n")

    if to_print:
        for entity, count in entity_to_count.items():
            print("Entity count:", entity, count)
        print("Total count:", total_count)


def get_entity_frequencies(count_input, entities):
    entity_to_count = dict()
    with open(count_input, 'r', encoding='utf8') as csvfile:
        csvreader = csv.reader(csvfile, delimiter='|')
        # skip header
        next(csvreader)
        for row in csvreader:
            entity_to_count[row[0]] = int(row[1])

    return [entity_to_count.get(e, 0) for e in entities]
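A rough usage sketch of get_wp_links above on an invented line of wiki markup, with the expected outputs shown as comments:

# Illustrative only: the input sentence is made up.
text = "He wrote [[The Hitchhiker's Guide to the Galaxy]] with [[Douglas Adams|Adams]]."
aliases, entities, normalizations = get_wp_links(text)
# aliases        -> ["The Hitchhiker's Guide to the Galaxy", "Adams"]
# entities       -> ["The Hitchhiker's Guide to the Galaxy", "Douglas Adams"]
# normalizations -> [True, False]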
|
@ -1,852 +0,0 @@
|
|||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
"""
|
||||
Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm.
|
||||
"""
|
||||
import re
|
||||
import csv
|
||||
import json
|
||||
import spacy
|
||||
import datetime
|
||||
import bz2
|
||||
|
||||
from spacy.kb import KnowledgeBase
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
# requires: pip install neuralcoref --no-binary neuralcoref
|
||||
# import neuralcoref
|
||||
|
||||
# TODO: remove hardcoded paths
|
||||
WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2'
|
||||
ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2'
|
||||
ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2'
|
||||
|
||||
PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv'
|
||||
ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv'
|
||||
ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
|
||||
|
||||
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
|
||||
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
|
||||
|
||||
TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
|
||||
TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
|
||||
|
||||
|
||||
# these will/should be matched ignoring case
|
||||
wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons",
|
||||
"d", "dbdump", "download", "Draft", "Education", "Foundation",
|
||||
"Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator",
|
||||
"m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki",
|
||||
"MediaZilla", "Meta", "Metawikipedia", "Module",
|
||||
"mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki",
|
||||
"Portal", "phab", "Phabricator", "Project", "q", "quality", "rev",
|
||||
"s", "spcom", "Special", "species", "Strategy", "sulutil", "svn",
|
||||
"Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki",
|
||||
"User", "User talk", "v", "voy",
|
||||
"w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews",
|
||||
"Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech",
|
||||
"Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"]
|
||||
|
||||
map_alias_to_link = dict()
|
||||
|
||||
|
||||
def read_wikipedia_prior_probs():
|
||||
"""
|
||||
STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities
|
||||
The full file takes about 2h to parse 1100M lines (update printed every 5M lines).
|
||||
It works relatively fast because we don't care about which article we parsed the interwiki from,
|
||||
we just process line by line.
|
||||
"""
|
||||
|
||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line:
|
||||
if cnt % 5000000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
|
||||
aliases, entities, normalizations = _get_wp_links(clean_line)
|
||||
for alias, entity, norm in zip(aliases, entities, normalizations):
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
_store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
# write all aliases and their entities and occurrences to file
|
||||
with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile:
|
||||
outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
|
||||
for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
|
||||
for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True):
|
||||
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
|
||||
|
||||
|
||||
# find the links
|
||||
link_regex = re.compile(r'\[\[[^\[\]]*\]\]')
|
||||
|
||||
# match on interwiki links, e.g. `en:` or `:fr:`
|
||||
ns_regex = r":?" + "[a-z][a-z]" + ":"
|
||||
|
||||
# match on Namespace: optionally preceded by a :
|
||||
for ns in wiki_namespaces:
|
||||
ns_regex += "|" + ":?" + ns + ":"
|
||||
|
||||
ns_regex = re.compile(ns_regex, re.IGNORECASE)
|
||||
|
||||
|
||||
def _get_wp_links(text):
|
||||
aliases = []
|
||||
entities = []
|
||||
normalizations = []
|
||||
|
||||
matches = link_regex.findall(text)
|
||||
for match in matches:
|
||||
match = match[2:][:-2].replace("_", " ").strip()
|
||||
|
||||
if ns_regex.match(match):
|
||||
pass # ignore namespaces at the beginning of the string
|
||||
|
||||
# this is a simple link, with the alias the same as the mention
|
||||
elif "|" not in match:
|
||||
aliases.append(match)
|
||||
entities.append(match)
|
||||
normalizations.append(True)
|
||||
|
||||
# in wiki format, the link is written as [[entity|alias]]
|
||||
else:
|
||||
splits = match.split("|")
|
||||
entity = splits[0].strip()
|
||||
alias = splits[1].strip()
|
||||
# specific wiki format [[alias (specification)|]]
|
||||
if len(alias) == 0 and "(" in entity:
|
||||
alias = entity.split("(")[0]
|
||||
aliases.append(alias)
|
||||
entities.append(entity)
|
||||
normalizations.append(False)
|
||||
else:
|
||||
aliases.append(alias)
|
||||
entities.append(entity)
|
||||
normalizations.append(False)
|
||||
|
||||
return aliases, entities, normalizations
|
||||
|
||||
|
||||
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
|
||||
alias = alias.strip()
|
||||
entity = entity.strip()
|
||||
|
||||
# remove everything after # as this is not part of the title but refers to a specific paragraph
|
||||
if normalize_entity:
|
||||
# wikipedia titles are always capitalized
|
||||
entity = _capitalize_first(entity.split("#")[0])
|
||||
if normalize_alias:
|
||||
alias = alias.split("#")[0]
|
||||
|
||||
if alias and entity:
|
||||
alias_dict = map_alias_to_link.get(alias, dict())
|
||||
entity_count = alias_dict.get(entity, 0)
|
||||
alias_dict[entity] = entity_count + 1
|
||||
map_alias_to_link[alias] = alias_dict
|
||||
|
||||
|
||||
def _capitalize_first(text):
|
||||
if not text:
|
||||
return None
|
||||
result = text[0].capitalize()
|
||||
if len(result) > 0:
|
||||
result += text[1:]
|
||||
return result
|
||||
|
||||
|
||||
def write_entity_counts(to_print=False):
|
||||
""" STEP 2: write entity counts """
|
||||
entity_to_count = dict()
|
||||
total_count = 0
|
||||
|
||||
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
|
||||
while line:
|
||||
splits = line.replace('\n', "").split(sep='|')
|
||||
# alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
current_count = entity_to_count.get(entity, 0)
|
||||
entity_to_count[entity] = current_count + count
|
||||
|
||||
total_count += count
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file:
|
||||
entity_file.write("entity" + "|" + "count" + "\n")
|
||||
for entity, count in entity_to_count.items():
|
||||
entity_file.write(entity + "|" + str(count) + "\n")
|
||||
|
||||
if to_print:
|
||||
for entity, count in entity_to_count.items():
|
||||
print("Entity count:", entity, count)
|
||||
print("Total count:", total_count)
|
||||
|
||||
|
||||
def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True):
|
||||
""" STEP 3: create the knowledge base """
|
||||
kb = KnowledgeBase(vocab=vocab)
|
||||
|
||||
print()
|
||||
print("1. _read_wikidata_entities", datetime.datetime.now())
|
||||
print()
|
||||
# title_to_id = _read_wikidata_entities_regex_depr(limit=1000)
|
||||
title_to_id = _read_wikidata_entities_json(limit=None)
|
||||
|
||||
# write the title-ID mapping to file
|
||||
if write_entity_defs:
|
||||
with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file:
|
||||
entity_file.write("WP_title" + "|" + "WD_id" + "\n")
|
||||
for title, qid in title_to_id.items():
|
||||
entity_file.write(title + "|" + str(qid) + "\n")
|
||||
|
||||
title_list = list(title_to_id.keys())
|
||||
entity_list = [title_to_id[x] for x in title_list]
|
||||
|
||||
print()
|
||||
print("2. _get_entity_frequencies", datetime.datetime.now())
|
||||
print()
|
||||
entity_frequencies = _get_entity_frequencies(entities=title_list)
|
||||
|
||||
print()
|
||||
print("3. adding", len(entity_list), "entities", datetime.datetime.now())
|
||||
print()
|
||||
kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None)
|
||||
|
||||
print()
|
||||
print("4. adding aliases", datetime.datetime.now())
|
||||
print()
|
||||
_add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ)
|
||||
|
||||
# TODO: read wikipedia texts for entity context
|
||||
# _read_wikipedia()
|
||||
|
||||
if to_print:
|
||||
print()
|
||||
print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases())
|
||||
|
||||
print("done with kb", datetime.datetime.now())
|
||||
|
||||
return kb
|
||||
|
||||
|
||||
def _get_entity_frequencies(entities):
|
||||
entity_to_count = dict()
|
||||
with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter='|')
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_count[row[0]] = int(row[1])
|
||||
|
||||
return [entity_to_count.get(e, 0) for e in entities]
|
||||
|
||||
|
||||
def _get_entity_to_id():
|
||||
entity_to_id = dict()
|
||||
with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile:
|
||||
csvreader = csv.reader(csvfile, delimiter='|')
|
||||
# skip header
|
||||
next(csvreader)
|
||||
for row in csvreader:
|
||||
entity_to_id[row[0]] = row[1]
|
||||
|
||||
return entity_to_id
|
||||
|
||||
|
||||
def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False):
|
||||
wp_titles = title_to_id.keys()
|
||||
|
||||
if to_print:
|
||||
print("wp titles:", wp_titles)
|
||||
|
||||
# adding aliases with prior probabilities
|
||||
with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file:
|
||||
# skip header
|
||||
prior_file.readline()
|
||||
line = prior_file.readline()
|
||||
# we can read this file sequentially, it's sorted by alias, and then by count
|
||||
previous_alias = None
|
||||
total_count = 0
|
||||
counts = list()
|
||||
entities = list()
|
||||
while line:
|
||||
splits = line.replace('\n', "").split(sep='|')
|
||||
new_alias = splits[0]
|
||||
count = int(splits[1])
|
||||
entity = splits[2]
|
||||
|
||||
if new_alias != previous_alias and previous_alias:
|
||||
# done reading the previous alias --> output
|
||||
if len(entities) > 0:
|
||||
selected_entities = list()
|
||||
prior_probs = list()
|
||||
for ent_count, ent_string in zip(counts, entities):
|
||||
if ent_string in wp_titles:
|
||||
wd_id = title_to_id[ent_string]
|
||||
p_entity_givenalias = ent_count / total_count
|
||||
selected_entities.append(wd_id)
|
||||
prior_probs.append(p_entity_givenalias)
|
||||
|
||||
if selected_entities:
|
||||
try:
|
||||
kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs)
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
total_count = 0
|
||||
counts = list()
|
||||
entities = list()
|
||||
|
||||
total_count += count
|
||||
|
||||
if len(entities) < max_entities_per_alias and count >= min_occ:
|
||||
counts.append(count)
|
||||
entities.append(entity)
|
||||
previous_alias = new_alias
|
||||
|
||||
line = prior_file.readline()
|
||||
|
||||
if to_print:
|
||||
print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings())
|
||||
|
||||
|
||||
def _read_wikidata_entities_json(limit=None, to_print=False):
|
||||
""" Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """
|
||||
|
||||
languages = {'en', 'de'}
|
||||
prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
|
||||
site_filter = 'enwiki'
|
||||
|
||||
title_to_id = dict()
|
||||
|
||||
# parse appropriate fields - depending on what we need in the KB
|
||||
parse_properties = False
|
||||
parse_sitelinks = True
|
||||
parse_labels = False
|
||||
parse_descriptions = False
|
||||
parse_aliases = False
|
||||
|
||||
with bz2.open(WIKIDATA_JSON, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
clean_line = clean_line[:-1]
|
||||
if len(clean_line) > 1:
|
||||
obj = json.loads(clean_line)
|
||||
entry_type = obj["type"]
|
||||
|
||||
if entry_type == "item":
|
||||
# filtering records on their properties
|
||||
keep = False
|
||||
|
||||
claims = obj["claims"]
|
||||
for prop, value_set in prop_filter.items():
|
||||
claim_property = claims.get(prop, None)
|
||||
if claim_property:
|
||||
for cp in claim_property:
|
||||
cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id')
|
||||
cp_rank = cp['rank']
|
||||
if cp_rank != "deprecated" and cp_id in value_set:
|
||||
keep = True
|
||||
|
||||
if keep:
|
||||
unique_id = obj["id"]
|
||||
|
||||
if to_print:
|
||||
print("ID:", unique_id)
|
||||
print("type:", entry_type)
|
||||
|
||||
# parsing all properties that refer to other entities
|
||||
if parse_properties:
|
||||
for prop, claim_property in claims.items():
|
||||
cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')]
|
||||
cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None]
|
||||
if cp_values:
|
||||
if to_print:
|
||||
print("prop:", prop, cp_values)
|
||||
|
||||
if parse_sitelinks:
|
||||
site_value = obj["sitelinks"].get(site_filter, None)
|
||||
if site_value:
|
||||
site = site_value['title']
|
||||
if to_print:
|
||||
print(site_filter, ":", site)
|
||||
title_to_id[site] = unique_id
|
||||
# print(site, "for", unique_id)
|
||||
|
||||
if parse_labels:
|
||||
labels = obj["labels"]
|
||||
if labels:
|
||||
for lang in languages:
|
||||
lang_label = labels.get(lang, None)
|
||||
if lang_label:
|
||||
if to_print:
|
||||
print("label (" + lang + "):", lang_label["value"])
|
||||
|
||||
if parse_descriptions:
|
||||
descriptions = obj["descriptions"]
|
||||
if descriptions:
|
||||
for lang in languages:
|
||||
lang_descr = descriptions.get(lang, None)
|
||||
if lang_descr:
|
||||
if to_print:
|
||||
print("description (" + lang + "):", lang_descr["value"])
|
||||
|
||||
if parse_aliases:
|
||||
aliases = obj["aliases"]
|
||||
if aliases:
|
||||
for lang in languages:
|
||||
lang_aliases = aliases.get(lang, None)
|
||||
if lang_aliases:
|
||||
for item in lang_aliases:
|
||||
if to_print:
|
||||
print("alias (" + lang + "):", item["value"])
|
||||
|
||||
if to_print:
|
||||
print()
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
return title_to_id
|
||||
|
||||
|
||||
def _read_wikidata_entities_regex_depr(limit=None, to_print=False):
|
||||
""" Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. """
|
||||
|
||||
regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE)
|
||||
regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE)
|
||||
regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE)
|
||||
regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE)
|
||||
|
||||
title_to_id = dict()
|
||||
|
||||
with bz2.open(WIKIDATA_JSON, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 500000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump")
|
||||
clean_line = line.strip()
|
||||
if clean_line.endswith(b","):
|
||||
clean_line = clean_line[:-1]
|
||||
if len(clean_line) > 1:
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
keep = False
|
||||
|
||||
p31_matches = regex_p31.findall(clean_line)
|
||||
if p31_matches:
|
||||
for p31_match in p31_matches:
|
||||
id_matches = regex_id.findall(p31_match)
|
||||
for id_match in id_matches:
|
||||
id_match = id_match[6:][:-1]
|
||||
if id_match == "Q5" or id_match == "Q15632617":
|
||||
keep = True
|
||||
|
||||
if keep:
|
||||
id_match = regex_id.search(clean_line).group(0)
|
||||
id_match = id_match[6:][:-1]
|
||||
|
||||
enwiki_matches = regex_enwiki.findall(clean_line)
|
||||
if enwiki_matches:
|
||||
for enwiki_match in enwiki_matches:
|
||||
title_match = regex_title.search(enwiki_match).group(0)
|
||||
title = title_match[9:][:-1]
|
||||
title_to_id[title] = id_match
|
||||
# print(title, "for", id_match)
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
return title_to_id
|
||||
|
||||
|
||||
def test_kb(kb):
|
||||
# TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
|
||||
el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb})
|
||||
nlp.add_pipe(el_pipe, last=True)
|
||||
|
||||
candidates = my_kb.get_candidates("Bush")
|
||||
|
||||
print("generating candidates for 'Bush' :")
|
||||
for c in candidates:
|
||||
print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")")
|
||||
print()
|
||||
|
||||
text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \
|
||||
"Douglas reminds us to always bring our towel. " \
|
||||
"The main character in Doug's novel is the man Arthur Dent, " \
|
||||
"but Douglas doesn't write about George Washington or Homer Simpson."
|
||||
doc = nlp(text)
|
||||
|
||||
for ent in doc.ents:
|
||||
print("ent", ent.text, ent.label_, ent.kb_id_)
|
||||
|
||||
|
||||
def add_coref():
|
||||
""" STEP 5: add coreference resolution to our model """
|
||||
nlp = spacy.load('en_core_web_sm')
|
||||
# nlp = spacy.load('en')
|
||||
|
||||
# TODO: this doesn't work yet
|
||||
# neuralcoref.add_to_pipe(nlp)
|
||||
print("done adding to pipe")
|
||||
|
||||
doc = nlp(u'My sister has a dog. She loves him.')
|
||||
print("done doc")
|
||||
|
||||
print(doc._.has_coref)
|
||||
print(doc._.coref_clusters)
|
||||
|
||||
|
||||
def create_training(kb):
|
||||
if not kb:
|
||||
raise ValueError("kb should be defined")
|
||||
# nlp = spacy.load('en_core_web_sm')
|
||||
wp_to_id = _get_entity_to_id()
|
||||
_read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset
|
||||
|
||||
|
||||
def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
||||
"""
|
||||
Read the XML wikipedia data to parse out training data:
|
||||
raw text data + positive and negative instances
|
||||
"""
|
||||
|
||||
title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
|
||||
id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
|
||||
|
||||
read_ids = set()
|
||||
|
||||
entityfile_loc = TRAINING_OUTPUT_SET_DIR + "/" + "gold_entities.csv"
|
||||
with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
|
||||
# write entity training header file
|
||||
_write_training_entity(outputfile=entityfile,
|
||||
article_id="article_id",
|
||||
alias="alias",
|
||||
entity="entity",
|
||||
correct="correct")
|
||||
|
||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||
line = file.readline()
|
||||
cnt = 0
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
while line and (not limit or cnt < limit):
|
||||
if cnt % 1000000 == 0:
|
||||
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
|
||||
clean_line = line.strip().decode("utf-8")
|
||||
# print(clean_line)
|
||||
|
||||
if clean_line == "<revision>":
|
||||
reading_revision = True
|
||||
elif clean_line == "</revision>":
|
||||
reading_revision = False
|
||||
|
||||
# Start reading new page
|
||||
if clean_line == "<page>":
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
|
||||
# finished reading this page
|
||||
elif clean_line == "</page>":
|
||||
if article_id:
|
||||
try:
|
||||
_process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip())
|
||||
# on a previous run, an error occurred after 46M lines and 2h
|
||||
except Exception as e:
|
||||
print("Error processing article", article_id, article_title)
|
||||
print(e)
|
||||
else:
|
||||
print("Done processing a page, but couldn't find an article_id ?")
|
||||
print(article_title)
|
||||
print(article_text)
|
||||
article_text = ""
|
||||
article_title = None
|
||||
article_id = None
|
||||
reading_text = False
|
||||
reading_revision = False
|
||||
|
||||
# start reading text within a page
|
||||
if "<text" in clean_line:
|
||||
reading_text = True
|
||||
|
||||
if reading_text:
|
||||
article_text += " " + clean_line
|
||||
|
||||
# stop reading text within a page (we assume a new page doesn't start on the same line)
|
||||
if "</text" in clean_line:
|
||||
reading_text = False
|
||||
|
||||
# read the ID of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
ids = id_regex.search(clean_line)
|
||||
if ids:
|
||||
article_id = ids[0]
|
||||
if article_id in read_ids:
|
||||
print("Found duplicate article ID", article_id, clean_line) # This should never happen ...
|
||||
read_ids.add(article_id)
|
||||
|
||||
# read the title of this article (outside the revision portion of the document)
|
||||
if not reading_revision:
|
||||
titles = title_regex.search(clean_line)
|
||||
if titles:
|
||||
article_title = titles[0].strip()
|
||||
|
||||
line = file.readline()
|
||||
cnt += 1
|
||||
|
||||
|
||||
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')


def _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text):
    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # print("WP article", article_id, ":", article_title)
    # print(text)

    # get the raw text without markup etc
    clean_text = _get_clean_wp_text(text)
    # print(clean_text)

    article_dict = dict()
    ambiguous_aliases = set()
    aliases, entities, normalizations = _get_wp_links(text)
    for alias, entity, norm in zip(aliases, entities, normalizations):
        if alias not in ambiguous_aliases:
            entity_id = wp_to_id.get(entity)
            if entity_id:
                # TODO: take care of these conflicts! Currently they are being removed from the dataset
                if article_dict.get(alias) and article_dict[alias] != entity_id:
                    ambiguous_aliases.add(alias)
                    article_dict.pop(alias)
                    # print("Found conflicting alias", alias, "in article", article_id, article_title)
                else:
                    article_dict[alias] = entity_id

    for alias, entity in article_dict.items():
        # print(alias, "-->", entity)
        candidates = kb.get_candidates(alias)

        # as training data, we only store entities that are sufficiently ambiguous
        if len(candidates) > 1:
            _write_training_article(article_id=article_id, clean_text=clean_text)

            # write all incorrect candidates
            for c in candidates:
                if entity != c.entity_:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=alias,
                                           entity=c.entity_,
                                           correct="0")

            # write the one correct candidate
            _write_training_entity(outputfile=entityfile,
                                   article_id=article_id,
                                   alias=alias,
                                   entity=entity,
                                   correct="1")

    # _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict)

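# For reference, the rows that the function above appends to gold_entities.csv for one
# sufficiently ambiguous alias look roughly like this (IDs and values are made up for
# illustration):
#
#   article_id|alias|entity|correct
#   12|Douglas|Q1000|0
#   12|Douglas|Q2000|0
#   12|Douglas|Q3000|1
#
# i.e. every incorrect KB candidate is written with correct="0", and the entity that
# was actually linked in the article gets correct="1".
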
# regexes used to strip wiki markup from the raw article text; note that the text in
# the dump is still XML-escaped, so HTML-like tags appear as &lt;...&gt;
info_regex = re.compile(r'{[^{]*?}')                       # innermost {...} of nested {{templates}}
interwiki_regex = re.compile(r'\[\[([^|]*?)]]')            # [[link]]
interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]')  # [[link|alternative name]]
html_regex = re.compile(r'&lt;!--[^!]*--&gt;')             # HTML comments
category_regex = re.compile(r'\[\[Category:[^\[]*]]')      # category links
file_regex = re.compile(r'\[\[File:[^[\]]+]]')             # file/image links
ref_regex = re.compile(r'&lt;ref.*?&gt;')                  # opening ref tags (non-greedy)
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')               # closing ref tags (non-greedy)

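# Illustrative notes (nothing here is executed): substituting interwiki_regex by its
# captured group rewrites "[[Python (programming language)]]" to
# "Python (programming language)", while interwiki_2_regex rewrites
# "[[Python (programming language)|Python]]" to "Python". info_regex only matches an
# innermost {...} pair, which is why _get_clean_wp_text below applies it in a loop
# until the text stops shrinking, peeling nested templates one layer at a time.
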
def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove simple interwiki links (no alternative name)
    clean_text = interwiki_regex.sub(r'\1', clean_text)

    # remove interwiki links with an alternative name, keeping only that name
    clean_text = interwiki_2_regex.sub(r'\1', clean_text)

    # remove HTML comments
    clean_text = html_regex.sub('', clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)

    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')

    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')

    return clean_text.strip()

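# A minimal, illustrative sketch (not called anywhere in this script) of what the
# cleanup above does. The article text is made up and kept in the XML-escaped form in
# which it appears in the dump:
def _demo_clean_wp_text():
    raw = ("'''Douglas Adams''' was an [[England|English]] writer, known for "
           "''[[The Hitchhiker's Guide to the Galaxy]]''. "
           "{{Infobox writer|name=Douglas Adams}} &lt;!-- cleanup --&gt;")
    return _get_clean_wp_text(raw)
    # -> "Douglas Adams was an English writer, known for The Hitchhiker's Guide to the Galaxy."
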
def _write_training_article(article_id, clean_text):
    file_loc = TRAINING_OUTPUT_SET_DIR + "/" + str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, correct):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n")

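# Illustrative sketch (an assumption, not part of this script) of how the resulting
# gold_entities.csv could be read back later when assembling the actual training set;
# the helper name is hypothetical:
def _demo_read_gold_entities(entityfile_loc):
    import csv  # local import so the sketch stays self-contained
    with open(entityfile_loc, mode='r', encoding='utf8') as entityfile:
        reader = csv.DictReader(entityfile, delimiter='|')
        for row in reader:
            yield row["article_id"], row["alias"], row["entity"], row["correct"] == "1"
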
def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict):
    doc = nlp(clean_text)
    for ent in doc.ents:
        if ent.label_ == "PERSON":  # TODO: expand to non-persons
            ent_id = article_dict.get(ent.text)
            if ent_id:
                print(" -", ent.text, ent.label_, ent_id)
            else:
                print(" -", ent.text, ent.label_, '???')  # TODO: investigate these cases

if __name__ == "__main__":
    print("START", datetime.datetime.now())
    print()
    my_kb = None

    # one-time methods to create KB and write to file
    to_create_prior_probs = False
    to_create_entity_counts = False
    to_create_kb = False

    # read KB back in from file
    to_read_kb = True
    to_test_kb = False

    create_wp_training = True

    # STEP 1 : create prior probabilities from WP
    # run only once !
    if to_create_prior_probs:
        print("STEP 1: to_create_prior_probs", datetime.datetime.now())
        read_wikipedia_prior_probs()
        print()

    # STEP 2 : deduce entity frequencies from WP
    # run only once !
    if to_create_entity_counts:
        print("STEP 2: to_create_entity_counts", datetime.datetime.now())
        write_entity_counts()
        print()

    # STEP 3 : create KB and write to file
    # run only once !
    if to_create_kb:
        print("STEP 3a: to_create_kb", datetime.datetime.now())
        my_nlp = spacy.load('en_core_web_sm')
        my_vocab = my_nlp.vocab
        my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        print("STEP 3b: write KB", datetime.datetime.now())
        my_kb.dump(KB_FILE)
        my_vocab.to_disk(VOCAB_DIR)
        print()

    # STEP 4 : read KB back in from file
    if to_read_kb:
        print("STEP 4: to_read_kb", datetime.datetime.now())
        my_vocab = Vocab()
        my_vocab.from_disk(VOCAB_DIR)
        my_kb = KnowledgeBase(vocab=my_vocab)
        my_kb.load_bulk(KB_FILE)
        print("kb entities:", my_kb.get_size_entities())
        print("kb aliases:", my_kb.get_size_aliases())
        print()

        # test KB
        if to_test_kb:
            test_kb(my_kb)
            print()

    # STEP 5: create a training dataset from WP
    if create_wp_training:
        print("STEP 5: create training dataset", datetime.datetime.now())
        create_training(my_kb)

    # TODO coreference resolution
    # add_coref()

    print()
    print("STOP", datetime.datetime.now())
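# Rough overview of what each step above leaves on disk (the path constants KB_FILE,
# VOCAB_DIR, ENWIKI_DUMP and TRAINING_OUTPUT_SET_DIR are presumably defined near the
# top of this file): STEP 1 writes the alias-to-entity prior-probability file, STEP 2
# the per-entity frequency counts, STEP 3 dumps the knowledge base to KB_FILE and its
# vocab to VOCAB_DIR, and STEP 5 writes gold_entities.csv plus one <article_id>.txt
# file per sufficiently ambiguous training article into TRAINING_OUTPUT_SET_DIR.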