# coding: utf-8
from __future__ import unicode_literals
import os
import re
import bz2
import datetime
from spacy.gold import GoldParse
from . import wikipedia_processor as wp, kb_creator
"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm
"""
# ENTITY_FILE = "gold_entities.csv"
ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing
def create_training(entity_def_input, training_output):
wp_to_id = kb_creator._get_entity_to_id(entity_def_input)
_process_wikipedia_texts(wp_to_id, training_output, limit=None)
def _process_wikipedia_texts(wp_to_id, training_output, limit=None):
"""
Read the XML wikipedia data to parse out training data:
raw text data + positive instances
"""
    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
read_ids = set()
entityfile_loc = training_output + "/" + ENTITY_FILE
with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
# write entity training header file
_write_training_entity(outputfile=entityfile,
article_id="article_id",
alias="alias",
entity="WD_id",
start="start",
end="end")
with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file:
line = file.readline()
cnt = 0
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
while line and (not limit or cnt < limit):
if cnt % 1000000 == 0:
print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
clean_line = line.strip().decode("utf-8")
if clean_line == "":
reading_revision = True
elif clean_line == "":
reading_revision = False
# Start reading new page
if clean_line == "":
article_text = ""
article_title = None
article_id = None
# finished reading this page
elif clean_line == "":
if article_id:
try:
_process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output)
except Exception as e:
print("Error processing article", article_id, article_title, e)
else:
print("Done processing a page, but couldn't find an article_id ?", article_title)
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
# start reading text within a page
if ").*(?= 2:
reading_special_case = True
if open_read == 2 and reading_text:
reading_text = False
reading_entity = True
reading_mention = False
# we just finished reading an entity
if open_read == 0 and not reading_text:
if '#' in entity_buffer or entity_buffer.startswith(':'):
reading_special_case = True
# Ignore cases with nested structures like File: handles etc
if not reading_special_case:
if not mention_buffer:
mention_buffer = entity_buffer
start = len(final_text)
end = start + len(mention_buffer)
qid = wp_to_id.get(entity_buffer, None)
if qid:
_write_training_entity(outputfile=entityfile,
article_id=article_id,
alias=mention_buffer,
entity=qid,
start=start,
end=end)
found_entities = True
final_text += mention_buffer
entity_buffer = ""
mention_buffer = ""
reading_text = True
reading_entity = False
reading_mention = False
reading_special_case = False
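
    # only write the cleaned article text to file if at least one interwiki link resolved to a WikiData QID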
if found_entities:
_write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output)
info_regex = re.compile(r'{[^{]*?}')
html_regex = re.compile(r'&lt;!--[^-]*--&gt;')
category_regex = re.compile(r'\[\[Category:[^\[]*]]')
file_regex = re.compile(r'\[\[File:[^[\]]+]]')
ref_regex = re.compile(r'&lt;ref.*?&gt;')  # non-greedy
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy
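
# NB: within the dump's <text> elements, angle brackets are XML-escaped,
# so these patterns match the escaped forms (&lt; / &gt;) rather than literal < and >.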
def _get_clean_wp_text(article_text):
clean_text = article_text.strip()
# remove bolding & italic markup
clean_text = clean_text.replace('\'\'\'', '')
clean_text = clean_text.replace('\'\'', '')
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
try_again = True
previous_length = len(clean_text)
while try_again:
clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested {
if len(clean_text) < previous_length:
try_again = True
else:
try_again = False
previous_length = len(clean_text)
# remove HTML comments
    clean_text = html_regex.sub('', clean_text)
# remove Category and File statements
clean_text = category_regex.sub('', clean_text)
clean_text = file_regex.sub('', clean_text)
# remove multiple =
while '==' in clean_text:
clean_text = clean_text.replace("==", "=")
clean_text = clean_text.replace(". =", ".")
clean_text = clean_text.replace(" = ", ". ")
clean_text = clean_text.replace("= ", ".")
clean_text = clean_text.replace(" =", "")
# remove refs (non-greedy match)
clean_text = ref_regex.sub('', clean_text)
clean_text = ref_2_regex.sub('', clean_text)
# remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change XML-escaped special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')
# remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')
return clean_text.strip()
def _write_training_article(article_id, clean_text, training_output):
file_loc = training_output + "/" + str(article_id) + ".txt"
with open(file_loc, mode='w', encoding='utf8') as outputfile:
outputfile.write(clean_text)
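
# Each line in the gold entities file is pipe-delimited standoff data:
# article_id|alias|WD_id|start|end, where start and end are character offsets
# of the mention into the cleaned article text written by _write_training_article.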
def _write_training_entity(outputfile, article_id, alias, entity, start, end):
outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n")
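
# Deterministic train/dev split: articles whose ID ends in "3" (roughly one in ten)
# are held out as the dev set, so the same split is reproduced on every run.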
def is_dev(article_id):
return article_id.endswith("3")
def read_training(nlp, training_dir, dev, limit):
# This method provides training examples that correspond to the entity annotations found by the nlp object
entityfile_loc = training_dir + "/" + ENTITY_FILE
data = []
# we assume the data is written sequentially
current_article_id = None
current_doc = None
gold_entities = list()
ents_by_offset = dict()
skip_articles = set()
total_entities = 0
with open(entityfile_loc, mode='r', encoding='utf8') as file:
for line in file:
if not limit or len(data) < limit:
if len(data) > 0 and len(data) % 50 == 0:
print("Read", total_entities, "entities in", len(data), "articles")
fields = line.replace('\n', "").split(sep='|')
article_id = fields[0]
alias = fields[1]
wp_title = fields[2]
start = fields[3]
end = fields[4]
if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles:
if not current_doc or (current_article_id != article_id):
# store the data from the previous article
if gold_entities and current_doc:
gold = GoldParse(doc=current_doc, links=gold_entities)
data.append((current_doc, gold))
total_entities += len(gold_entities)
# parse the new article text
file_name = article_id + ".txt"
try:
with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f:
text = f.read()
current_doc = nlp(text)
for ent in current_doc.ents:
ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent.text
except Exception as e:
print("Problem parsing article", article_id, e)
current_article_id = article_id
gold_entities = list()
# repeat checking this condition in case an exception was thrown
if current_doc and (current_article_id == article_id):
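                        # only keep a gold link if the nlp object produced an entity span at exactly
                        # these character offsets; if the span text differs from the alias, the article
                        # is added to skip_articles so its remaining mentions are ignored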
found_ent = ents_by_offset.get(start + "_" + end, None)
if found_ent:
if found_ent != alias:
skip_articles.add(current_article_id)
else:
gold_entities.append((int(start), int(end), wp_title))
print("Read", total_entities, "entities in", len(data), "articles")
return data