spaCy/bin/wiki_entity_linking/training_set_creator.py


# coding: utf-8
from __future__ import unicode_literals
import os
import random
import re
import bz2
import datetime
from spacy.gold import GoldParse
from bin.wiki_entity_linking import kb_creator
"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
Gold-standard entities are stored in one file in standoff format (by character offset).
"""
ENTITY_FILE = "gold_entities.csv"
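# The gold entities file is pipe-delimited, one mention per row, with character offsets
# into the cleaned article text. An illustrative row (the IDs and offsets are made up):
#
#     article_id|alias|WD_id|start|end
#     12|Douglas Adams|Q42|103|116
#
# The cleaned text of each processed article is written alongside it as
# <article_id>.txt in the same training_output directory.
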
def create_training(wikipedia_input, entity_def_input, training_output):
    wp_to_id = kb_creator.get_entity_to_id(entity_def_input)
    _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None)

def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):
"""
Read the XML wikipedia data to parse out training data:
raw text data + positive instances
"""
2019-07-17 13:17:02 +03:00
title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
read_ids = set()
2019-06-19 10:15:43 +03:00
entityfile_loc = training_output / ENTITY_FILE
2019-07-17 13:17:02 +03:00
with open(entityfile_loc, mode="w", encoding="utf8") as entityfile:
# write entity training header file
2019-07-17 13:17:02 +03:00
_write_training_entity(
outputfile=entityfile,
article_id="article_id",
alias="alias",
entity="WD_id",
start="start",
end="end",
)
with bz2.open(wikipedia_input, mode="rb") as file:
line = file.readline()
cnt = 0
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
while line and (not limit or cnt < limit):
if cnt % 1000000 == 0:
2019-07-17 13:17:02 +03:00
print(
datetime.datetime.now(),
"processed",
cnt,
"lines of Wikipedia dump",
)
clean_line = line.strip().decode("utf-8")
if clean_line == "<revision>":
reading_revision = True
elif clean_line == "</revision>":
reading_revision = False
# Start reading new page
if clean_line == "<page>":
article_text = ""
article_title = None
article_id = None
# finished reading this page
elif clean_line == "</page>":
if article_id:
try:
2019-07-17 13:17:02 +03:00
_process_wp_text(
wp_to_id,
entityfile,
article_id,
article_title,
article_text.strip(),
training_output,
)
except Exception as e:
2019-07-17 13:17:02 +03:00
print(
"Error processing article", article_id, article_title, e
)
else:
2019-07-17 13:17:02 +03:00
print(
"Done processing a page, but couldn't find an article_id ?",
article_title,
)
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
# start reading text within a page
if "<text" in clean_line:
reading_text = True
if reading_text:
article_text += " " + clean_line
# stop reading text within a page (we assume a new page doesn't start on the same line)
if "</text" in clean_line:
reading_text = False
# read the ID of this article (outside the revision portion of the document)
if not reading_revision:
ids = id_regex.search(clean_line)
if ids:
article_id = ids[0]
if article_id in read_ids:
2019-07-17 13:17:02 +03:00
print(
"Found duplicate article ID", article_id, clean_line
) # This should never happen ...
read_ids.add(article_id)
2019-06-19 10:15:43 +03:00
# read the title of this article (outside the revision portion of the document)
if not reading_revision:
titles = title_regex.search(clean_line)
if titles:
article_title = titles[0].strip()
line = file.readline()
cnt += 1
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
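
# Illustrative example of what _process_wp_text (below) does with an interwiki link
# (the article text and offsets are made up): given the cleaned text
#     "He wrote [[Douglas Adams|the author]]'s biography"
# the emitted article text becomes "He wrote the author's biography", and one standoff
# row is written with alias="the author", entity=wp_to_id["Douglas Adams"], start=9, end=19.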
def _process_wp_text(
    wp_to_id, entityfile, article_id, article_title, article_text, training_output
):
    found_entities = False

    # ignore meta Wikipedia pages
    if article_title.startswith("Wikipedia:"):
        return

    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # get the raw text without markup etc, keeping only interwiki links
    clean_text = _get_clean_wp_text(text)

    # read the text char by char to get the right offsets for the interwiki links
    final_text = ""
    open_read = 0
    reading_text = True
    reading_entity = False
    reading_mention = False
    reading_special_case = False
    entity_buffer = ""
    mention_buffer = ""
    for index, letter in enumerate(clean_text):
        if letter == "[":
            open_read += 1
        elif letter == "]":
            open_read -= 1
        elif letter == "|":
            if reading_text:
                final_text += letter
            # switch from reading entity to mention in the [[entity|mention]] pattern
            elif reading_entity:
                reading_text = False
                reading_entity = False
                reading_mention = True
            else:
                reading_special_case = True
        else:
            if reading_entity:
                entity_buffer += letter
            elif reading_mention:
                mention_buffer += letter
            elif reading_text:
                final_text += letter
            else:
                raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])

        if open_read > 2:
            reading_special_case = True

        if open_read == 2 and reading_text:
            reading_text = False
            reading_entity = True
            reading_mention = False

        # we just finished reading an entity
        if open_read == 0 and not reading_text:
            if "#" in entity_buffer or entity_buffer.startswith(":"):
                reading_special_case = True

            # Ignore cases with nested structures like File: handles etc
            if not reading_special_case:
                if not mention_buffer:
                    mention_buffer = entity_buffer
                start = len(final_text)
                end = start + len(mention_buffer)
                qid = wp_to_id.get(entity_buffer, None)
                if qid:
                    _write_training_entity(
                        outputfile=entityfile,
                        article_id=article_id,
                        alias=mention_buffer,
                        entity=qid,
                        start=start,
                        end=end,
                    )
                    found_entities = True
                final_text += mention_buffer

            entity_buffer = ""
            mention_buffer = ""

            reading_text = True
            reading_entity = False
            reading_mention = False
            reading_special_case = False

    if found_entities:
        _write_training_article(
            article_id=article_id,
            clean_text=final_text,
            training_output=training_output,
        )

info_regex = re.compile(r"{[^{]*?}")
htlm_regex = re.compile(r"&lt;!--[^-]*--&gt;")
category_regex = re.compile(r"\[\[Category:[^\[]*]]")
file_regex = re.compile(r"\[\[File:[^[\]]+]]")
ref_regex = re.compile(r"&lt;ref.*?&gt;") # non-greedy
ref_2_regex = re.compile(r"&lt;/ref.*?&gt;") # non-greedy
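
# Illustrative example of the cleanup performed by _get_clean_wp_text (below), on a
# made-up snippet: "'''[[Douglas Adams]]''' {{Infobox writer}} was an English author."
# After stripping the bold markup, iteratively deleting the nested {{...}} template and
# collapsing the double space, this becomes "[[Douglas Adams]] was an English author."
# Note that interwiki links are deliberately kept for _process_wp_text to resolve.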
def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace("'''", "")
    clean_text = clean_text.replace("''", "")

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub(
            "", clean_text
        )  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove HTML comments
    clean_text = htlm_regex.sub("", clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub("", clean_text)
    clean_text = file_regex.sub("", clean_text)

    # remove multiple =
    while "==" in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub("", clean_text)
    clean_text = ref_2_regex.sub("", clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
    clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r"&lt;", "<")
    clean_text = clean_text.replace(r"&gt;", ">")
    clean_text = clean_text.replace(r"&quot;", '"')
    clean_text = clean_text.replace(r"&amp;nbsp;", " ")
    clean_text = clean_text.replace(r"&amp;", "&")

    # collapse multiple spaces into one
    while "  " in clean_text:
        clean_text = clean_text.replace("  ", " ")

    return clean_text.strip()

def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output / (str(article_id) + ".txt")
    with open(file_loc, mode="w", encoding="utf8") as outputfile:
        outputfile.write(clean_text)

def _write_training_entity(outputfile, article_id, alias, entity, start, end):
    outputfile.write(
        article_id
        + "|"
        + alias
        + "|"
        + entity
        + "|"
        + str(start)
        + "|"
        + str(end)
        + "\n"
    )

def is_dev(article_id):
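    # deterministic dev split: articles whose ID ends in "3" (roughly 10% of articles)
    # are reserved for the dev set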
    return article_id.endswith("3")

def read_training(nlp, training_dir, dev, limit, kb=None):
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
When kb is provided, it will include also negative training examples by using the candidate generator.
When kb=None, it will only include positive training examples."""
2019-06-19 10:15:43 +03:00
entityfile_loc = training_dir / ENTITY_FILE
data = []
2019-06-19 10:15:43 +03:00
# assume the data is written sequentially, so we can reuse the article docs
current_article_id = None
current_doc = None
ents_by_offset = dict()
skip_articles = set()
total_entities = 0
2019-07-17 13:17:02 +03:00
with open(entityfile_loc, mode="r", encoding="utf8") as file:
for line in file:
if not limit or len(data) < limit:
2019-07-17 13:17:02 +03:00
fields = line.replace("\n", "").split(sep="|")
article_id = fields[0]
alias = fields[1]
2019-07-17 13:17:02 +03:00
wd_id = fields[2]
start = fields[3]
end = fields[4]
2019-07-17 13:17:02 +03:00
if (
dev == is_dev(article_id)
and article_id != "article_id"
and article_id not in skip_articles
):
if not current_doc or (current_article_id != article_id):
# parse the new article text
file_name = article_id + ".txt"
try:
2019-07-17 13:17:02 +03:00
with open(
os.path.join(training_dir, file_name),
mode="r",
encoding="utf8",
) as f:
text = f.read()
2019-07-17 13:17:02 +03:00
if (
len(text) < 30000
): # threshold for convenience / speed of processing
current_doc = nlp(text)
current_article_id = article_id
ents_by_offset = dict()
for ent in current_doc.ents:
2019-06-18 19:38:09 +03:00
sent_length = len(ent.sent)
# custom filtering to avoid too long or too short sentences
if 5 < sent_length < 100:
2019-07-17 13:17:02 +03:00
ents_by_offset[
str(ent.start_char)
+ "_"
+ str(ent.end_char)
] = ent
else:
2019-06-19 13:35:26 +03:00
skip_articles.add(article_id)
current_doc = None
except Exception as e:
print("Problem parsing article", article_id, e)
2019-06-19 13:35:26 +03:00
skip_articles.add(article_id)
raise e
# repeat checking this condition in case an exception was thrown
if current_doc and (current_article_id == article_id):
2019-07-17 13:17:02 +03:00
found_ent = ents_by_offset.get(start + "_" + end, None)
if found_ent:
if found_ent.text != alias:
2019-06-19 13:35:26 +03:00
skip_articles.add(article_id)
current_doc = None
else:
sent = found_ent.sent.as_doc()
# currently feeding the gold data one entity per sentence at a time
gold_start = int(start) - found_ent.sent.start_char
gold_end = int(end) - found_ent.sent.start_char
2019-07-17 13:17:02 +03:00
2019-07-17 18:18:26 +03:00
# add both pos and neg examples (in random order)
2019-07-17 13:17:02 +03:00
if kb:
gold_entities = {}
2019-07-17 18:18:26 +03:00
candidates = kb.get_candidates(alias)
candidate_ids = [c.entity_ for c in candidates]
# add positive example in case the KB doesn't have it
candidate_ids.append(wd_id)
2019-07-17 13:17:02 +03:00
random.shuffle(candidate_ids)
for kb_id in candidate_ids:
entry = (gold_start, gold_end, kb_id)
if kb_id != wd_id:
gold_entities[entry] = 0.0
else:
gold_entities[entry] = 1.0
else:
2019-07-17 18:18:26 +03:00
entry = (gold_start, gold_end, wd_id)
gold_entities = {entry: 1.0}
2019-07-17 13:17:02 +03:00
2019-06-19 10:15:43 +03:00
gold = GoldParse(doc=sent, links=gold_entities)
data.append((sent, gold))
total_entities += 1
2019-06-19 10:15:43 +03:00
if len(data) % 2500 == 0:
print(" -read", total_entities, "entities")
print(" -read", total_entities, "entities")
2019-06-07 14:54:45 +03:00
return data
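
# Illustrative usage of this module (a sketch, not executed here); the dump file name,
# entity definitions file, output directory and model name below are assumptions:
#
#     from pathlib import Path
#     import spacy
#     from bin.wiki_entity_linking import training_set_creator
#
#     nlp = spacy.load("en_core_web_lg")
#     training_dir = Path("training_data/")
#     training_set_creator.create_training(
#         wikipedia_input="enwiki-latest-pages-articles.xml.bz2",
#         entity_def_input="entity_defs.csv",
#         training_output=training_dir,
#     )
#     train_data = training_set_creator.read_training(
#         nlp, training_dir=training_dir, dev=False, limit=10000
#     )
#
# Each returned item is a (sentence Doc, GoldParse) pair; the GoldParse links dict maps
# (start, end, kb_id) tuples within that sentence to 1.0 for the gold entity and 0.0 for
# negative candidates (negatives are only produced when a kb is passed).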