mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 20:28:20 +03:00
431 lines
17 KiB
Python
431 lines
17 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import os
|
|
import random
|
|
import re
|
|
import bz2
|
|
import datetime
|
|
|
|
from spacy.gold import GoldParse
|
|
from bin.wiki_entity_linking import kb_creator
|
|
|
|
"""
|
|
Process Wikipedia interlinks to generate a training dataset for the entity linking (EL) algorithm.
Gold-standard entities are stored in one file in standoff format (by character offset).
|
|
"""
|
|
|
|
# Name of the file, inside the training output directory, that holds the gold
# entity annotations as "|"-separated rows (written by _write_training_entity).
ENTITY_FILE = "gold_entities.csv"
|
|
|
|
|
|
def now():
    """Return the current local date and time (used to timestamp progress output)."""
    timestamp = datetime.datetime.now()
    return timestamp
|
|
|
|
|
|
def create_training(wikipedia_input, entity_def_input, training_output):
    """
    Create the training data from a Wikipedia dump.

    The mapping returned by kb_creator.get_entity_to_id(entity_def_input) is
    handed to _process_wikipedia_texts, which reads the complete dump (no line
    limit) and writes its results into training_output.
    """
    entity_to_id = kb_creator.get_entity_to_id(entity_def_input)
    _process_wikipedia_texts(
        wikipedia_input=wikipedia_input,
        wp_to_id=entity_to_id,
        training_output=training_output,
        limit=None,
    )
|
|
|
|
|
|
def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive instances

    The bz2-compressed dump is scanned line by line. For each page, the ID and
    title found outside the <revision> element and all lines from the opening
    <text ...> tag up to the closing </text> tag are collected; when </page> is
    reached they are passed to _process_wp_text. A header row is written to the
    entity file before any article is processed.

    wikipedia_input: location of the bz2-compressed XML dump
    wp_to_id: mapping handed through unchanged to _process_wp_text
    training_output: output directory (must support the "/" path operator)
    limit: maximum number of dump lines to read; None or 0 reads everything
    """
    title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
    id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")

    seen_ids = set()
    with open(training_output / ENTITY_FILE, mode="w", encoding="utf8") as entityfile:
        # the first row of the entity file names the columns
        _write_training_entity(
            outputfile=entityfile,
            article_id="article_id",
            alias="alias",
            entity="WD_id",
            start="start",
            end="end",
        )

        with bz2.open(wikipedia_input, mode="rb") as dump:
            text_parts = []
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False

            for cnt, raw_line in enumerate(dump):
                if limit and cnt >= limit:
                    break
                if cnt % 1000000 == 0:
                    print(now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = raw_line.strip().decode("utf-8")

                # keep track of whether we are inside the <revision> element
                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                if clean_line == "<page>":
                    # a new page starts: forget everything about the previous one
                    text_parts = []
                    article_title = None
                    article_id = None
                elif clean_line == "</page>":
                    # the page is complete: process what was collected
                    if not article_id:
                        print(
                            "Done processing a page, but couldn't find an article_id ?",
                            article_title,
                        )
                    else:
                        try:
                            _process_wp_text(
                                wp_to_id,
                                entityfile,
                                article_id,
                                article_title,
                                " ".join(text_parts).strip(),
                                training_output,
                            )
                        except Exception as e:
                            # best effort: report the article and carry on with the dump
                            print(
                                "Error processing article", article_id, article_title, e
                            )
                    text_parts = []
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # the line holding the opening <text ...> tag is part of the text
                if "<text" in clean_line:
                    reading_text = True

                if reading_text:
                    text_parts.append(clean_line)

                # the line holding </text> is the last text line
                # (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # ID and title are only read outside the revision portion of the document
                if not reading_revision:
                    id_match = id_regex.search(clean_line)
                    if id_match:
                        article_id = id_match.group(0)
                        if article_id in seen_ids:
                            # This should never happen ...
                            print(
                                "Found duplicate article ID", article_id, clean_line
                            )
                        seen_ids.add(article_id)

                    title_match = title_regex.search(clean_line)
                    if title_match:
                        article_title = title_match.group(0).strip()
|
|
|
|
|
|
# Matches everything between the opening <text xml:space="preserve"> tag and the
# closing </text tag of a page; used by _process_wp_text to strip the text tags.
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
|
|
|
|
|
|
def _process_wp_text(
    wp_to_id, entityfile, article_id, article_title, article_text, training_output
):
    """
    Extract the plain text and the gold entity links of one Wikipedia article.

    Pages whose title starts with "Wikipedia:" and pages whose text starts with
    "#REDIRECT" are skipped. Otherwise the cleaned text is read character by
    character: [[entity]] and [[entity|mention]] links are replaced by their
    mention, and for every link whose entity is found in wp_to_id one row with
    the character offsets of the mention in the resulting text is written to
    entityfile. The resulting text is written to training_output only if at
    least one such row was written.

    wp_to_id: mapping from link target to the ID stored in the entity file
    entityfile: open, writable text file receiving the entity rows
    article_id: ID of the article, used in the rows and for the article file
    article_title: title of the article
    article_text: collected dump lines of the article, including the text tags
    training_output: output directory for the article text

    Raises ValueError if article_text does not contain the expected
    <text xml:space="preserve"> element.
    """
    found_entities = False

    # ignore meta Wikipedia pages
    if article_title.startswith("Wikipedia:"):
        return

    # remove the text tags; report a missing element explicitly instead of
    # failing with an AttributeError on None
    text_match = text_regex.search(article_text)
    if text_match is None:
        raise ValueError(
            'No <text xml:space="preserve"> element found in article {}'.format(
                article_id
            )
        )
    text = text_match.group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # get the raw text without markup etc, keeping only interwiki links
    clean_text = _get_clean_wp_text(text)

    # read the text char by char to get the right offsets for the interwiki links
    final_text = ""
    open_read = 0  # number of currently open square brackets
    # exactly one of the next three flags is True at any time
    reading_text = True
    reading_entity = False
    reading_mention = False
    reading_special_case = False  # the current link is not a plain interwiki link
    entity_buffer = ""
    mention_buffer = ""
    for index, letter in enumerate(clean_text):
        if letter == "[":
            open_read += 1
        elif letter == "]":
            open_read -= 1
        elif letter == "|":
            if reading_text:
                final_text += letter
            # switch from reading entity to mention in the [[entity|mention]] pattern
            elif reading_entity:
                reading_text = False
                reading_entity = False
                reading_mention = True
            else:
                # a second "|" inside one link: not a plain [[entity|mention]]
                reading_special_case = True
        else:
            if reading_entity:
                entity_buffer += letter
            elif reading_mention:
                mention_buffer += letter
            elif reading_text:
                final_text += letter
            else:
                raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])

        # more than two open brackets means a nested structure
        if open_read > 2:
            reading_special_case = True

        # "[[" was just read: the entity part of a link starts
        if open_read == 2 and reading_text:
            reading_text = False
            reading_entity = True
            reading_mention = False

        # we just finished reading an entity
        if open_read == 0 and not reading_text:
            if "#" in entity_buffer or entity_buffer.startswith(":"):
                reading_special_case = True
            # Ignore cases with nested structures like File: handles etc
            if not reading_special_case:
                # [[entity]] without explicit mention: the entity is the mention
                if not mention_buffer:
                    mention_buffer = entity_buffer
                start = len(final_text)
                end = start + len(mention_buffer)
                qid = wp_to_id.get(entity_buffer, None)
                if qid:
                    _write_training_entity(
                        outputfile=entityfile,
                        article_id=article_id,
                        alias=mention_buffer,
                        entity=qid,
                        start=start,
                        end=end,
                    )
                    found_entities = True
                final_text += mention_buffer

            entity_buffer = ""
            mention_buffer = ""

            reading_text = True
            reading_entity = False
            reading_mention = False
            reading_special_case = False

    if found_entities:
        _write_training_article(
            article_id=article_id,
            clean_text=final_text,
            training_output=training_output,
        )
|
|
|
|
|
|
info_regex = re.compile(r"{[^{]*?}")
|
|
htlm_regex = re.compile(r"<!--[^-]*-->")
|
|
category_regex = re.compile(r"\[\[Category:[^\[]*]]")
|
|
file_regex = re.compile(r"\[\[File:[^[\]]+]]")
|
|
ref_regex = re.compile(r"<ref.*?>") # non-greedy
|
|
ref_2_regex = re.compile(r"</ref.*?>") # non-greedy
|
|
|
|
|
|
def _get_clean_wp_text(article_text):
|
|
clean_text = article_text.strip()
|
|
|
|
# remove bolding & italic markup
|
|
clean_text = clean_text.replace("'''", "")
|
|
clean_text = clean_text.replace("''", "")
|
|
|
|
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
|
|
try_again = True
|
|
previous_length = len(clean_text)
|
|
while try_again:
|
|
clean_text = info_regex.sub(
|
|
"", clean_text
|
|
) # non-greedy match excluding a nested {
|
|
if len(clean_text) < previous_length:
|
|
try_again = True
|
|
else:
|
|
try_again = False
|
|
previous_length = len(clean_text)
|
|
|
|
# remove HTML comments
|
|
clean_text = htlm_regex.sub("", clean_text)
|
|
|
|
# remove Category and File statements
|
|
clean_text = category_regex.sub("", clean_text)
|
|
clean_text = file_regex.sub("", clean_text)
|
|
|
|
# remove multiple =
|
|
while "==" in clean_text:
|
|
clean_text = clean_text.replace("==", "=")
|
|
|
|
clean_text = clean_text.replace(". =", ".")
|
|
clean_text = clean_text.replace(" = ", ". ")
|
|
clean_text = clean_text.replace("= ", ".")
|
|
clean_text = clean_text.replace(" =", "")
|
|
|
|
# remove refs (non-greedy match)
|
|
clean_text = ref_regex.sub("", clean_text)
|
|
clean_text = ref_2_regex.sub("", clean_text)
|
|
|
|
# remove additional wikiformatting
|
|
clean_text = re.sub(r"<blockquote>", "", clean_text)
|
|
clean_text = re.sub(r"</blockquote>", "", clean_text)
|
|
|
|
# change special characters back to normal ones
|
|
clean_text = clean_text.replace(r"<", "<")
|
|
clean_text = clean_text.replace(r">", ">")
|
|
clean_text = clean_text.replace(r""", '"')
|
|
clean_text = clean_text.replace(r"&nbsp;", " ")
|
|
clean_text = clean_text.replace(r"&", "&")
|
|
|
|
# remove multiple spaces
|
|
while " " in clean_text:
|
|
clean_text = clean_text.replace(" ", " ")
|
|
|
|
return clean_text.strip()
|
|
|
|
|
|
def _write_training_article(article_id, clean_text, training_output):
|
|
file_loc = training_output / str(article_id) + ".txt"
|
|
with open(file_loc, mode="w", encoding="utf8") as outputfile:
|
|
outputfile.write(clean_text)
|
|
|
|
|
|
def _write_training_entity(outputfile, article_id, alias, entity, start, end):
|
|
line = "{}|{}|{}|{}|{}\n".format(article_id, alias, entity, start, end)
|
|
outputfile.write(line)
|
|
|
|
|
|
def is_dev(article_id):
    """Return True if the article ID (a string) ends with "3", i.e. the article is dev data."""
    dev_suffix = "3"
    return article_id.endswith(dev_suffix)
|
|
|
|
|
|
def _gold_links_for_sentence(sent, gold_entry, alias, wd_id, kb):
    """
    Build the gold links of one sentence for a single annotated mention.

    Every entity of sent is keyed by its (start_char, end_char) offsets. The
    entity at gold_entry gets the candidate scores (1.0 for wd_id, 0.0 for any
    other candidate of alias in kb, or just {wd_id: 1.0} without a KB); all
    other entities get an empty dictionary.

    RETURNS: (links, found_useful); found_useful is True when the gold mention
             was found and received at least one candidate.
    """
    links = {}
    found_useful = False
    for ent in sent.ents:
        entry = (ent.start_char, ent.end_char)
        if entry != gold_entry:
            # currently feeding the gold data one entity per sentence at a time,
            # setting all other entities to empty gold dictionary
            links[entry] = {}
            continue

        if kb:
            # add both pos and neg examples (in random order)
            # this will exclude examples not in the KB
            value_by_id = {}
            candidate_ids = [c.entity_ for c in kb.get_candidates(alias)]
            random.shuffle(candidate_ids)
            for kb_id in candidate_ids:
                found_useful = True
                value_by_id[kb_id] = 1.0 if kb_id == wd_id else 0.0
        else:
            # if no KB, keep all positive examples
            found_useful = True
            value_by_id = {wd_id: 1.0}
        links[entry] = value_by_id
    return links, found_useful


def read_training(nlp, training_dir, dev, limit, kb=None):
    """ This method provides training examples that correspond to the entity annotations found by the nlp object.
    When kb is provided (for training), it will include negative training examples by using the candidate generator,
    and it will only keep positive training examples that can be found in the KB.
    When kb=None (for testing), it will include all positive examples only.

    nlp: callable turning an article text into a parsed document
    training_dir: directory holding the entity file and the "<article_id>.txt" files
    dev: read the dev articles (True) or all other articles (False), see is_dev
    limit: maximum number of examples to return; None or 0 means no limit
    RETURNS: list of (sentence doc, GoldParse) tuples, one per kept annotation
    """
    entityfile_loc = training_dir / ENTITY_FILE
    data = []

    # assume the data is written sequentially, so we can reuse the article docs
    current_article_id = None
    current_doc = None
    ents_by_offset = dict()
    skip_articles = set()
    total_entities = 0

    with open(entityfile_loc, mode="r", encoding="utf8") as file:
        for line in file:
            # enough examples collected: no need to read the rest of the file
            if limit and len(data) >= limit:
                break

            fields = line.replace("\n", "").split(sep="|")
            article_id = fields[0]
            alias = fields[1]
            wd_id = fields[2]
            start = fields[3]
            end = fields[4]

            # skip the other data split, the header row and rejected articles
            if (
                dev != is_dev(article_id)
                or article_id == "article_id"
                or article_id in skip_articles
            ):
                continue

            if not current_doc or (current_article_id != article_id):
                # parse the new article text
                file_name = article_id + ".txt"
                try:
                    with open(
                        os.path.join(training_dir, file_name),
                        mode="r",
                        encoding="utf8",
                    ) as f:
                        text = f.read()
                    # threshold for convenience / speed of processing
                    if len(text) < 30000:
                        current_doc = nlp(text)
                        current_article_id = article_id
                        ents_by_offset = dict()
                        for ent in current_doc.ents:
                            # custom filtering to avoid too long or too short sentences
                            if 5 < len(ent.sent) < 100:
                                offset = "{}_{}".format(ent.start_char, ent.end_char)
                                ents_by_offset[offset] = ent
                    else:
                        skip_articles.add(article_id)
                        current_doc = None
                except Exception as e:
                    print("Problem parsing article", article_id, e)
                    skip_articles.add(article_id)

            # repeat checking this condition in case an exception was thrown
            if not current_doc or (current_article_id != article_id):
                continue

            # only annotations matching an entity found by the nlp object are used
            found_ent = ents_by_offset.get("{}_{}".format(start, end), None)
            if not found_ent:
                continue

            if found_ent.text != alias:
                # text and annotation do not line up: drop the rest of this article
                skip_articles.add(article_id)
                current_doc = None
                continue

            sent = found_ent.sent.as_doc()

            # offsets of the mention relative to the start of its sentence
            gold_start = int(start) - found_ent.sent.start_char
            gold_end = int(end) - found_ent.sent.start_char

            gold_entities, found_useful = _gold_links_for_sentence(
                sent, (gold_start, gold_end), alias, wd_id, kb
            )
            if found_useful:
                gold = GoldParse(doc=sent, links=gold_entities)
                data.append((sent, gold))
                total_entities += 1
                if len(data) % 2500 == 0:
                    print(" -read", total_entities, "entities")

    print(" -read", total_entities, "entities")
    return data
|