spaCy/bin/wiki_entity_linking/wikipedia_processor.py

555 lines
19 KiB
Python
Raw Normal View History

# coding: utf-8
from __future__ import unicode_literals
import re
import bz2
import logging
import random
import json
from tqdm import tqdm
from functools import partial
from spacy.gold import GoldParse
from bin.wiki_entity_linking import wiki_io as io
from bin.wiki_entity_linking.wiki_namespaces import (
WP_META_NAMESPACE,
WP_FILE_NAMESPACE,
WP_CATEGORY_NAMESPACE,
)
"""
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
2019-06-18 19:38:09 +03:00
Write these results to file for downstream KB and training data generation.
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
"""
ENTITY_FILE = "gold_entities.csv"
map_alias_to_link = dict()
logger = logging.getLogger(__name__)
title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
info_regex = re.compile(r"{[^{]*?}")
html_regex = re.compile(r"&lt;!--[^-]*--&gt;")
ref_regex = re.compile(r"&lt;ref.*?&gt;") # non-greedy
ref_2_regex = re.compile(r"&lt;/ref.*?&gt;") # non-greedy
# find the links
link_regex = re.compile(r"\[\[[^\[\]]*\]\]")
# match on interwiki links, e.g. `en:` or `:fr:`
ns_regex = r":?" + "[a-z][a-z]" + ":"
# match on Namespace: optionally preceded by a :
for ns in WP_META_NAMESPACE:
ns_regex += "|" + ":?" + ns + ":"
ns_regex = re.compile(ns_regex, re.IGNORECASE)
files = r""
for f in WP_FILE_NAMESPACE:
files += "\[\[" + f + ":[^[\]]+]]" + "|"
files = files[0 : len(files) - 1]
file_regex = re.compile(files)
cats = r""
for c in WP_CATEGORY_NAMESPACE:
cats += "\[\[" + c + ":[^\[]*]]" + "|"
cats = cats[0 : len(cats) - 1]
category_regex = re.compile(cats)
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script
2019-08-13 16:38:59 +03:00
def read_prior_probs(wikipedia_input, prior_prob_output, limit=None):
"""
2019-06-19 10:15:43 +03:00
Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities.
The full file takes about 2-3h to parse 1100M lines.
It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from,
though dev test articles are excluded in order not to get an artificially strong baseline.
"""
cnt = 0
read_id = False
current_article_id = None
with bz2.open(wikipedia_input, mode="rb") as file:
line = file.readline()
CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script
2019-08-13 16:38:59 +03:00
while line and (not limit or cnt < limit):
if cnt % 25000000 == 0 and cnt > 0:
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
clean_line = line.strip().decode("utf-8")
# we attempt at reading the article's ID (but not the revision or contributor ID)
if "<revision>" in clean_line or "<contributor>" in clean_line:
read_id = False
if "<page>" in clean_line:
read_id = True
if read_id:
ids = id_regex.search(clean_line)
if ids:
current_article_id = ids[0]
# only processing prior probabilities from true training (non-dev) articles
if not is_dev(current_article_id):
aliases, entities, normalizations = get_wp_links(clean_line)
for alias, entity, norm in zip(aliases, entities, normalizations):
_store_alias(
alias, entity, normalize_alias=norm, normalize_entity=True
)
line = file.readline()
cnt += 1
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
2019-06-19 10:15:43 +03:00
# write all aliases and their entities and count occurrences to file
2019-07-23 13:17:19 +03:00
with prior_prob_output.open("w", encoding="utf8") as outputfile:
outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n")
for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]):
s_dict = sorted(alias_dict.items(), key=lambda x: x[1], reverse=True)
for entity, count in s_dict:
outputfile.write(alias + "|" + str(count) + "|" + entity + "\n")
def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True):
alias = alias.strip()
entity = entity.strip()
# remove everything after # as this is not part of the title but refers to a specific paragraph
if normalize_entity:
# wikipedia titles are always capitalized
entity = _capitalize_first(entity.split("#")[0])
if normalize_alias:
alias = alias.split("#")[0]
if alias and entity:
alias_dict = map_alias_to_link.get(alias, dict())
entity_count = alias_dict.get(entity, 0)
alias_dict[entity] = entity_count + 1
map_alias_to_link[alias] = alias_dict
def get_wp_links(text):
aliases = []
entities = []
normalizations = []
matches = link_regex.findall(text)
for match in matches:
match = match[2:][:-2].replace("_", " ").strip()
if ns_regex.match(match):
pass # ignore the entity if it points to a "meta" page
2019-06-19 10:15:43 +03:00
# this is a simple [[link]], with the alias the same as the mention
elif "|" not in match:
aliases.append(match)
entities.append(match)
normalizations.append(True)
# in wiki format, the link is written as [[entity|alias]]
else:
splits = match.split("|")
entity = splits[0].strip()
alias = splits[1].strip()
# specific wiki format [[alias (specification)|]]
if len(alias) == 0 and "(" in entity:
alias = entity.split("(")[0]
aliases.append(alias)
entities.append(entity)
normalizations.append(False)
else:
aliases.append(alias)
entities.append(entity)
normalizations.append(False)
return aliases, entities, normalizations
def _capitalize_first(text):
if not text:
return None
result = text[0].capitalize()
if len(result) > 0:
result += text[1:]
return result
def create_training_and_desc(
wp_input, def_input, desc_output, training_output, parse_desc, limit=None
):
wp_to_id = io.read_title_to_id(def_input)
_process_wikipedia_texts(
wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
)
def _process_wikipedia_texts(
wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
):
"""
Read the XML wikipedia data to parse out training data:
raw text data + positive instances
"""
read_ids = set()
with output.open("a", encoding="utf8") as descr_file, training_output.open(
"w", encoding="utf8"
) as entity_file:
if parse_descriptions:
_write_training_description(descr_file, "WD_id", "description")
with bz2.open(wikipedia_input, mode="rb") as file:
article_count = 0
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
for line in file:
clean_line = line.strip().decode("utf-8")
if clean_line == "<revision>":
reading_revision = True
elif clean_line == "</revision>":
reading_revision = False
# Start reading new page
if clean_line == "<page>":
article_text = ""
article_title = None
article_id = None
# finished reading this page
elif clean_line == "</page>":
if article_id:
clean_text, entities = _process_wp_text(
article_title, article_text, wp_to_id
)
if clean_text is not None and entities is not None:
_write_training_entities(
entity_file, article_id, clean_text, entities
)
if article_title in wp_to_id and parse_descriptions:
description = " ".join(
clean_text[:1000].split(" ")[:-1]
)
_write_training_description(
descr_file, wp_to_id[article_title], description
)
article_count += 1
if article_count % 10000 == 0 and article_count > 0:
logger.info(
"Processed {} articles".format(article_count)
)
if limit and article_count >= limit:
break
article_text = ""
article_title = None
article_id = None
reading_text = False
reading_revision = False
# start reading text within a page
if "<text" in clean_line:
reading_text = True
if reading_text:
article_text += " " + clean_line
# stop reading text within a page (we assume a new page doesn't start on the same line)
if "</text" in clean_line:
reading_text = False
# read the ID of this article (outside the revision portion of the document)
if not reading_revision:
ids = id_regex.search(clean_line)
if ids:
article_id = ids[0]
if article_id in read_ids:
logger.info(
"Found duplicate article ID", article_id, clean_line
) # This should never happen ...
read_ids.add(article_id)
# read the title of this article (outside the revision portion of the document)
if not reading_revision:
titles = title_regex.search(clean_line)
if titles:
article_title = titles[0].strip()
logger.info("Finished. Processed {} articles".format(article_count))
def _process_wp_text(article_title, article_text, wp_to_id):
# ignore meta Wikipedia pages
if ns_regex.match(article_title):
return None, None
# remove the text tags
text_search = text_regex.search(article_text)
if text_search is None:
return None, None
text = text_search.group(0)
# stop processing if this is a redirect page
if text.startswith("#REDIRECT"):
return None, None
# get the raw text without markup etc, keeping only interwiki links
clean_text, entities = _remove_links(_get_clean_wp_text(text), wp_to_id)
return clean_text, entities
def _get_clean_wp_text(article_text):
clean_text = article_text.strip()
# remove bolding & italic markup
clean_text = clean_text.replace("'''", "")
clean_text = clean_text.replace("''", "")
# remove nested {{info}} statements by removing the inner/smallest ones first and iterating
try_again = True
previous_length = len(clean_text)
while try_again:
clean_text = info_regex.sub(
"", clean_text
) # non-greedy match excluding a nested {
if len(clean_text) < previous_length:
try_again = True
else:
try_again = False
previous_length = len(clean_text)
# remove HTML comments
clean_text = html_regex.sub("", clean_text)
# remove Category and File statements
clean_text = category_regex.sub("", clean_text)
clean_text = file_regex.sub("", clean_text)
# remove multiple =
while "==" in clean_text:
clean_text = clean_text.replace("==", "=")
clean_text = clean_text.replace(". =", ".")
clean_text = clean_text.replace(" = ", ". ")
clean_text = clean_text.replace("= ", ".")
clean_text = clean_text.replace(" =", "")
# remove refs (non-greedy match)
clean_text = ref_regex.sub("", clean_text)
clean_text = ref_2_regex.sub("", clean_text)
# remove additional wikiformatting
clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)
# change special characters back to normal ones
clean_text = clean_text.replace(r"&lt;", "<")
clean_text = clean_text.replace(r"&gt;", ">")
clean_text = clean_text.replace(r"&quot;", '"')
clean_text = clean_text.replace(r"&amp;nbsp;", " ")
clean_text = clean_text.replace(r"&amp;", "&")
# remove multiple spaces
while " " in clean_text:
clean_text = clean_text.replace(" ", " ")
return clean_text.strip()
def _remove_links(clean_text, wp_to_id):
# read the text char by char to get the right offsets for the interwiki links
entities = []
final_text = ""
open_read = 0
reading_text = True
reading_entity = False
reading_mention = False
reading_special_case = False
entity_buffer = ""
mention_buffer = ""
for index, letter in enumerate(clean_text):
if letter == "[":
open_read += 1
elif letter == "]":
open_read -= 1
elif letter == "|":
if reading_text:
final_text += letter
# switch from reading entity to mention in the [[entity|mention]] pattern
elif reading_entity:
reading_text = False
reading_entity = False
reading_mention = True
else:
reading_special_case = True
else:
if reading_entity:
entity_buffer += letter
elif reading_mention:
mention_buffer += letter
elif reading_text:
final_text += letter
else:
raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
if open_read > 2:
reading_special_case = True
if open_read == 2 and reading_text:
reading_text = False
reading_entity = True
reading_mention = False
# we just finished reading an entity
if open_read == 0 and not reading_text:
if "#" in entity_buffer or entity_buffer.startswith(":"):
reading_special_case = True
# Ignore cases with nested structures like File: handles etc
if not reading_special_case:
if not mention_buffer:
mention_buffer = entity_buffer
start = len(final_text)
end = start + len(mention_buffer)
qid = wp_to_id.get(entity_buffer, None)
if qid:
entities.append((mention_buffer, qid, start, end))
final_text += mention_buffer
entity_buffer = ""
mention_buffer = ""
reading_text = True
reading_entity = False
reading_mention = False
reading_special_case = False
return final_text, entities
def _write_training_description(outputfile, qid, description):
if description is not None:
line = str(qid) + "|" + description + "\n"
outputfile.write(line)
def _write_training_entities(outputfile, article_id, clean_text, entities):
entities_data = [
{"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
for ent in entities
]
line = (
json.dumps(
{
"article_id": article_id,
"clean_text": clean_text,
"entities": entities_data,
},
ensure_ascii=False,
)
+ "\n"
)
outputfile.write(line)
def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
""" This method provides training examples that correspond to the entity annotations found by the nlp object.
For training, it will include both positive and negative examples by using the candidate generator from the kb.
For testing (kb=None), it will include all positive examples only."""
if not labels_discard:
labels_discard = []
data = []
num_entities = 0
get_gold_parse = partial(
_get_gold_parse, dev=dev, kb=kb, labels_discard=labels_discard
)
logger.info(
"Reading {} data with limit {}".format("dev" if dev else "train", limit)
)
with entity_file_path.open("r", encoding="utf8") as file:
with tqdm(total=limit, leave=False) as pbar:
for i, line in enumerate(file):
example = json.loads(line)
article_id = example["article_id"]
clean_text = example["clean_text"]
entities = example["entities"]
if dev != is_dev(article_id) or not is_valid_article(clean_text):
continue
doc = nlp(clean_text)
gold = get_gold_parse(doc, entities)
if gold and len(gold.links) > 0:
data.append((doc, gold))
num_entities += len(gold.links)
pbar.update(len(gold.links))
if limit and num_entities >= limit:
break
logger.info("Read {} entities in {} articles".format(num_entities, len(data)))
return data
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
gold_entities = {}
tagged_ent_positions = {
(ent.start_char, ent.end_char): ent
for ent in doc.ents
if ent.label_ not in labels_discard
}
for entity in entities:
entity_id = entity["entity"]
alias = entity["alias"]
start = entity["start"]
end = entity["end"]
candidate_ids = []
if kb and not dev:
candidates = kb.get_candidates(alias)
candidate_ids = [cand.entity_ for cand in candidates]
tagged_ent = tagged_ent_positions.get((start, end), None)
if tagged_ent:
# TODO: check that alias == doc.text[start:end]
should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
tagged_ent.sent.text
)
if should_add_ent:
value_by_id = {entity_id: 1.0}
if not dev:
random.shuffle(candidate_ids)
value_by_id.update(
{kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
)
gold_entities[(start, end)] = value_by_id
return GoldParse(doc, links=gold_entities)
def is_dev(article_id):
if not article_id:
return False
return article_id.endswith("3")
def is_valid_article(doc_text):
# custom length cut-off
return 10 < len(doc_text) < 30000
def is_valid_sentence(sent_text):
if not 10 < len(sent_text) < 3000:
# custom length cut-off
return False
if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
# remove 'enumeration' sentences (occurs often on Wikipedia)
return False
return True