# spaCy/bin/wiki_entity_linking/training_set_creator.py
# coding: utf-8
from __future__ import unicode_literals

import random
import re
import bz2
import datetime

from spacy.gold import GoldParse
from bin.wiki_entity_linking import kb_creator

"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
Gold-standard entities are stored in one file in standoff format (by character offset).
"""

ENTITY_FILE = "gold_entities.csv"
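# Each line in ENTITY_FILE is pipe-delimited, mirroring _write_training_entity below:
#   article_id|alias|WD_id|start|end
# where start/end are character offsets into the cleaned article text,
# e.g. (illustrative values only): 12|Douglas Adams|Q42|305|318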
def now():
    return datetime.datetime.now()


def create_training(wikipedia_input, entity_def_input, training_output):
    wp_to_id = kb_creator.get_entity_to_id(entity_def_input)
    _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None)
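# A minimal invocation sketch (file names below are placeholders, not shipped with this module):
#
#     from pathlib import Path
#     create_training(
#         wikipedia_input=Path("enwiki-latest-pages-articles.xml.bz2"),
#         entity_def_input=Path("entity_defs.csv"),
#         training_output=Path("training_data/"),
#     )
#
# training_output must be an existing directory; gold_entities.csv and one
# <article_id>.txt file per processed article are written into it.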
def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):
    """
    Read the XML Wikipedia data to parse out training data:
    raw text data + positive instances.
    """
    title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
    id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")

    read_ids = set()
    entityfile_loc = training_output / ENTITY_FILE
    with entityfile_loc.open("w", encoding="utf8") as entityfile:
        # write the header of the training entity file
        _write_training_entity(
            outputfile=entityfile,
            article_id="article_id",
            alias="alias",
            entity="WD_id",
            start="start",
            end="end",
        )
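        # The header row is written with the literal field names; read_training
        # later skips it by checking article_id != "article_id".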
        with bz2.open(wikipedia_input, mode="rb") as file:
            line = file.readline()
            cnt = 0
            article_text = ""
            article_title = None
            article_id = None
            reading_text = False
            reading_revision = False
            while line and (not limit or cnt < limit):
                if cnt % 1000000 == 0:
                    print(now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")

                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                # Start reading a new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None
                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        try:
                            _process_wp_text(
                                wp_to_id,
                                entityfile,
                                article_id,
                                article_title,
                                article_text.strip(),
                                training_output,
                            )
                        except Exception as e:
                            print(
                                "Error processing article", article_id, article_title, e
                            )
                    else:
                        print(
                            "Done processing a page, but couldn't find an article_id?",
                            article_title,
                        )
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True

                if reading_text:
                    article_text += " " + clean_line

                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # read the ID of this article (outside the revision portion of the document)
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]
                        if article_id in read_ids:
                            print(
                                "Found duplicate article ID", article_id, clean_line
                            )  # This should never happen ...
                        read_ids.add(article_id)

                # read the title of this article (outside the revision portion of the document)
                if not reading_revision:
                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()

                line = file.readline()
                cnt += 1


text_regex = re.compile(r"(?<=<text xml:space=\"preserve\">).*(?=</text)")
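# _process_wp_text walks the cleaned article text character by character, keeping
# the plain text and resolving interwiki links of the form [[Target article]] or
# [[Target article|displayed text]] into (alias, WD_id, start, end) records.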
def _process_wp_text(
    wp_to_id, entityfile, article_id, article_title, article_text, training_output
):
    found_entities = False

    # ignore meta Wikipedia pages
    if article_title.startswith("Wikipedia:"):
        return

    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # get the raw text without markup etc, keeping only interwiki links
    clean_text = _get_clean_wp_text(text)

    # read the text char by char to get the right offsets for the interwiki links
    final_text = ""
    open_read = 0
    reading_text = True
    reading_entity = False
    reading_mention = False
    reading_special_case = False
    entity_buffer = ""
    mention_buffer = ""
    for index, letter in enumerate(clean_text):
        if letter == "[":
            open_read += 1
        elif letter == "]":
            open_read -= 1
        elif letter == "|":
            if reading_text:
                final_text += letter
            # switch from reading entity to mention in the [[entity|mention]] pattern
            elif reading_entity:
                reading_text = False
                reading_entity = False
                reading_mention = True
            else:
                reading_special_case = True
        else:
            if reading_entity:
                entity_buffer += letter
            elif reading_mention:
                mention_buffer += letter
            elif reading_text:
                final_text += letter
            else:
                raise ValueError(
                    "Unexpected parsing state at", clean_text[index - 2 : index + 2]
                )

        if open_read > 2:
            reading_special_case = True

        if open_read == 2 and reading_text:
            reading_text = False
            reading_entity = True
            reading_mention = False

        # we just finished reading an entity
        if open_read == 0 and not reading_text:
            if "#" in entity_buffer or entity_buffer.startswith(":"):
                reading_special_case = True

            # Ignore cases with nested structures like File: handles etc
            if not reading_special_case:
                if not mention_buffer:
                    mention_buffer = entity_buffer
                start = len(final_text)
                end = start + len(mention_buffer)
                qid = wp_to_id.get(entity_buffer, None)
                if qid:
                    _write_training_entity(
                        outputfile=entityfile,
                        article_id=article_id,
                        alias=mention_buffer,
                        entity=qid,
                        start=start,
                        end=end,
                    )
                    found_entities = True
                final_text += mention_buffer

            entity_buffer = ""
            mention_buffer = ""

            reading_text = True
            reading_entity = False
            reading_mention = False
            reading_special_case = False

    if found_entities:
        _write_training_article(
            article_id=article_id,
            clean_text=final_text,
            training_output=training_output,
        )


info_regex = re.compile(r"{[^{]*?}")
html_regex = re.compile(r"&lt;!--[^-]*--&gt;")
category_regex = re.compile(r"\[\[Category:[^\[]*]]")
file_regex = re.compile(r"\[\[File:[^[\]]+]]")
ref_regex = re.compile(r"&lt;ref.*?&gt;")  # non-greedy
ref_2_regex = re.compile(r"&lt;/ref.*?&gt;")  # non-greedy
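# Note: the regexes above and _get_clean_wp_text below operate on the dump text as
# it appears inside the XML, i.e. with markup still HTML-escaped (&lt;, &gt;, &amp;);
# the escapes are only converted back to normal characters at the end of the cleanup.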
def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace("'''", "")
    clean_text = clean_text.replace("''", "")

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub(
            "", clean_text
        )  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove HTML comments
    clean_text = html_regex.sub("", clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub("", clean_text)
    clean_text = file_regex.sub("", clean_text)

    # remove multiple =
    while "==" in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub("", clean_text)
    clean_text = ref_2_regex.sub("", clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r"&lt;blockquote&gt;", "", clean_text)
    clean_text = re.sub(r"&lt;/blockquote&gt;", "", clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r"&lt;", "<")
    clean_text = clean_text.replace(r"&gt;", ">")
    clean_text = clean_text.replace(r"&quot;", '"')
    clean_text = clean_text.replace(r"&amp;nbsp;", " ")
    clean_text = clean_text.replace(r"&amp;", "&")

    # remove multiple spaces
    while "  " in clean_text:
        clean_text = clean_text.replace("  ", " ")

    return clean_text.strip()
def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output / "{}.txt".format(article_id)
    with file_loc.open("w", encoding="utf8") as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, start, end):
    line = "{}|{}|{}|{}|{}\n".format(article_id, alias, entity, start, end)
    outputfile.write(line)


def is_dev(article_id):
    return article_id.endswith("3")
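# Articles whose numeric ID ends in "3" form the dev split: roughly a tenth of the
# data, assuming article IDs are distributed evenly over their last digit.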
def read_training(nlp, training_dir, dev, limit, kb=None):
    """Provide training examples that correspond to the entity annotations found by the nlp object.
    When kb is provided (for training), this will include negative examples from the candidate generator,
    and it will only keep positive examples whose entity can be found in the KB.
    When kb=None (for testing), it will only include the positive examples."""
    entityfile_loc = training_dir / ENTITY_FILE
    data = []

    # assume the data is written sequentially, so we can reuse the article docs
    current_article_id = None
    current_doc = None
    ents_by_offset = dict()
    skip_articles = set()
    total_entities = 0

    with entityfile_loc.open("r", encoding="utf8") as file:
        for line in file:
            if not limit or len(data) < limit:
                fields = line.replace("\n", "").split(sep="|")
                article_id = fields[0]
                alias = fields[1]
                wd_id = fields[2]
                start = fields[3]
                end = fields[4]

                if (
                    dev == is_dev(article_id)
                    and article_id != "article_id"
                    and article_id not in skip_articles
                ):
                    if not current_doc or (current_article_id != article_id):
                        # parse the new article text
                        file_name = article_id + ".txt"
                        try:
                            training_file = training_dir / file_name
                            with training_file.open("r", encoding="utf8") as f:
                                text = f.read()

                            # threshold for convenience / speed of processing
                            if len(text) < 30000:
                                current_doc = nlp(text)
                                current_article_id = article_id
                                ents_by_offset = dict()
                                for ent in current_doc.ents:
                                    sent_length = len(ent.sent)
                                    # custom filtering to avoid too long or too short sentences
                                    if 5 < sent_length < 100:
                                        offset = "{}_{}".format(
                                            ent.start_char, ent.end_char
                                        )
                                        ents_by_offset[offset] = ent
                            else:
                                skip_articles.add(article_id)
                                current_doc = None
                        except Exception as e:
                            print("Problem parsing article", article_id, e)
                            skip_articles.add(article_id)

                    # repeat checking this condition in case an exception was thrown
                    if current_doc and (current_article_id == article_id):
                        offset = "{}_{}".format(start, end)
                        found_ent = ents_by_offset.get(offset, None)
                        if found_ent:
                            if found_ent.text != alias:
                                skip_articles.add(article_id)
                                current_doc = None
                            else:
                                sent = found_ent.sent.as_doc()
                                gold_start = int(start) - found_ent.sent.start_char
                                gold_end = int(end) - found_ent.sent.start_char
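                                # GoldParse takes links as {(start_char, end_char): {kb_id: value}},
                                # with offsets relative to the sentence doc; the gold id gets 1.0
                                # and, when a KB is given, its other candidates get 0.0.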
                                gold_entities = {}
                                found_useful = False
                                for ent in sent.ents:
                                    entry = (ent.start_char, ent.end_char)
                                    gold_entry = (gold_start, gold_end)
                                    if entry == gold_entry:
                                        # add both pos and neg examples (in random order)
                                        # this will exclude examples not in the KB
                                        if kb:
                                            value_by_id = {}
                                            candidates = kb.get_candidates(alias)
                                            candidate_ids = [
                                                c.entity_ for c in candidates
                                            ]
                                            random.shuffle(candidate_ids)
                                            for kb_id in candidate_ids:
                                                found_useful = True
                                                if kb_id != wd_id:
                                                    value_by_id[kb_id] = 0.0
                                                else:
                                                    value_by_id[kb_id] = 1.0
                                            gold_entities[entry] = value_by_id
                                        # if no KB, keep all positive examples
                                        else:
                                            found_useful = True
                                            value_by_id = {wd_id: 1.0}
                                            gold_entities[entry] = value_by_id

                                    # currently feeding the gold data one entity per sentence at a time
                                    # setting all other entities to empty gold dictionary
                                    else:
                                        gold_entities[entry] = {}

                                if found_useful:
                                    gold = GoldParse(doc=sent, links=gold_entities)
                                    data.append((sent, gold))
                                    total_entities += 1
                                    if len(data) % 2500 == 0:
                                        print(" -read", total_entities, "entities")

    print(" -read", total_entities, "entities")
    return data
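# A minimal read_training sketch (model name, directory, and my_kb are placeholders):
#
#     import spacy
#     from pathlib import Path
#     nlp = spacy.load("en_core_web_lg")
#     train_data = read_training(nlp, Path("training_data/"), dev=False, limit=1000, kb=my_kb)
#     dev_data = read_training(nlp, Path("training_data/"), dev=True, limit=None)
#
# Each returned item is a (sentence Doc, GoldParse) tuple ready for entity linking training.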