spaCy/bin/wiki_entity_linking/training_set_creator.py

# coding: utf-8
from __future__ import unicode_literals

import os
import re
import bz2
import datetime

from spacy.gold import GoldParse
from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp

"""
Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
Gold-standard entities are stored in one file in standoff format (by character offset).
"""

# Name of the file (inside the training directory) holding the gold-standard entity annotations.
# It is written by _process_wikipedia_texts and read back by read_training.
# Alternative file name: ENTITY_FILE = "gold_entities.csv"
ENTITY_FILE = "gold_entities_1000000.csv"   # use this file for faster processing


def create_training(wikipedia_input, entity_def_input, training_output):
    """
    Generate the entity-linking training data from a Wikipedia dump.

    wikipedia_input: location of the bz2-compressed Wikipedia XML dump.
    entity_def_input: location handed to kb_creator.get_entity_to_id to build
        the lookup that resolves interwiki link targets to entity IDs.
    training_output: directory receiving the entity file and the article texts.

    The whole dump is processed (no line limit is applied).
    """
    link_target_to_id = kb_creator.get_entity_to_id(entity_def_input)
    _process_wikipedia_texts(
        wikipedia_input=wikipedia_input,
        wp_to_id=link_target_to_id,
        training_output=training_output,
        limit=None,
    )


def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):
    """
    Read the XML wikipedia data to parse out training data:
    raw text data + positive instances

    The bz2-compressed dump is scanned line by line. For every <page> ... </page>
    section the title, the page ID and the content of the <text> element are
    collected, and handed to _process_wp_text once the closing </page> is seen.

    wikipedia_input: location of the bz2-compressed XML dump (opened with bz2.open).
    wp_to_id: lookup that is passed through unchanged to _process_wp_text.
    training_output: output directory; must support the "/" operator (e.g. a Path).
        The entity file ENTITY_FILE is (re)created in it.
    limit: maximum number of dump lines to read; a falsy value (None, 0) reads everything.
    """
    # lookbehind/lookahead keep the surrounding tags out of the match itself
    title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
    id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')

    # all article IDs seen so far, only used to report duplicates
    read_ids = set()
    entityfile_loc = training_output / ENTITY_FILE
    with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:
        # write entity training header file
        _write_training_entity(outputfile=entityfile,
                               article_id="article_id",
                               alias="alias",
                               entity="WD_id",
                               start="start",
                               end="end")

        with bz2.open(wikipedia_input, mode='rb') as file:
            line = file.readline()
            cnt = 0                     # number of dump lines processed so far
            article_text = ""           # accumulated lines of the current <text> element
            article_title = None
            article_id = None
            reading_text = False        # True while inside a <text> element
            reading_revision = False    # True while inside a <revision> element
            # readline() returns an empty bytes object at end of file, which ends the loop
            while line and (not limit or cnt < limit):
                # progress report (also printed once at the very start, when cnt is 0)
                if cnt % 1000000 == 0:
                    print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
                clean_line = line.strip().decode("utf-8")

                # track whether we are inside the revision portion of the page
                if clean_line == "<revision>":
                    reading_revision = True
                elif clean_line == "</revision>":
                    reading_revision = False

                # Start reading new page
                if clean_line == "<page>":
                    article_text = ""
                    article_title = None
                    article_id = None

                # finished reading this page
                elif clean_line == "</page>":
                    if article_id:
                        try:
                            _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(),
                                             training_output)
                        except Exception as e:
                            # best effort: report the failing article and carry on with the next page
                            print("Error processing article", article_id, article_title, e)
                    else:
                        print("Done processing a page, but couldn't find an article_id ?", article_title)
                    # reset all per-page state
                    article_text = ""
                    article_title = None
                    article_id = None
                    reading_text = False
                    reading_revision = False

                # start reading text within a page
                if "<text" in clean_line:
                    reading_text = True

                # the lines of the text element are joined with single spaces
                if reading_text:
                    article_text += " " + clean_line

                # stop reading text within a page (we assume a new page doesn't start on the same line)
                if "</text" in clean_line:
                    reading_text = False

                # read the ID of this article (outside the revision portion of the document)
                if not reading_revision:
                    ids = id_regex.search(clean_line)
                    if ids:
                        article_id = ids[0]     # the whole match, i.e. only the digits
                        if article_id in read_ids:
                            print("Found duplicate article ID", article_id, clean_line)  # This should never happen ...
                        read_ids.add(article_id)

                # read the title of this article (outside the revision portion of the document)
                if not reading_revision:
                    titles = title_regex.search(clean_line)
                    if titles:
                        article_title = titles[0].strip()

                line = file.readline()
                cnt += 1


# Captures what lies between the opening <text xml:space="preserve"> tag and the (last) closing </text tag;
# the lookbehind/lookahead keep the tags themselves out of the match.
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')


def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text, training_output):
    """
    Extract the interwiki links of one article and write them out as gold entities.

    The cleaned article text is scanned character by character. Links of the form
    [[entity]] or [[entity|mention]] are replaced by their mention text, and for every
    link whose target is found in wp_to_id a row with the mention's character offsets
    (relative to the final text) is written to entityfile. If at least one regular
    link was seen, the final text is written with _write_training_article.

    wp_to_id: lookup (supporting .get) from link target to the entity ID to write.
    entityfile: open, writable text file receiving the entity rows.
    article_id: ID of the article, used in the entity rows and for the article file.
    article_title: title of the article; pages starting with "Wikipedia:" are skipped.
    article_text: the raw <text ...>...</text> element of the article.
    training_output: directory passed on to _write_training_article.

    Raises AttributeError when article_text does not match text_regex (or when
    article_title is None), and ValueError when the scanner reaches an unexpected state.
    """
    found_entities = False

    # ignore meta Wikipedia pages
    if article_title.startswith("Wikipedia:"):
        return

    # remove the text tags
    text = text_regex.search(article_text).group(0)

    # stop processing if this is a redirect page
    if text.startswith("#REDIRECT"):
        return

    # get the raw text without markup etc, keeping only interwiki links
    clean_text = _get_clean_wp_text(text)

    # read the text char by char to get the right offsets for the interwiki links
    final_text = ""                 # the text with all links replaced by their mention
    open_read = 0                   # number of currently unclosed '[' brackets
    reading_text = True             # scanner state: plain text outside of a link
    reading_entity = False          # scanner state: the link target, before any '|'
    reading_mention = False         # scanner state: the display text, after the '|'
    reading_special_case = False    # the current link must be ignored
    entity_buffer = ""
    mention_buffer = ""
    for index, letter in enumerate(clean_text):
        # brackets only update the nesting depth, they are never copied to the output
        if letter == '[':
            open_read += 1
        elif letter == ']':
            open_read -= 1
        elif letter == '|':
            if reading_text:
                final_text += letter
            # switch from reading entity to mention in the [[entity|mention]] pattern
            elif reading_entity:
                reading_text = False
                reading_entity = False
                reading_mention = True
            else:
                # a second '|' within the same link: not a plain [[entity|mention]]
                reading_special_case = True
        else:
            # any other character is appended to the buffer of the current state
            if reading_entity:
                entity_buffer += letter
            elif reading_mention:
                mention_buffer += letter
            elif reading_text:
                final_text += letter
            else:
                raise ValueError("Not sure at point", clean_text[index-2:index+2])

        # more than two open brackets means a nested structure
        if open_read > 2:
            reading_special_case = True

        # a "[[" was just completed: start reading the link target
        if open_read == 2 and reading_text:
            reading_text = False
            reading_entity = True
            reading_mention = False

        # we just finished reading an entity
        if open_read == 0 and not reading_text:
            # links containing '#' or starting with ':' are ignored as well
            if '#' in entity_buffer or entity_buffer.startswith(':'):
                reading_special_case = True
            # Ignore cases with nested structures like File: handles etc
            if not reading_special_case:
                # [[entity]] without explicit mention: the target itself is the mention
                if not mention_buffer:
                    mention_buffer = entity_buffer
                # offsets are relative to the final text, in which the mention comes next
                start = len(final_text)
                end = start + len(mention_buffer)
                qid = wp_to_id.get(entity_buffer, None)
                if qid:
                    _write_training_entity(outputfile=entityfile,
                                           article_id=article_id,
                                           alias=mention_buffer,
                                           entity=qid,
                                           start=start,
                                           end=end)
                # set for every regular link, also when no ID was found for its target
                found_entities = True
                final_text += mention_buffer

            # reset the scanner for the text following the link
            entity_buffer = ""
            mention_buffer = ""

            reading_text = True
            reading_entity = False
            reading_mention = False
            reading_special_case = False

    if found_entities:
        _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output)


info_regex = re.compile(r'{[^{]*?}')
htlm_regex = re.compile(r'&lt;!--[^-]*--&gt;')
category_regex = re.compile(r'\[\[Category:[^\[]*]]')
file_regex = re.compile(r'\[\[File:[^[\]]+]]')
ref_regex = re.compile(r'&lt;ref.*?&gt;')     # non-greedy
ref_2_regex = re.compile(r'&lt;/ref.*?&gt;')  # non-greedy


def _get_clean_wp_text(article_text):
    clean_text = article_text.strip()

    # remove bolding & italic markup
    clean_text = clean_text.replace('\'\'\'', '')
    clean_text = clean_text.replace('\'\'', '')

    # remove nested {{info}} statements by removing the inner/smallest ones first and iterating
    try_again = True
    previous_length = len(clean_text)
    while try_again:
        clean_text = info_regex.sub('', clean_text)  # non-greedy match excluding a nested {
        if len(clean_text) < previous_length:
            try_again = True
        else:
            try_again = False
        previous_length = len(clean_text)

    # remove HTML comments
    clean_text = htlm_regex.sub('', clean_text)

    # remove Category and File statements
    clean_text = category_regex.sub('', clean_text)
    clean_text = file_regex.sub('', clean_text)

    # remove multiple =
    while '==' in clean_text:
        clean_text = clean_text.replace("==", "=")

    clean_text = clean_text.replace(". =", ".")
    clean_text = clean_text.replace(" = ", ". ")
    clean_text = clean_text.replace("= ", ".")
    clean_text = clean_text.replace(" =", "")

    # remove refs (non-greedy match)
    clean_text = ref_regex.sub('', clean_text)
    clean_text = ref_2_regex.sub('', clean_text)

    # remove additional wikiformatting
    clean_text = re.sub(r'&lt;blockquote&gt;', '', clean_text)
    clean_text = re.sub(r'&lt;/blockquote&gt;', '', clean_text)

    # change special characters back to normal ones
    clean_text = clean_text.replace(r'&lt;', '<')
    clean_text = clean_text.replace(r'&gt;', '>')
    clean_text = clean_text.replace(r'&quot;', '"')
    clean_text = clean_text.replace(r'&amp;nbsp;', ' ')
    clean_text = clean_text.replace(r'&amp;', '&')

    # remove multiple spaces
    while '  ' in clean_text:
        clean_text = clean_text.replace('  ', ' ')

    return clean_text.strip()


def _write_training_article(article_id, clean_text, training_output):
    file_loc = training_output / str(article_id) + ".txt"
    with open(file_loc, mode='w', encoding='utf8') as outputfile:
        outputfile.write(clean_text)


def _write_training_entity(outputfile, article_id, alias, entity, start, end):
    outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n")


def is_dev(article_id):
    """Return True when the article belongs to the dev split, i.e. when its ID ends in "3"."""
    ends_with_three = article_id.endswith("3")
    return ends_with_three


def read_training(nlp, training_dir, dev, limit):
    """
    Provide training examples for the gold entities that are also found by the nlp object.

    The entity file ENTITY_FILE in training_dir is read row by row. The text of each
    article is parsed once with nlp, and a gold entity is only kept when the parsed
    doc contains a named entity with exactly the same character offsets and text.
    Each example consists of the sentence containing the entity (as a separate doc)
    and a GoldParse holding a single link with sentence-relative offsets.

    nlp: callable that turns a text into a parsed doc exposing .ents.
    training_dir: directory with the entity file and the "<article_id>.txt" files;
        must support the "/" operator (e.g. a Path).
    dev: True to read the dev articles (see is_dev), False to read all others.
    limit: maximum number of examples to return; a falsy value (None, 0) means no limit.

    Returns a list of (sentence doc, GoldParse) tuples.
    """
    entityfile_loc = training_dir / ENTITY_FILE
    data = []

    # assume the data is written sequentially, so we can reuse the article docs
    current_article_id = None
    current_doc = None
    ents_by_offset = dict()
    skip_articles = set()
    total_entities = 0

    with open(entityfile_loc, mode='r', encoding='utf8') as file:
        for line in file:
            # enough examples collected: no need to scan the remainder of the file
            if limit and len(data) >= limit:
                break

            fields = line.replace('\n', "").split(sep='|')
            article_id = fields[0]
            alias = fields[1]
            wd_id = fields[2]    # third column of the entity file (header: "WD_id")
            start = fields[3]
            end = fields[4]

            # skip the header row, articles of the other split and rejected articles
            if article_id == "article_id" or dev != is_dev(article_id) or article_id in skip_articles:
                continue

            if not current_doc or (current_article_id != article_id):
                # parse the new article text
                file_name = article_id + ".txt"
                try:
                    with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f:
                        text = f.read()
                    if len(text) < 30000:   # threshold for convenience / speed of processing
                        current_doc = nlp(text)
                        current_article_id = article_id
                        ents_by_offset = dict()
                        for ent in current_doc.ents:
                            sent_length = len(ent.sent)
                            # custom filtering to avoid too long or too short sentences
                            if 5 < sent_length < 100:
                                ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
                    else:
                        # Reject the article that is too long (not the previously parsed one),
                        # so that its file is not read again for each of its entities.
                        skip_articles.add(article_id)
                        current_doc = None
                except Exception as e:
                    print("Problem parsing article", article_id, e)
                    # Reject the failing article itself, so that the problem is reported only
                    # once, and drop any partially processed doc.
                    skip_articles.add(article_id)
                    current_doc = None

            # repeat checking this condition in case the article could not be parsed
            if current_doc and (current_article_id == article_id):
                found_ent = ents_by_offset.get(start + "_" + end, None)
                if found_ent:
                    if found_ent.text != alias:
                        # offsets agree but the text differs: do not trust this article
                        skip_articles.add(article_id)
                        current_doc = None
                    else:
                        sent = found_ent.sent.as_doc()
                        # currently feeding the gold data one entity per sentence at a time
                        gold_start = int(start) - found_ent.sent.start_char
                        gold_end = int(end) - found_ent.sent.start_char
                        gold_entities = [(gold_start, gold_end, wd_id)]
                        gold = GoldParse(doc=sent, links=gold_entities)
                        data.append((sent, gold))
                        total_entities += 1
                        if len(data) % 2500 == 0:
                            print(" -read", total_entities, "entities")

    print(" -read", total_entities, "entities")
    return data
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# coding: utf-8`
			`from __future__ import unicode_literals`

storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`import os`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`import re`
			`import bz2`
			`import datetime`

storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00			`from spacy.gold import GoldParse`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
			`"""`
small tweaks and documentation 2019-06-18 19:38:09 +03:00			`Process Wikipedia interlinks to generate a training dataset for the EL algorithm.`
			`Gold-standard entities are stored in one file in standoff format (by character offset).`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`"""`

reprocessing all of wikipedia for training data 2019-06-16 22:14:45 +03:00			`# ENTITY_FILE = "gold_entities.csv"`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
using entity descriptions and article texts as input embedding vectors for training 2019-05-07 17:03:42 +03:00
further code cleanup 2019-06-19 10:15:43 +03:00			`def create_training(wikipedia_input, entity_def_input, training_output):`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`wp_to_id = kb_creator.get_entity_to_id(entity_def_input)`
further code cleanup 2019-06-19 10:15:43 +03:00			`_process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00

further code cleanup 2019-06-19 10:15:43 +03:00			`def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None):`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`"""`
			`Read the XML wikipedia data to parse out training data:`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`raw text data + positive instances`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`"""`
			`title_regex = re.compile(r'(?<=<title>).*(?=</title>)')`
			`id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')`

			`read_ids = set()`
further code cleanup 2019-06-19 10:15:43 +03:00			`entityfile_loc = training_output / ENTITY_FILE`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`with open(entityfile_loc, mode="w", encoding='utf8') as entityfile:`
			`# write entity training header file`
			`_write_training_entity(outputfile=entityfile,`
			`article_id="article_id",`
			`alias="alias",`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`entity="WD_id",`
			`start="start",`
			`end="end")`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
further code cleanup 2019-06-19 10:15:43 +03:00			`with bz2.open(wikipedia_input, mode='rb') as file:`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`line = file.readline()`
			`cnt = 0`
			`article_text = ""`
			`article_title = None`
			`article_id = None`
			`reading_text = False`
			`reading_revision = False`
			`while line and (not limit or cnt < limit):`
			`if cnt % 1000000 == 0:`
			`print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")`
			`clean_line = line.strip().decode("utf-8")`

			`if clean_line == "<revision>":`
			`reading_revision = True`
			`elif clean_line == "</revision>":`
			`reading_revision = False`

			`# Start reading new page`
			`if clean_line == "<page>":`
			`article_text = ""`
			`article_title = None`
			`article_id = None`

			`# finished reading this page`
			`elif clean_line == "</page>":`
			`if article_id:`
			`try:`
clean up code, remove old code, move to bin 2019-06-18 14:20:40 +03:00			`_process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(),`
			`training_output)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`except Exception as e:`
			`print("Error processing article", article_id, article_title, e)`
			`else:`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`print("Done processing a page, but couldn't find an article_id ?", article_title)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`article_text = ""`
			`article_title = None`
			`article_id = None`
			`reading_text = False`
			`reading_revision = False`

			`# start reading text within a page`
			`if "<text" in clean_line:`
			`reading_text = True`

			`if reading_text:`
			`article_text += " " + clean_line`

			`# stop reading text within a page (we assume a new page doesn't start on the same line)`
			`if "</text" in clean_line:`
			`reading_text = False`

			`# read the ID of this article (outside the revision portion of the document)`
			`if not reading_revision:`
			`ids = id_regex.search(clean_line)`
			`if ids:`
			`article_id = ids[0]`
			`if article_id in read_ids:`
			`print("Found duplicate article ID", article_id, clean_line) # This should never happen ...`
			`read_ids.add(article_id)`

further code cleanup 2019-06-19 10:15:43 +03:00			`# read the title of this article (outside the revision portion of the document)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`if not reading_revision:`
			`titles = title_regex.search(clean_line)`
			`if titles:`
			`article_title = titles[0].strip()`

			`line = file.readline()`
			`cnt += 1`


			`text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')`


redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text, training_output):`
			`found_entities = False`

			`# ignore meta Wikipedia pages`
			`if article_title.startswith("Wikipedia:"):`
			`return`

refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`# remove the text tags`
			`text = text_regex.search(article_text).group(0)`

			`# stop processing if this is a redirect page`
			`if text.startswith("#REDIRECT"):`
			`return`

redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`# get the raw text without markup etc, keeping only interwiki links`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`clean_text = _get_clean_wp_text(text)`

further code cleanup 2019-06-19 10:15:43 +03:00			`# read the text char by char to get the right offsets for the interwiki links`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`final_text = ""`
			`open_read = 0`
			`reading_text = True`
			`reading_entity = False`
			`reading_mention = False`
			`reading_special_case = False`
			`entity_buffer = ""`
			`mention_buffer = ""`
			`for index, letter in enumerate(clean_text):`
			`if letter == '[':`
			`open_read += 1`
			`elif letter == ']':`
			`open_read -= 1`
			`elif letter == '\|':`
			`if reading_text:`
			`final_text += letter`
			`# switch from reading entity to mention in the [[entity\|mention]] pattern`
			`elif reading_entity:`
			`reading_text = False`
			`reading_entity = False`
			`reading_mention = True`
			`else:`
			`reading_special_case = True`
			`else:`
			`if reading_entity:`
			`entity_buffer += letter`
			`elif reading_mention:`
			`mention_buffer += letter`
			`elif reading_text:`
			`final_text += letter`
			`else:`
			`raise ValueError("Not sure at point", clean_text[index-2:index+2])`

			`if open_read > 2:`
			`reading_special_case = True`

			`if open_read == 2 and reading_text:`
			`reading_text = False`
			`reading_entity = True`
			`reading_mention = False`

			`# we just finished reading an entity`
			`if open_read == 0 and not reading_text:`
			`if '#' in entity_buffer or entity_buffer.startswith(':'):`
			`reading_special_case = True`
			`# Ignore cases with nested structures like File: handles etc`
			`if not reading_special_case:`
			`if not mention_buffer:`
			`mention_buffer = entity_buffer`
			`start = len(final_text)`
			`end = start + len(mention_buffer)`
			`qid = wp_to_id.get(entity_buffer, None)`
			`if qid:`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`_write_training_entity(outputfile=entityfile,`
			`article_id=article_id,`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`alias=mention_buffer,`
			`entity=qid,`
			`start=start,`
			`end=end)`
			`found_entities = True`
			`final_text += mention_buffer`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`entity_buffer = ""`
			`mention_buffer = ""`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`reading_text = True`
			`reading_entity = False`
			`reading_mention = False`
			`reading_special_case = False`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`if found_entities:`
			`_write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output)`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00

			`info_regex = re.compile(r'{[^{]*?}')`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`htlm_regex = re.compile(r'<!--[^-]*-->')`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`category_regex = re.compile(r'\[\[Category:[^\[]*]]')`
			`file_regex = re.compile(r'\[\[File:[^[\]]+]]')`
			`ref_regex = re.compile(r'<ref.*?>') # non-greedy`
			`ref_2_regex = re.compile(r'</ref.*?>') # non-greedy`


			`def _get_clean_wp_text(article_text):`
			`clean_text = article_text.strip()`

			`# remove bolding & italic markup`
			`clean_text = clean_text.replace('\'\'\'', '')`
			`clean_text = clean_text.replace('\'\'', '')`

			`# remove nested {{info}} statements by removing the inner/smallest ones first and iterating`
			`try_again = True`
			`previous_length = len(clean_text)`
			`while try_again:`
			`clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested {`
			`if len(clean_text) < previous_length:`
			`try_again = True`
			`else:`
			`try_again = False`
			`previous_length = len(clean_text)`

			`# remove HTML comments`
			`clean_text = htlm_regex.sub('', clean_text)`

			`# remove Category and File statements`
			`clean_text = category_regex.sub('', clean_text)`
			`clean_text = file_regex.sub('', clean_text)`

			`# remove multiple =`
			`while '==' in clean_text:`
			`clean_text = clean_text.replace("==", "=")`

			`clean_text = clean_text.replace(". =", ".")`
			`clean_text = clean_text.replace(" = ", ". ")`
			`clean_text = clean_text.replace("= ", ".")`
			`clean_text = clean_text.replace(" =", "")`

			`# remove refs (non-greedy match)`
			`clean_text = ref_regex.sub('', clean_text)`
			`clean_text = ref_2_regex.sub('', clean_text)`

			`# remove additional wikiformatting`
			`clean_text = re.sub(r'<blockquote>', '', clean_text)`
			`clean_text = re.sub(r'</blockquote>', '', clean_text)`

			`# change special characters back to normal ones`
			`clean_text = clean_text.replace(r'<', '<')`
			`clean_text = clean_text.replace(r'>', '>')`
			`clean_text = clean_text.replace(r'"', '"')`
			`clean_text = clean_text.replace(r'&nbsp;', ' ')`
			`clean_text = clean_text.replace(r'&', '&')`

			`# remove multiple spaces`
			`while ' ' in clean_text:`
			`clean_text = clean_text.replace(' ', ' ')`

			`return clean_text.strip()`


			`def _write_training_article(article_id, clean_text, training_output):`
further code cleanup 2019-06-19 10:15:43 +03:00			`file_loc = training_output / str(article_id) + ".txt"`
refactor code to separate functionality into different files 2019-05-06 11:56:56 +03:00			`with open(file_loc, mode='w', encoding='utf8') as outputfile:`
			`outputfile.write(clean_text)`


redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00			`def _write_training_entity(outputfile, article_id, alias, entity, start, end):`
			`outputfile.write(article_id + "\|" + alias + "\|" + entity + "\|" + str(start) + "\|" + str(end) + "\n")`
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00

reprocessing all of wikipedia for training data 2019-06-16 22:14:45 +03:00			`def is_dev(article_id):`
			`return article_id.endswith("3")`


baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`def read_training(nlp, training_dir, dev, limit):`
			`# This method provides training examples that correspond to the entity annotations found by the nlp object`
further code cleanup 2019-06-19 10:15:43 +03:00			`entityfile_loc = training_dir / ENTITY_FILE`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`data = []`

further code cleanup 2019-06-19 10:15:43 +03:00			`# assume the data is written sequentially, so we can reuse the article docs`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`current_article_id = None`
			`current_doc = None`
			`ents_by_offset = dict()`
			`skip_articles = set()`
			`total_entities = 0`
redo training data to be independent of KB and entity-level instead of doc-level 2019-06-14 16:55:26 +03:00
baseline evaluation using highest-freq candidate 2019-05-06 16:13:50 +03:00			`with open(entityfile_loc, mode='r', encoding='utf8') as file:`
			`for line in file:`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`if not limit or len(data) < limit:`
reprocessing all of wikipedia for training data 2019-06-16 22:14:45 +03:00			`fields = line.replace('\n', "").split(sep='\|')`
			`article_id = fields[0]`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`alias = fields[1]`
			`wp_title = fields[2]`
			`start = fields[3]`
			`end = fields[4]`

			`if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles:`
			`if not current_doc or (current_article_id != article_id):`
			`# parse the new article text`
			`file_name = article_id + ".txt"`
			`try:`
			`with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f:`
			`text = f.read()`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`if len(text) < 30000: # threshold for convenience / speed of processing`
			`current_doc = nlp(text)`
			`current_article_id = article_id`
			`ents_by_offset = dict()`
			`for ent in current_doc.ents:`
small tweaks and documentation 2019-06-18 19:38:09 +03:00			`sent_length = len(ent.sent)`
			`# custom filtering to avoid too long or too short sentences`
			`if 5 < sent_length < 100:`
			`ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`else:`
			`skip_articles.add(current_article_id)`
			`current_doc = None`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`except Exception as e:`
			`print("Problem parsing article", article_id, e)`
small tweaks and documentation 2019-06-18 19:38:09 +03:00			`skip_articles.add(current_article_id)`
storing NEL training data in GoldParse objects 2019-06-07 13:58:42 +03:00
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`# repeat checking this condition in case an exception was thrown`
			`if current_doc and (current_article_id == article_id):`
			`found_ent = ents_by_offset.get(start + "_" + end, None)`
			`if found_ent:`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`if found_ent.text != alias:`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`skip_articles.add(current_article_id)`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`current_doc = None`
baseline performances: oracle KB, random and prior prob 2019-06-17 15:39:40 +03:00			`else:`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`sent = found_ent.sent.as_doc()`
			`# currently feeding the gold data one entity per sentence at a time`
			`gold_start = int(start) - found_ent.sent.start_char`
			`gold_end = int(end) - found_ent.sent.start_char`
			`gold_entities = list()`
			`gold_entities.append((gold_start, gold_end, wp_title))`
further code cleanup 2019-06-19 10:15:43 +03:00			`gold = GoldParse(doc=sent, links=gold_entities)`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`data.append((sent, gold))`
			`total_entities += 1`
further code cleanup 2019-06-19 10:15:43 +03:00			`if len(data) % 2500 == 0:`
sentence encoder only (removing article/mention encoder) 2019-06-18 01:05:47 +03:00			`print(" -read", total_entities, "entities")`

			`print(" -read", total_entities, "entities")`
introduce goldparse.links 2019-06-07 14:54:45 +03:00			`return data`