diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index a9be49742..0db7f4665 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -29,7 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv'
KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb'
VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab'
-TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
+TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/'
+TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/'
# these will/should be matched ignoring case
@@ -523,74 +524,104 @@ def create_training(kb):
def _read_wikipedia_texts(kb, wp_to_id, limit=None):
- """ Read the XML wikipedia data to parse out training data """
+ """
+ Read the XML wikipedia data to parse out training data:
+ raw text data + positive and negative instances
+ """
     title_regex = re.compile(r'(?<=<title>).*(?=</title>)')
     id_regex = re.compile(r'(?<=<id>)\d*(?=</id>)')
- # read entity training header file
- _write_training_entity(article_id="article_id",
- alias="alias",
- entity="entity",
- correct="correct",
- append=False)
+ read_ids = set()
- with bz2.open(ENWIKI_DUMP, mode='rb') as file:
- line = file.readline()
- cnt = 0
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- while line and (not limit or cnt < limit):
- if cnt % 500000 == 0:
- print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump")
- clean_line = line.strip().decode("utf-8")
-
- # Start reading new page
-            if clean_line == "<page>":
- article_text = ""
- article_title = None
- article_id = None
-
- # finished reading this page
-            elif clean_line == "</page>":
- if article_id:
- try:
- _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
- # on a previous run, an error occurred after 46M lines and 2h
- except Exception as e:
- print("Error processing article", article_id, article_title)
- print(e)
-
-            # start reading text within a page
-            if "<text" in clean_line:
-                reading_text = True
+                if clean_line == "<revision>":
+                    reading_revision = True
+                elif clean_line == "</revision>":
+                    reading_revision = False
+
+ # Start reading new page
+                if clean_line == "<page>":
+ article_text = ""
+ article_title = None
+ article_id = None
+
+ # finished reading this page
+                elif clean_line == "</page>":
+ if article_id:
+ try:
+ _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip())
+ # on a previous run, an error occurred after 46M lines and 2h
+ except Exception as e:
+ print("Error processing article", article_id, article_title)
+ print(e)
+ else:
+ print("Done processing a page, but couldn't find an article_id ?")
+ print(article_title)
+ print(article_text)
+ article_text = ""
+ article_title = None
+ article_id = None
+ reading_text = False
+ reading_revision = False
+
+ # start reading text within a page
+ if ").*(?=).*(?=