Wrap the processing of each article in try/except so the pipeline keeps going after an error

This commit is contained in:
svlandeg 2019-05-03 15:10:09 +02:00
parent bbcb9da466
commit 34600c92bd

View File

@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
with bz2.open(ENWIKI_DUMP, mode='rb') as file: with bz2.open(ENWIKI_DUMP, mode='rb') as file:
line = file.readline() line = file.readline()
cnt = 1 cnt = 0
article_text = "" article_text = ""
article_title = None article_title = None
article_id = None article_id = None
@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
# finished reading this page # finished reading this page
elif clean_line == "</page>": elif clean_line == "</page>":
if article_id: if article_id:
_process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) try:
_process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
# on a previous run, an error occurred after 46M lines and 2h
except Exception as e:
print("Error processing article", article_id, article_title)
print(e)
# start reading text within a page # start reading text within a page
if "<text" in clean_line: if "<text" in clean_line:
@ -585,7 +590,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text): def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
# remove the text tags # remove the text tags
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text>)') text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
text = text_regex.search(article_text).group(0) text = text_regex.search(article_text).group(0)
# stop processing if this is a redirect page # stop processing if this is a redirect page