From 34600c92bd5be2948debf465b9de9c2f3f2f16ee Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Fri, 3 May 2019 15:10:09 +0200
Subject: [PATCH] try catch per article to ensure the pipeline goes on

---
 examples/pipeline/wikidata_entity_linking.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py
index cf388773a..a9be49742 100644
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
 
     with bz2.open(ENWIKI_DUMP, mode='rb') as file:
         line = file.readline()
-        cnt = 1
+        cnt = 0
         article_text = ""
         article_title = None
         article_id = None
@@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
             # finished reading this page
             elif clean_line == "</page>":
                 if article_id:
-                    _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
+                    try:
+                        _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
+                    # on a previous run, an error occurred after 46M lines and 2h
+                    except Exception as e:
+                        print("Error processing article", article_id, article_title)
+                        print(e)
 
             # start reading text within a page
             if "<text" in clean_line:
@@ -575,7 +575,7 @@
-    text_regex = re.compile(r'(?<=<text xml:space="preserve">).*(?=</text>)')
+    text_regex = re.compile(r'(?<=<text xml:space="preserve">).*(?=</text