mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Wrap the processing of each article in try/except so that one failing article does not stop the pipeline
This commit is contained in:
parent
bbcb9da466
commit
34600c92bd
|
@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
||||||
|
|
||||||
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
with bz2.open(ENWIKI_DUMP, mode='rb') as file:
|
||||||
line = file.readline()
|
line = file.readline()
|
||||||
cnt = 1
|
cnt = 0
|
||||||
article_text = ""
|
article_text = ""
|
||||||
article_title = None
|
article_title = None
|
||||||
article_id = None
|
article_id = None
|
||||||
|
@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
||||||
# finished reading this page
|
# finished reading this page
|
||||||
elif clean_line == "</page>":
|
elif clean_line == "</page>":
|
||||||
if article_id:
|
if article_id:
|
||||||
_process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
|
try:
|
||||||
|
_process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip())
|
||||||
|
# on a previous run, an error occurred after 46M lines and 2h
|
||||||
|
except Exception as e:
|
||||||
|
print("Error processing article", article_id, article_title)
|
||||||
|
print(e)
|
||||||
|
|
||||||
# start reading text within a page
|
# start reading text within a page
|
||||||
if "<text" in clean_line:
|
if "<text" in clean_line:
|
||||||
|
@ -585,7 +590,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None):
|
||||||
|
|
||||||
def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
|
def _process_wp_text(kb, wp_to_id, article_id, article_title, article_text):
|
||||||
# remove the text tags
|
# remove the text tags
|
||||||
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text>)')
|
text_regex = re.compile(r'(?<=<text xml:space=\"preserve\">).*(?=</text)')
|
||||||
text = text_regex.search(article_text).group(0)
|
text = text_regex.search(article_text).group(0)
|
||||||
|
|
||||||
# stop processing if this is a redirect page
|
# stop processing if this is a redirect page
|
||||||
|
|
Loading…
Reference in New Issue
Block a user