Avoid enumerate to prevent a long wait at 0% (#5159)

This commit is contained in:
Sofie Van Landeghem 2020-04-02 15:04:15 +02:00 committed by GitHub
parent 2b14997b68
commit 9cf965c260
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
if not labels_discard:
labels_discard = []
texts = []
entities_list = []
max_index = max(line_ids)
with entity_file_path.open("r", encoding="utf8") as file:
for i, line in enumerate(file):
with entity_file_path.open("r", encoding="utf8") as _file:
line = _file.readline()
i = 0
while line and i < max_index:
if i in line_ids:
example = json.loads(line)
article_id = example["article_id"]
@@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
if dev != is_dev(article_id) or not is_valid_article(clean_text):
continue
texts.append(clean_text)
entities_list.append(entities)
docs = nlp.pipe(texts, batch_size=50)
for doc, entities in zip(docs, entities_list):
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
if gold and len(gold.links) > 0:
yield doc, gold
doc = nlp(clean_text)
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
if gold and len(gold.links) > 0:
yield doc, gold
i += 1
line = _file.readline()
def _get_gold_parse(doc, entities, dev, kb, labels_discard):