Avoid enumerate to prevent a long wait at 0% (#5159)

This commit is contained in:
Sofie Van Landeghem 2020-04-02 15:04:15 +02:00 committed by GitHub
parent 2b14997b68
commit 9cf965c260
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
if not labels_discard: if not labels_discard:
labels_discard = [] labels_discard = []
texts = [] max_index = max(line_ids)
entities_list = []
with entity_file_path.open("r", encoding="utf8") as file: with entity_file_path.open("r", encoding="utf8") as _file:
for i, line in enumerate(file): line = _file.readline()
i = 0
while line and i < max_index:
if i in line_ids: if i in line_ids:
example = json.loads(line) example = json.loads(line)
article_id = example["article_id"] article_id = example["article_id"]
@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
if dev != is_dev(article_id) or not is_valid_article(clean_text): if dev != is_dev(article_id) or not is_valid_article(clean_text):
continue continue
texts.append(clean_text) doc = nlp(clean_text)
entities_list.append(entities)
docs = nlp.pipe(texts, batch_size=50)
for doc, entities in zip(docs, entities_list):
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard) gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
if gold and len(gold.links) > 0: if gold and len(gold.links) > 0:
yield doc, gold yield doc, gold
i += 1
line = _file.readline()
def _get_gold_parse(doc, entities, dev, kb, labels_discard): def _get_gold_parse(doc, entities, dev, kb, labels_discard):