Avoid enumerate to prevent a long wait at 0% (#5159)

2025-07-03 03:13:08 +03:00 · 2020-04-02 15:04:15 +02:00 · 2020-04-02 15:04:15 +02:00 · 9cf965c260
commit 9cf965c260
parent 2b14997b68
1 changed files with 11 additions and 13 deletions
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
    if not labels_discard:
        labels_discard = []

-    texts = []
-    entities_list = []
+    max_index = max(line_ids)

-    with entity_file_path.open("r", encoding="utf8") as file:
-        for i, line in enumerate(file):
+    with entity_file_path.open("r", encoding="utf8") as _file:
+        line = _file.readline()
+        i = 0
+        while line and i < max_index:
            if i in line_ids:
                example = json.loads(line)
                article_id = example["article_id"]
@@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
                if dev != is_dev(article_id) or not is_valid_article(clean_text):
                    continue

-                texts.append(clean_text)
-                entities_list.append(entities)
-
-    docs = nlp.pipe(texts, batch_size=50)
-
-    for doc, entities in zip(docs, entities_list):
+                doc = nlp(clean_text)
                gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
                if gold and len(gold.links) > 0:
                    yield doc, gold
+            i += 1
+            line = _file.readline()


 def _get_gold_parse(doc, entities, dev, kb, labels_discard):