diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index 315b1e916..ed3c35c43 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -479,29 +479,31 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
     if not labels_discard:
         labels_discard = []
 
-    texts = []
-    entities_list = []
+    # Highest line index we still need, so reading can stop early.
+    max_index = max(line_ids)
+    line_ids = set(line_ids)  # O(1) membership test per file line
 
-    with entity_file_path.open("r", encoding="utf8") as file:
-        for i, line in enumerate(file):
+    with entity_file_path.open("r", encoding="utf8") as _file:
+        line = _file.readline()
+        i = 0
+        # `<=` so the article on line max_index itself is still processed.
+        while line and i <= max_index:
             if i in line_ids:
                 example = json.loads(line)
                 article_id = example["article_id"]
                 clean_text = example["clean_text"]
                 entities = example["entities"]
 
-                if dev != is_dev(article_id) or not is_valid_article(clean_text):
-                    continue
-
-                texts.append(clean_text)
-                entities_list.append(entities)
-
-    docs = nlp.pipe(texts, batch_size=50)
-
-    for doc, entities in zip(docs, entities_list):
-        gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
-        if gold and len(gold.links) > 0:
-            yield doc, gold
+                # Guard is inverted rather than using `continue`: inside this
+                # manual while-loop a `continue` would skip the i/readline
+                # advance below and spin forever on the first skipped article.
+                if dev == is_dev(article_id) and is_valid_article(clean_text):
+                    doc = nlp(clean_text)
+                    gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
+                    if gold and len(gold.links) > 0:
+                        yield doc, gold
+            i += 1
+            line = _file.readline()
 
 
 def _get_gold_parse(doc, entities, dev, kb, labels_discard):