mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 05:04:09 +03:00
Avoid enumerate() to prevent a long wait at 0% progress (#5159)
This commit is contained in:
parent
2b14997b68
commit
9cf965c260
|
@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
|
|||
if not labels_discard:
|
||||
labels_discard = []
|
||||
|
||||
texts = []
|
||||
entities_list = []
|
||||
max_index = max(line_ids)
|
||||
|
||||
with entity_file_path.open("r", encoding="utf8") as file:
|
||||
for i, line in enumerate(file):
|
||||
with entity_file_path.open("r", encoding="utf8") as _file:
|
||||
line = _file.readline()
|
||||
i = 0
|
||||
while line and i < max_index:
|
||||
if i in line_ids:
|
||||
example = json.loads(line)
|
||||
article_id = example["article_id"]
|
||||
|
@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
|
|||
if dev != is_dev(article_id) or not is_valid_article(clean_text):
|
||||
continue
|
||||
|
||||
texts.append(clean_text)
|
||||
entities_list.append(entities)
|
||||
|
||||
docs = nlp.pipe(texts, batch_size=50)
|
||||
|
||||
for doc, entities in zip(docs, entities_list):
|
||||
doc = nlp(clean_text)
|
||||
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
|
||||
if gold and len(gold.links) > 0:
|
||||
yield doc, gold
|
||||
i += 1
|
||||
line = _file.readline()
|
||||
|
||||
|
||||
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
|
||||
|
|
Loading…
Reference in New Issue
Block a user