mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-03 05:04:09 +03:00
Avoid enumerate() to prevent a long wait at 0% (#5159)
This commit is contained in:
parent
2b14997b68
commit
9cf965c260
|
@ -479,11 +479,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
|
||||||
if not labels_discard:
|
if not labels_discard:
|
||||||
labels_discard = []
|
labels_discard = []
|
||||||
|
|
||||||
texts = []
|
max_index = max(line_ids)
|
||||||
entities_list = []
|
|
||||||
|
|
||||||
with entity_file_path.open("r", encoding="utf8") as file:
|
with entity_file_path.open("r", encoding="utf8") as _file:
|
||||||
for i, line in enumerate(file):
|
line = _file.readline()
|
||||||
|
i = 0
|
||||||
|
while line and i < max_index:
|
||||||
if i in line_ids:
|
if i in line_ids:
|
||||||
example = json.loads(line)
|
example = json.loads(line)
|
||||||
article_id = example["article_id"]
|
article_id = example["article_id"]
|
||||||
|
@ -493,15 +494,12 @@ def read_el_docs_golds(nlp, entity_file_path, dev, line_ids, kb, labels_discard=
|
||||||
if dev != is_dev(article_id) or not is_valid_article(clean_text):
|
if dev != is_dev(article_id) or not is_valid_article(clean_text):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
texts.append(clean_text)
|
doc = nlp(clean_text)
|
||||||
entities_list.append(entities)
|
|
||||||
|
|
||||||
docs = nlp.pipe(texts, batch_size=50)
|
|
||||||
|
|
||||||
for doc, entities in zip(docs, entities_list):
|
|
||||||
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
|
gold = _get_gold_parse(doc, entities, dev=dev, kb=kb, labels_discard=labels_discard)
|
||||||
if gold and len(gold.links) > 0:
|
if gold and len(gold.links) > 0:
|
||||||
yield doc, gold
|
yield doc, gold
|
||||||
|
i += 1
|
||||||
|
line = _file.readline()
|
||||||
|
|
||||||
|
|
||||||
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
|
def _get_gold_parse(doc, entities, dev, kb, labels_discard):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user