mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-11 16:52:21 +03:00
formatting
This commit is contained in:
parent
41fb5204ba
commit
f75d1299a7
|
@ -365,9 +365,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
encoding="utf8",
|
encoding="utf8",
|
||||||
) as f:
|
) as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
if (
|
# threshold for convenience / speed of processing
|
||||||
len(text) < 30000
|
if len(text) < 30000:
|
||||||
): # threshold for convenience / speed of processing
|
|
||||||
current_doc = nlp(text)
|
current_doc = nlp(text)
|
||||||
current_article_id = article_id
|
current_article_id = article_id
|
||||||
ents_by_offset = dict()
|
ents_by_offset = dict()
|
||||||
|
@ -386,7 +385,6 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Problem parsing article", article_id, e)
|
print("Problem parsing article", article_id, e)
|
||||||
skip_articles.add(article_id)
|
skip_articles.add(article_id)
|
||||||
raise e
|
|
||||||
|
|
||||||
# repeat checking this condition in case an exception was thrown
|
# repeat checking this condition in case an exception was thrown
|
||||||
if current_doc and (current_article_id == article_id):
|
if current_doc and (current_article_id == article_id):
|
||||||
|
@ -404,13 +402,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
gold_entities = {}
|
gold_entities = {}
|
||||||
found_useful = False
|
found_useful = False
|
||||||
for ent in sent.ents:
|
for ent in sent.ents:
|
||||||
if ent.start_char == gold_start and ent.end_char == gold_end:
|
entry = (ent.start_char, ent.end_char)
|
||||||
|
gold_entry = (gold_start, gold_end)
|
||||||
|
if entry == gold_entry:
|
||||||
# add both pos and neg examples (in random order)
|
# add both pos and neg examples (in random order)
|
||||||
# this will exclude examples not in the KB
|
# this will exclude examples not in the KB
|
||||||
if kb:
|
if kb:
|
||||||
value_by_id = {}
|
value_by_id = {}
|
||||||
candidates = kb.get_candidates(alias)
|
candidates = kb.get_candidates(alias)
|
||||||
candidate_ids = [c.entity_ for c in candidates]
|
candidate_ids = [
|
||||||
|
c.entity_ for c in candidates
|
||||||
|
]
|
||||||
random.shuffle(candidate_ids)
|
random.shuffle(candidate_ids)
|
||||||
for kb_id in candidate_ids:
|
for kb_id in candidate_ids:
|
||||||
found_useful = True
|
found_useful = True
|
||||||
|
@ -418,16 +420,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
value_by_id[kb_id] = 0.0
|
value_by_id[kb_id] = 0.0
|
||||||
else:
|
else:
|
||||||
value_by_id[kb_id] = 1.0
|
value_by_id[kb_id] = 1.0
|
||||||
gold_entities[(ent.start_char, ent.end_char)] = value_by_id
|
gold_entities[entry] = value_by_id
|
||||||
# if no KB, keep all positive examples
|
# if no KB, keep all positive examples
|
||||||
else:
|
else:
|
||||||
found_useful = True
|
found_useful = True
|
||||||
value_by_id = {wd_id: 1.0}
|
value_by_id = {wd_id: 1.0}
|
||||||
gold_entities[(ent.start_char, ent.end_char)] = value_by_id
|
|
||||||
|
gold_entities[entry] = value_by_id
|
||||||
# currently feeding the gold data one entity per sentence at a time
|
# currently feeding the gold data one entity per sentence at a time
|
||||||
# setting all other entities to empty gold dictionary
|
# setting all other entities to empty gold dictionary
|
||||||
else:
|
else:
|
||||||
gold_entities[(ent.start_char, ent.end_char)] = {}
|
gold_entities[entry] = {}
|
||||||
if found_useful:
|
if found_useful:
|
||||||
gold = GoldParse(doc=sent, links=gold_entities)
|
gold = GoldParse(doc=sent, links=gold_entities)
|
||||||
data.append((sent, gold))
|
data.append((sent, gold))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user