mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
format offsets
This commit is contained in:
parent
20389e4553
commit
cd6c263fe4
|
@ -364,11 +364,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
sent_length = len(ent.sent)
|
sent_length = len(ent.sent)
|
||||||
# custom filtering to avoid too long or too short sentences
|
# custom filtering to avoid too long or too short sentences
|
||||||
if 5 < sent_length < 100:
|
if 5 < sent_length < 100:
|
||||||
ents_by_offset[
|
offset = "{}_{}".format(ent.start_char, ent.end_char)
|
||||||
str(ent.start_char)
|
ents_by_offset[offset] = ent
|
||||||
+ "_"
|
|
||||||
+ str(ent.end_char)
|
|
||||||
] = ent
|
|
||||||
else:
|
else:
|
||||||
skip_articles.add(article_id)
|
skip_articles.add(article_id)
|
||||||
current_doc = None
|
current_doc = None
|
||||||
|
@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
||||||
|
|
||||||
# repeat checking this condition in case an exception was thrown
|
# repeat checking this condition in case an exception was thrown
|
||||||
if current_doc and (current_article_id == article_id):
|
if current_doc and (current_article_id == article_id):
|
||||||
found_ent = ents_by_offset.get(start + "_" + end, None)
|
offset = "{}_{}".format(start, end)
|
||||||
|
found_ent = ents_by_offset.get(offset, None)
|
||||||
if found_ent:
|
if found_ent:
|
||||||
if found_ent.text != alias:
|
if found_ent.text != alias:
|
||||||
skip_articles.add(article_id)
|
skip_articles.add(article_id)
|
||||||
|
|
|
@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
|
||||||
# only evaluating on positive examples
|
# only evaluating on positive examples
|
||||||
for gold_kb, value in kb_dict.items():
|
for gold_kb, value in kb_dict.items():
|
||||||
if value:
|
if value:
|
||||||
offset = str(start) + "-" + str(end)
|
offset = _offset(start, end)
|
||||||
correct_entries_per_article[offset] = gold_kb
|
correct_entries_per_article[offset] = gold_kb
|
||||||
|
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
|
@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
|
||||||
pred_entity = ent.kb_id_
|
pred_entity = ent.kb_id_
|
||||||
start = ent.start_char
|
start = ent.start_char
|
||||||
end = ent.end_char
|
end = ent.end_char
|
||||||
offset = str(start) + "-" + str(end)
|
offset = _offset(start, end)
|
||||||
gold_entity = correct_entries_per_article.get(offset, None)
|
gold_entity = correct_entries_per_article.get(offset, None)
|
||||||
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
||||||
if gold_entity is not None:
|
if gold_entity is not None:
|
||||||
|
@ -392,14 +392,14 @@ def _measure_baselines(data, kb):
|
||||||
for gold_kb, value in kb_dict.items():
|
for gold_kb, value in kb_dict.items():
|
||||||
# only evaluating on positive examples
|
# only evaluating on positive examples
|
||||||
if value:
|
if value:
|
||||||
offset = str(start) + "-" + str(end)
|
offset = _offset(start, end)
|
||||||
correct_entries_per_article[offset] = gold_kb
|
correct_entries_per_article[offset] = gold_kb
|
||||||
|
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
label = ent.label_
|
label = ent.label_
|
||||||
start = ent.start_char
|
start = ent.start_char
|
||||||
end = ent.end_char
|
end = ent.end_char
|
||||||
offset = str(start) + "-" + str(end)
|
offset = _offset(start, end)
|
||||||
gold_entity = correct_entries_per_article.get(offset, None)
|
gold_entity = correct_entries_per_article.get(offset, None)
|
||||||
|
|
||||||
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
||||||
|
@ -454,6 +454,10 @@ def _measure_baselines(data, kb):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _offset(start, end):
|
||||||
|
return "{}_{}".format(start, end)
|
||||||
|
|
||||||
|
|
||||||
def calculate_acc(correct_by_label, incorrect_by_label):
|
def calculate_acc(correct_by_label, incorrect_by_label):
|
||||||
acc_by_label = dict()
|
acc_by_label = dict()
|
||||||
total_correct = 0
|
total_correct = 0
|
||||||
|
|
Loading…
Reference in New Issue
Block a user