format offsets

This commit is contained in:
svlandeg 2019-07-23 11:31:29 +02:00
parent 20389e4553
commit cd6c263fe4
2 changed files with 12 additions and 10 deletions

View File

@ -364,11 +364,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
sent_length = len(ent.sent) sent_length = len(ent.sent)
# custom filtering to avoid too long or too short sentences # custom filtering to avoid too long or too short sentences
if 5 < sent_length < 100: if 5 < sent_length < 100:
ents_by_offset[ offset = "{}_{}".format(ent.start_char, ent.end_char)
str(ent.start_char) ents_by_offset[offset] = ent
+ "_"
+ str(ent.end_char)
] = ent
else: else:
skip_articles.add(article_id) skip_articles.add(article_id)
current_doc = None current_doc = None
@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
# repeat checking this condition in case an exception was thrown # repeat checking this condition in case an exception was thrown
if current_doc and (current_article_id == article_id): if current_doc and (current_article_id == article_id):
found_ent = ents_by_offset.get(start + "_" + end, None) offset = "{}_{}".format(start, end)
found_ent = ents_by_offset.get(offset, None)
if found_ent: if found_ent:
if found_ent.text != alias: if found_ent.text != alias:
skip_articles.add(article_id) skip_articles.add(article_id)

View File

@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
# only evaluating on positive examples # only evaluating on positive examples
for gold_kb, value in kb_dict.items(): for gold_kb, value in kb_dict.items():
if value: if value:
offset = str(start) + "-" + str(end) offset = _offset(start, end)
correct_entries_per_article[offset] = gold_kb correct_entries_per_article[offset] = gold_kb
for ent in doc.ents: for ent in doc.ents:
@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
pred_entity = ent.kb_id_ pred_entity = ent.kb_id_
start = ent.start_char start = ent.start_char
end = ent.end_char end = ent.end_char
offset = str(start) + "-" + str(end) offset = _offset(start, end)
gold_entity = correct_entries_per_article.get(offset, None) gold_entity = correct_entries_per_article.get(offset, None)
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
if gold_entity is not None: if gold_entity is not None:
@ -392,14 +392,14 @@ def _measure_baselines(data, kb):
for gold_kb, value in kb_dict.items(): for gold_kb, value in kb_dict.items():
# only evaluating on positive examples # only evaluating on positive examples
if value: if value:
offset = str(start) + "-" + str(end) offset = _offset(start, end)
correct_entries_per_article[offset] = gold_kb correct_entries_per_article[offset] = gold_kb
for ent in doc.ents: for ent in doc.ents:
label = ent.label_ label = ent.label_
start = ent.start_char start = ent.start_char
end = ent.end_char end = ent.end_char
offset = str(start) + "-" + str(end) offset = _offset(start, end)
gold_entity = correct_entries_per_article.get(offset, None) gold_entity = correct_entries_per_article.get(offset, None)
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
@ -454,6 +454,10 @@ def _measure_baselines(data, kb):
) )
def _offset(start, end):
return "{}_{}".format(start, end)
def calculate_acc(correct_by_label, incorrect_by_label): def calculate_acc(correct_by_label, incorrect_by_label):
acc_by_label = dict() acc_by_label = dict()
total_correct = 0 total_correct = 0