mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
format offsets
This commit is contained in:
parent
20389e4553
commit
cd6c263fe4
|
@ -364,11 +364,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
|||
sent_length = len(ent.sent)
|
||||
# custom filtering to avoid too long or too short sentences
|
||||
if 5 < sent_length < 100:
|
||||
ents_by_offset[
|
||||
str(ent.start_char)
|
||||
+ "_"
|
||||
+ str(ent.end_char)
|
||||
] = ent
|
||||
offset = "{}_{}".format(ent.start_char, ent.end_char)
|
||||
ents_by_offset[offset] = ent
|
||||
else:
|
||||
skip_articles.add(article_id)
|
||||
current_doc = None
|
||||
|
@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
|
|||
|
||||
# repeat checking this condition in case an exception was thrown
|
||||
if current_doc and (current_article_id == article_id):
|
||||
found_ent = ents_by_offset.get(start + "_" + end, None)
|
||||
offset = "{}_{}".format(start, end)
|
||||
found_ent = ents_by_offset.get(offset, None)
|
||||
if found_ent:
|
||||
if found_ent.text != alias:
|
||||
skip_articles.add(article_id)
|
||||
|
|
|
@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
|
|||
# only evaluating on positive examples
|
||||
for gold_kb, value in kb_dict.items():
|
||||
if value:
|
||||
offset = str(start) + "-" + str(end)
|
||||
offset = _offset(start, end)
|
||||
correct_entries_per_article[offset] = gold_kb
|
||||
|
||||
for ent in doc.ents:
|
||||
|
@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
|
|||
pred_entity = ent.kb_id_
|
||||
start = ent.start_char
|
||||
end = ent.end_char
|
||||
offset = str(start) + "-" + str(end)
|
||||
offset = _offset(start, end)
|
||||
gold_entity = correct_entries_per_article.get(offset, None)
|
||||
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
||||
if gold_entity is not None:
|
||||
|
@ -392,14 +392,14 @@ def _measure_baselines(data, kb):
|
|||
for gold_kb, value in kb_dict.items():
|
||||
# only evaluating on positive examples
|
||||
if value:
|
||||
offset = str(start) + "-" + str(end)
|
||||
offset = _offset(start, end)
|
||||
correct_entries_per_article[offset] = gold_kb
|
||||
|
||||
for ent in doc.ents:
|
||||
label = ent.label_
|
||||
start = ent.start_char
|
||||
end = ent.end_char
|
||||
offset = str(start) + "-" + str(end)
|
||||
offset = _offset(start, end)
|
||||
gold_entity = correct_entries_per_article.get(offset, None)
|
||||
|
||||
# the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
|
||||
|
@ -454,6 +454,10 @@ def _measure_baselines(data, kb):
|
|||
)
|
||||
|
||||
|
||||
def _offset(start, end):
|
||||
return "{}_{}".format(start, end)
|
||||
|
||||
|
||||
def calculate_acc(correct_by_label, incorrect_by_label):
|
||||
acc_by_label = dict()
|
||||
total_correct = 0
|
||||
|
|
Loading…
Reference in New Issue
Block a user