diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index a0ca4444c..74bdbe9fb 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -364,11 +364,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): sent_length = len(ent.sent) # custom filtering to avoid too long or too short sentences if 5 < sent_length < 100: - ents_by_offset[ - str(ent.start_char) - + "_" - + str(ent.end_char) - ] = ent + offset = "{}_{}".format(ent.start_char, ent.end_char) + ents_by_offset[offset] = ent else: skip_articles.add(article_id) current_doc = None @@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None): # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): - found_ent = ents_by_offset.get(start + "_" + end, None) + offset = "{}_{}".format(start, end) + found_ent = ents_by_offset.get(offset, None) if found_ent: if found_ent.text != alias: skip_articles.add(article_id) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 32f751cd7..04e5bce6d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): # only evaluating on positive examples for gold_kb, value in kb_dict.items(): if value: - offset = str(start) + "-" + str(end) + offset = _offset(start, end) correct_entries_per_article[offset] = gold_kb for ent in doc.ents: @@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False): pred_entity = ent.kb_id_ start = ent.start_char end = ent.end_char - offset = str(start) + "-" + str(end) + offset = _offset(start, end) gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: @@ -392,14 +392,14 @@ def _measure_baselines(data, kb): for gold_kb, value in kb_dict.items(): # only evaluating on positive examples if value: - offset = str(start) + "-" + str(end) + offset = _offset(start, end) correct_entries_per_article[offset] = gold_kb for ent in doc.ents: label = ent.label_ start = ent.start_char end = ent.end_char - offset = str(start) + "-" + str(end) + offset = _offset(start, end) gold_entity = correct_entries_per_article.get(offset, None) # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' @@ -454,6 +454,10 @@ def _measure_baselines(data, kb): ) +def _offset(start, end): + return "{}_{}".format(start, end) + + def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0