format offsets

2025-07-11 08:42:28 +03:00 · 2019-07-23 11:31:29 +02:00 · 2019-07-23 11:31:29 +02:00 · cd6c263fe4
commit cd6c263fe4
parent 20389e4553
2 changed files with 12 additions and 10 deletions
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ b/bin/wiki_entity_linking/training_set_creator.py
@@ -364,11 +364,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                                        sent_length = len(ent.sent)
                                        # custom filtering to avoid too long or too short sentences
                                        if 5 < sent_length < 100:
-                                            ents_by_offset[
-                                                str(ent.start_char)
-                                                + "_"
-                                                + str(ent.end_char)
-                                            ] = ent
+                                            offset = "{}_{}".format(ent.start_char, ent.end_char)
+                                            ents_by_offset[offset] = ent
                                else:
                                    skip_articles.add(article_id)
                                    current_doc = None
@@ -378,7 +375,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                    # repeat checking this condition in case an exception was thrown
                    if current_doc and (current_article_id == article_id):
-                        found_ent = ents_by_offset.get(start + "_" + end, None)
+                        offset = "{}_{}".format(start, end)
+                        found_ent = ents_by_offset.get(offset, None)
                        if found_ent:
                            if found_ent.text != alias:
                                skip_articles.add(article_id)
--- a/examples/pipeline/wikidata_entity_linking.py
+++ b/examples/pipeline/wikidata_entity_linking.py
@@ -333,7 +333,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
                # only evaluating on positive examples
                for gold_kb, value in kb_dict.items():
                    if value:
-                        offset = str(start) + "-" + str(end)
+                        offset = _offset(start, end)
                        correct_entries_per_article[offset] = gold_kb
            for ent in doc.ents:
@@ -341,7 +341,7 @@ def _measure_acc(data, el_pipe=None, error_analysis=False):
                pred_entity = ent.kb_id_
                start = ent.start_char
                end = ent.end_char
-                offset = str(start) + "-" + str(end)
+                offset = _offset(start, end)
                gold_entity = correct_entries_per_article.get(offset, None)
                # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
                if gold_entity is not None:
@@ -392,14 +392,14 @@ def _measure_baselines(data, kb):
                for gold_kb, value in kb_dict.items():
                    # only evaluating on positive examples
                    if value:
-                        offset = str(start) + "-" + str(end)
+                        offset = _offset(start, end)
                        correct_entries_per_article[offset] = gold_kb
            for ent in doc.ents:
                label = ent.label_
                start = ent.start_char
                end = ent.end_char
-                offset = str(start) + "-" + str(end)
+                offset = _offset(start, end)
                gold_entity = correct_entries_per_article.get(offset, None)
                # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong'
@@ -454,6 +454,10 @@ def _measure_baselines(data, kb):
    )
+def _offset(start, end):
+    return "{}_{}".format(start, end)
 def calculate_acc(correct_by_label, incorrect_by_label):
    acc_by_label = dict()
    total_correct = 0