formatting

2025-12-03 08:14:20 +03:00 · 2019-07-19 14:52:45 +02:00 · 2019-07-19 14:52:45 +02:00 · f75d1299a7
commit f75d1299a7
parent 41fb5204ba
1 changed file with 12 additions and 9 deletions
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ b/bin/wiki_entity_linking/training_set_creator.py
@@ -365,9 +365,8 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                                encoding="utf8",
                            ) as f:
                                text = f.read()
-                                if (
+                                # threshold for convenience / speed of processing
-                                    len(text) < 30000
+                                if len(text) < 30000:
-                                ):  # threshold for convenience / speed of processing
                                    current_doc = nlp(text)
                                    current_article_id = article_id
                                    ents_by_offset = dict()
@@ -386,7 +385,6 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                        except Exception as e:
                            print("Problem parsing article", article_id, e)
                            skip_articles.add(article_id)
                            raise e
                    # repeat checking this condition in case an exception was thrown
                    if current_doc and (current_article_id == article_id):
@@ -404,13 +402,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                                gold_entities = {}
                                found_useful = False
                                for ent in sent.ents:
-                                    if ent.start_char == gold_start and ent.end_char == gold_end:
+                                    entry = (ent.start_char, ent.end_char)
+                                    gold_entry = (gold_start, gold_end)
+                                    if entry == gold_entry:
                                        # add both pos and neg examples (in random order)
                                        # this will exclude examples not in the KB
                                        if kb:
                                            value_by_id = {}
                                            candidates = kb.get_candidates(alias)
-                                            candidate_ids = [c.entity_ for c in candidates]
+                                            candidate_ids = [
+                                                c.entity_ for c in candidates
+                                            ]
                                            random.shuffle(candidate_ids)
                                            for kb_id in candidate_ids:
                                                found_useful = True
@@ -418,16 +420,17 @@ def read_training(nlp, training_dir, dev, limit, kb=None):
                                                    value_by_id[kb_id] = 0.0
                                                else:
                                                    value_by_id[kb_id] = 1.0
-                                            gold_entities[(ent.start_char, ent.end_char)] = value_by_id
+                                            gold_entities[entry] = value_by_id
                                        # if no KB, keep all positive examples
                                        else:
                                            found_useful = True
                                            value_by_id = {wd_id: 1.0}
-                                            gold_entities[(ent.start_char, ent.end_char)] = value_by_id
+
+                                            gold_entities[entry] = value_by_id
                                    # currently feeding the gold data one entity per sentence at a time
                                    # setting all other entities to empty gold dictionary
                                    else:
-                                        gold_entities[(ent.start_char, ent.end_char)] = {}
+                                        gold_entities[entry] = {}
                                if found_useful:
                                    gold = GoldParse(doc=sent, links=gold_entities)
                                    data.append((sent, gold))