small fixes

svlandeg 2019-06-24 10:55:04 +02:00
parent b76a43bee4
commit b58bace84b
5 changed files with 21 additions and 22 deletions

@@ -292,8 +292,8 @@ def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
     def spans_score(gold_spans, system_spans):
         correct, gi, si = 0, 0, 0
-        undersegmented = list()
-        oversegmented = list()
+        undersegmented = []
+        oversegmented = []
         combo = 0
         previous_end_si_earlier = False
         previous_end_gi_earlier = False
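
Note: spans_score aligns the gold and system token spans and also tracks under- and over-segmentation. As a rough illustration only (not the actual CoNLL-U evaluation logic), an exact-match version of such a span score can be sketched by treating each tokenization as a set of (start, end) character spans:

# Simplified, hypothetical sketch of exact-match span scoring.
def span_f1(gold_spans, system_spans):
    gold = set(gold_spans)            # e.g. {(0, 5), (6, 9), ...}
    system = set(system_spans)
    correct = len(gold & system)      # spans with identical boundaries
    precision = correct / len(system) if system else 0.0
    recall = correct / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if correct else 0.0
    return precision, recall, f1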

@@ -42,9 +42,9 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
     # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
     filtered_title_to_id = dict()
-    entity_list = list()
-    description_list = list()
-    frequency_list = list()
+    entity_list = []
+    description_list = []
+    frequency_list = []
     for title, entity in title_to_id.items():
         freq = entity_frequencies.get(title, 0)
         desc = id_to_descr.get(entity, None)
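
Note: this loop keeps only entities frequent enough to be worth storing in the KB. A hedged sketch of the filtering idea in isolation, packaged as a standalone function for readability; the exact condition (require a description and a frequency of at least min_entity_freq, the threshold from the signature above) is an assumption, not code from this commit:

# Hypothetical sketch: drop rare or description-less entities before building the KB.
def filter_entities(title_to_id, id_to_descr, entity_frequencies, min_entity_freq):
    filtered_title_to_id = dict()
    entity_list, description_list, frequency_list = [], [], []
    for title, entity in title_to_id.items():
        freq = entity_frequencies.get(title, 0)
        desc = id_to_descr.get(entity, None)
        if desc and freq >= min_entity_freq:
            entity_list.append(entity)
            description_list.append(desc)
            frequency_list.append(freq)
            filtered_title_to_id[title] = entity
    return filtered_title_to_id, entity_list, description_list, frequency_list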
@@ -131,8 +131,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
         line = prior_file.readline()
         previous_alias = None
         total_count = 0
-        counts = list()
-        entities = list()
+        counts = []
+        entities = []
         while line:
             splits = line.replace('\n', "").split(sep='|')
             new_alias = splits[0]
@@ -142,8 +142,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
             if new_alias != previous_alias and previous_alias:
                 # done reading the previous alias --> output
                 if len(entities) > 0:
-                    selected_entities = list()
-                    prior_probs = list()
+                    selected_entities = []
+                    prior_probs = []
                     for ent_count, ent_string in zip(counts, entities):
                         if ent_string in wp_titles:
                             wd_id = title_to_id[ent_string]
@@ -157,8 +157,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
                         except ValueError as e:
                             print(e)
                 total_count = 0
-                counts = list()
-                entities = list()
+                counts = []
+                entities = []
             total_count += count
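
Note: these three hunks sit in a loop that streams a prior-probability file sorted by alias. Once all rows for one alias have been read, each candidate entity's prior probability is its count divided by the alias total, and the alias is written to the KB. A hedged sketch of that flush step; kb.add_alias is spaCy's KnowledgeBase method, but the exact keyword arguments used here are an assumption:

# Hypothetical sketch of flushing one finished alias to the knowledge base.
def flush_alias(kb, alias, counts, entities, wp_titles, title_to_id):
    total_count = sum(counts)
    selected_entities, prior_probs = [], []
    for ent_count, ent_string in zip(counts, entities):
        if ent_string in wp_titles:                      # only keep entities known in the filtered KB data
            selected_entities.append(title_to_id[ent_string])
            prior_probs.append(ent_count / total_count)  # P(entity | alias)
    if selected_entities:
        kb.add_alias(alias=alias, entities=selected_entities, probabilities=prior_probs)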

@@ -343,7 +343,7 @@ def read_training(nlp, training_dir, dev, limit):
                         # currently feeding the gold data one entity per sentence at a time
                         gold_start = int(start) - found_ent.sent.start_char
                         gold_end = int(end) - found_ent.sent.start_char
-                        gold_entities = list()
+                        gold_entities = []
                         gold_entities.append((gold_start, gold_end, wp_title))
                         gold = GoldParse(doc=sent, links=gold_entities)
                         data.append((sent, gold))
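
Note: the annotated offsets are document-level character positions, so they are shifted by the sentence's start_char to become sentence-relative before building the GoldParse. A small worked example of that re-anchoring with made-up numbers (all values below are illustrative, not from the commit):

# Hypothetical numeric example of the offset shift used above.
start, end = 120, 128                  # document-level character offsets of the mention
sent_start_char = 100                  # found_ent.sent.start_char for the containing sentence
gold_start = start - sent_start_char   # -> 20, sentence-relative start
gold_end = end - sent_start_char       # -> 28, sentence-relative end
gold_entities = [(gold_start, gold_end, "Some_Wikipedia_Title")]   # one (start, end, title) link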

@@ -147,7 +147,7 @@ def run_pipeline():
     if train_pipe:
         print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
         # define the size (nr of entities) of training and dev set
-        train_limit = 10000
+        train_limit = 5000
         dev_limit = 5000
         train_data = training_set_creator.read_training(nlp=nlp_2,
@@ -332,7 +332,7 @@ def _measure_baselines(data, kb):
            best_candidate = ""
            random_candidate = ""
            if candidates:
-                scores = list()
+                scores = []
                for c in candidates:
                    scores.append(c.prior_prob)
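
Note: _measure_baselines compares a prior-probability baseline (pick the candidate with the highest KB prior) against a random pick. A hedged sketch of that selection for one mention, reusing the Candidate attributes that appear in this diff (prior_prob, entity_); packaging it as a function is an illustration, not the script's actual structure:

import random

# Hypothetical sketch of the prior-probability and random baselines for one mention.
def baseline_picks(candidates):
    best_candidate = ""
    random_candidate = ""
    if candidates:
        best_candidate = max(candidates, key=lambda c: c.prior_prob).entity_
        random_candidate = random.choice(candidates).entity_
    return best_candidate, random_candidate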

@@ -1131,8 +1131,8 @@ class EntityLinker(Pipe):
            docs = [docs]
            golds = [golds]
-        context_docs = list()
-        entity_encodings = list()
+        context_docs = []
+        entity_encodings = []
        for doc, gold in zip(docs, golds):
            for entity in gold.links:
@@ -1198,8 +1198,8 @@ class EntityLinker(Pipe):
        self.require_model()
        self.require_kb()
-        final_entities = list()
-        final_kb_ids = list()
+        final_entities = []
+        final_kb_ids = []
        if not docs:
            return final_entities, final_kb_ids
@@ -1214,7 +1214,7 @@ class EntityLinker(Pipe):
            for ent in doc.ents:
                candidates = self.kb.get_candidates(ent.text)
                if candidates:
-                    scores = list()
+                    scores = []
                    for c in candidates:
                        prior_prob = c.prior_prob * self.prior_weight
                        kb_id = c.entity_
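
Note: in predict, each candidate's KB prior is scaled by self.prior_weight; the surrounding method also computes a model-based context score that is not visible in this diff. A minimal sketch of the prior-only part (the combination with the context score is deliberately left out because its exact form is not shown here; the helper function is hypothetical):

# Hypothetical sketch: rank candidates for one entity span by weighted prior probability.
def rank_by_prior(candidates, prior_weight):
    scores = []
    kb_ids = []
    for c in candidates:
        scores.append(c.prior_prob * prior_weight)   # weighted prior only; the real method
        kb_ids.append(c.entity_)                     # additionally folds in a context score
    best = kb_ids[scores.index(max(scores))] if scores else None
    return best, scores
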
@@ -1259,11 +1259,10 @@ class EntityLinker(Pipe):
        return self

    def rehearse(self, docs, sgd=None, losses=None, **config):
-        # TODO
-        pass
+        raise NotImplementedError

    def add_label(self, label):
-        pass
+        raise NotImplementedError


class Sentencizer(object):
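
Note: replacing pass with raise NotImplementedError makes unsupported Pipe methods fail loudly instead of silently doing nothing. A minimal, self-contained illustration of the effect (DemoPipe is a stand-in class, not spaCy code):

# With the new behaviour, an unsupported call surfaces immediately instead of being ignored.
class DemoPipe:
    def add_label(self, label):
        raise NotImplementedError

try:
    DemoPipe().add_label("PERSON")
except NotImplementedError:
    print("add_label is not supported by this component")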