mirror of
https://github.com/explosion/spaCy.git
synced 2025-06-25 07:23:05 +03:00
small fixes
This commit is contained in:
parent
b76a43bee4
commit
b58bace84b
|
@ -292,8 +292,8 @@ def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True):
|
||||||
|
|
||||||
def spans_score(gold_spans, system_spans):
|
def spans_score(gold_spans, system_spans):
|
||||||
correct, gi, si = 0, 0, 0
|
correct, gi, si = 0, 0, 0
|
||||||
undersegmented = list()
|
undersegmented = []
|
||||||
oversegmented = list()
|
oversegmented = []
|
||||||
combo = 0
|
combo = 0
|
||||||
previous_end_si_earlier = False
|
previous_end_si_earlier = False
|
||||||
previous_end_gi_earlier = False
|
previous_end_gi_earlier = False
|
||||||
|
|
|
@ -42,9 +42,9 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
|
||||||
|
|
||||||
# filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
|
# filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
|
||||||
filtered_title_to_id = dict()
|
filtered_title_to_id = dict()
|
||||||
entity_list = list()
|
entity_list = []
|
||||||
description_list = list()
|
description_list = []
|
||||||
frequency_list = list()
|
frequency_list = []
|
||||||
for title, entity in title_to_id.items():
|
for title, entity in title_to_id.items():
|
||||||
freq = entity_frequencies.get(title, 0)
|
freq = entity_frequencies.get(title, 0)
|
||||||
desc = id_to_descr.get(entity, None)
|
desc = id_to_descr.get(entity, None)
|
||||||
|
@ -131,8 +131,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
|
||||||
line = prior_file.readline()
|
line = prior_file.readline()
|
||||||
previous_alias = None
|
previous_alias = None
|
||||||
total_count = 0
|
total_count = 0
|
||||||
counts = list()
|
counts = []
|
||||||
entities = list()
|
entities = []
|
||||||
while line:
|
while line:
|
||||||
splits = line.replace('\n', "").split(sep='|')
|
splits = line.replace('\n', "").split(sep='|')
|
||||||
new_alias = splits[0]
|
new_alias = splits[0]
|
||||||
|
@ -142,8 +142,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
|
||||||
if new_alias != previous_alias and previous_alias:
|
if new_alias != previous_alias and previous_alias:
|
||||||
# done reading the previous alias --> output
|
# done reading the previous alias --> output
|
||||||
if len(entities) > 0:
|
if len(entities) > 0:
|
||||||
selected_entities = list()
|
selected_entities = []
|
||||||
prior_probs = list()
|
prior_probs = []
|
||||||
for ent_count, ent_string in zip(counts, entities):
|
for ent_count, ent_string in zip(counts, entities):
|
||||||
if ent_string in wp_titles:
|
if ent_string in wp_titles:
|
||||||
wd_id = title_to_id[ent_string]
|
wd_id = title_to_id[ent_string]
|
||||||
|
@ -157,8 +157,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
print(e)
|
print(e)
|
||||||
total_count = 0
|
total_count = 0
|
||||||
counts = list()
|
counts = []
|
||||||
entities = list()
|
entities = []
|
||||||
|
|
||||||
total_count += count
|
total_count += count
|
||||||
|
|
||||||
|
|
|
@ -343,7 +343,7 @@ def read_training(nlp, training_dir, dev, limit):
|
||||||
# currently feeding the gold data one entity per sentence at a time
|
# currently feeding the gold data one entity per sentence at a time
|
||||||
gold_start = int(start) - found_ent.sent.start_char
|
gold_start = int(start) - found_ent.sent.start_char
|
||||||
gold_end = int(end) - found_ent.sent.start_char
|
gold_end = int(end) - found_ent.sent.start_char
|
||||||
gold_entities = list()
|
gold_entities = []
|
||||||
gold_entities.append((gold_start, gold_end, wp_title))
|
gold_entities.append((gold_start, gold_end, wp_title))
|
||||||
gold = GoldParse(doc=sent, links=gold_entities)
|
gold = GoldParse(doc=sent, links=gold_entities)
|
||||||
data.append((sent, gold))
|
data.append((sent, gold))
|
||||||
|
|
|
@ -147,7 +147,7 @@ def run_pipeline():
|
||||||
if train_pipe:
|
if train_pipe:
|
||||||
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
||||||
# define the size (nr of entities) of training and dev set
|
# define the size (nr of entities) of training and dev set
|
||||||
train_limit = 10000
|
train_limit = 5000
|
||||||
dev_limit = 5000
|
dev_limit = 5000
|
||||||
|
|
||||||
train_data = training_set_creator.read_training(nlp=nlp_2,
|
train_data = training_set_creator.read_training(nlp=nlp_2,
|
||||||
|
@ -332,7 +332,7 @@ def _measure_baselines(data, kb):
|
||||||
best_candidate = ""
|
best_candidate = ""
|
||||||
random_candidate = ""
|
random_candidate = ""
|
||||||
if candidates:
|
if candidates:
|
||||||
scores = list()
|
scores = []
|
||||||
|
|
||||||
for c in candidates:
|
for c in candidates:
|
||||||
scores.append(c.prior_prob)
|
scores.append(c.prior_prob)
|
||||||
|
|
|
@ -1131,8 +1131,8 @@ class EntityLinker(Pipe):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
golds = [golds]
|
golds = [golds]
|
||||||
|
|
||||||
context_docs = list()
|
context_docs = []
|
||||||
entity_encodings = list()
|
entity_encodings = []
|
||||||
|
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
for entity in gold.links:
|
for entity in gold.links:
|
||||||
|
@ -1198,8 +1198,8 @@ class EntityLinker(Pipe):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
self.require_kb()
|
self.require_kb()
|
||||||
|
|
||||||
final_entities = list()
|
final_entities = []
|
||||||
final_kb_ids = list()
|
final_kb_ids = []
|
||||||
|
|
||||||
if not docs:
|
if not docs:
|
||||||
return final_entities, final_kb_ids
|
return final_entities, final_kb_ids
|
||||||
|
@ -1214,7 +1214,7 @@ class EntityLinker(Pipe):
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
candidates = self.kb.get_candidates(ent.text)
|
candidates = self.kb.get_candidates(ent.text)
|
||||||
if candidates:
|
if candidates:
|
||||||
scores = list()
|
scores = []
|
||||||
for c in candidates:
|
for c in candidates:
|
||||||
prior_prob = c.prior_prob * self.prior_weight
|
prior_prob = c.prior_prob * self.prior_weight
|
||||||
kb_id = c.entity_
|
kb_id = c.entity_
|
||||||
|
@ -1259,11 +1259,10 @@ class EntityLinker(Pipe):
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def rehearse(self, docs, sgd=None, losses=None, **config):
|
def rehearse(self, docs, sgd=None, losses=None, **config):
|
||||||
# TODO
|
raise NotImplementedError
|
||||||
pass
|
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
pass
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class Sentencizer(object):
|
class Sentencizer(object):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user