experiment with adding NER types to the feature vector

svlandeg 2019-06-29 14:52:36 +02:00
parent c664f58246
commit 2d2dea9924
3 changed files with 41 additions and 16 deletions

View File

@@ -42,7 +42,7 @@ MIN_PAIR_OCC = 5
 # model training parameters
 EPOCHS = 10
-DROPOUT = 0.2
+DROPOUT = 0.5
 LEARN_RATE = 0.005
 L2 = 1e-6
 CONTEXT_WIDTH = 128
@@ -73,10 +73,10 @@ def run_pipeline():
     measure_performance = True
     # test the EL pipe on a simple example
-    to_test_pipeline = True
+    to_test_pipeline = False
     # write the NLP object, read back in and test again
-    to_write_nlp = True
+    to_write_nlp = False
     to_read_nlp = False
     test_from_file = False
@@ -138,9 +138,12 @@ def run_pipeline():
     # STEP 6: create and train the entity linking pipe
     if train_pipe:
         print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
+        type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)}
+        print(" -analysing", len(type_to_int), "different entity types")
         el_pipe = nlp_2.create_pipe(name='entity_linker',
                                     config={"context_width": CONTEXT_WIDTH,
-                                            "pretrained_vectors": nlp_2.vocab.vectors.name})
+                                            "pretrained_vectors": nlp_2.vocab.vectors.name,
+                                            "type_to_int": type_to_int})
         el_pipe.set_kb(kb_2)
         nlp_2.add_pipe(el_pipe, last=True)
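
For context, a minimal standalone sketch of what this new config entry does (the label set below is illustrative; the commit derives it from nlp_2.entity.labels): type_to_int assigns each NER label a fixed index, which is later used to one-hot encode the type of a mention.

    # Sketch: map each NER label to a stable index, then one-hot encode a label.
    labels = ["PERSON", "ORG", "GPE", "LOC"]  # illustrative label set
    type_to_int = {label: i for i, label in enumerate(labels)}

    def one_hot(label, type_to_int):
        vector = [0] * len(type_to_int)
        if len(type_to_int) > 0:
            vector[type_to_int[label]] = 1
        return vector

    assert one_hot("ORG", type_to_int) == [0, 1, 0, 0]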
@@ -151,8 +154,8 @@ def run_pipeline():
         optimizer.L2 = L2
         # define the size (nr of entities) of training and dev set
-        train_limit = 500000
-        dev_limit = 5000
+        train_limit = 50000
+        dev_limit = 50000
         train_data = training_set_creator.read_training(nlp=nlp_2,
                                                         training_dir=TRAINING_DIR,
@@ -219,7 +222,7 @@ def run_pipeline():
         # measuring combined accuracy (prior + context)
         el_pipe.context_weight = 1
         el_pipe.prior_weight = 1
-        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe)
+        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False)
         print("dev acc combo avg:", round(dev_acc_combo, 3),
               [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])
@@ -264,7 +267,7 @@ def run_pipeline():
         nlp_3 = spacy.load(NLP_2_DIR)
         el_pipe = nlp_3.get_pipe("entity_linker")
-        dev_limit = 10000
+        dev_limit = 5000
         dev_data = training_set_creator.read_training(nlp=nlp_2,
                                                       training_dir=TRAINING_DIR,
                                                       dev=True,
@@ -273,7 +276,7 @@ def run_pipeline():
         print("Dev testing from file on", len(dev_data), "articles")
         print()
-        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe)
+        dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False)
         print("dev acc combo avg:", round(dev_acc_combo, 3),
               [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()])
@@ -281,7 +284,7 @@ def run_pipeline():
     print("STOP", datetime.datetime.now())


-def _measure_accuracy(data, el_pipe=None):
+def _measure_accuracy(data, el_pipe=None, error_analysis=False):
     # If the docs in the data require further processing with an entity linker, set el_pipe
     correct_by_label = dict()
     incorrect_by_label = dict()
@@ -312,6 +315,10 @@ def _measure_accuracy(data, el_pipe=None):
                 else:
                     incorrect = incorrect_by_label.get(ent_label, 0)
                     incorrect_by_label[ent_label] = incorrect + 1
+                    if error_analysis:
+                        print(ent.text, "in", doc)
+                        print("Predicted", pred_entity, "should have been", gold_entity)
+                        print()
         except Exception as e:
             print("Error assessing accuracy", e)

View File

@@ -652,7 +652,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     return model


-def build_nel_encoder(embed_width, hidden_width, **cfg):
+def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg):
     # TODO proper error
     if "entity_width" not in cfg:
         raise ValueError("entity_width not found")
@@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg):
     entity_width = cfg.get("entity_width")

     with Model.define_operators({">>": chain, "**": clone}):
-        model = Affine(entity_width, entity_width+context_width+1)\
+        model = Affine(entity_width, entity_width+context_width+1+ner_types)\
             >> Affine(1, entity_width, drop_factor=0.0)\
             >> logistic
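
The input width of the first Affine layer grows by one slot per NER type. A quick check of the arithmetic, with illustrative sizes (these exact numbers are not from the commit):

    # Sketch: input width of the first layer after this change (illustrative sizes).
    entity_width, context_width, ner_types = 64, 128, 18
    input_width = entity_width + context_width + 1 + ner_types  # +1 = prior probability
    assert input_width == 211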

View File

@ -1074,8 +1074,9 @@ class EntityLinker(Pipe):
def Model(cls, **cfg): def Model(cls, **cfg):
embed_width = cfg.get("embed_width", 300) embed_width = cfg.get("embed_width", 300)
hidden_width = cfg.get("hidden_width", 128) hidden_width = cfg.get("hidden_width", 128)
type_to_int = cfg.get("type_to_int", dict())
model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg)
return model return model
def __init__(self, **cfg): def __init__(self, **cfg):
@@ -1086,6 +1087,7 @@ class EntityLinker(Pipe):
         self.context_weight = cfg.get("context_weight", 1)
         self.prior_weight = cfg.get("prior_weight", 1)
         self.context_width = cfg.get("context_width")
+        self.type_to_int = cfg.get("type_to_int", dict())

     def set_kb(self, kb):
         self.kb = kb
@@ -1134,11 +1136,22 @@ class EntityLinker(Pipe):
         entity_encodings = []
         cats = []
         priors = []
+        type_vectors = []

         for doc, gold in zip(docs, golds):
+            ents_by_offset = dict()
+            for ent in doc.ents:
+                ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent
             for entity in gold.links:
                 start, end, gold_kb = entity
                 mention = doc.text[start:end]
+                gold_ent = ents_by_offset[str(start) + "_" + str(end)]
+                assert gold_ent is not None
+
+                type_vector = [0 for i in range(len(self.type_to_int))]
+                if len(self.type_to_int) > 0:
+                    type_vector[self.type_to_int[gold_ent.label_]] = 1
+
                 candidates = self.kb.get_candidates(mention)
                 random.shuffle(candidates)
                 nr_neg = 0
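
The lookup above matches each gold link to the predicted entity span with the same character offsets. A self-contained sketch of the same pattern on hypothetical data (plain tuples stand in for spaCy spans and gold links):

    # Sketch: index spans by "start_end" so gold links can be matched by offset.
    ents = [("Douglas Adams", 0, 13, "PERSON"), ("London", 23, 29, "GPE")]  # hypothetical
    ents_by_offset = {str(start) + "_" + str(end): (text, label)
                      for text, start, end, label in ents}

    gold_links = [(0, 13, "Q42")]  # hypothetical (start, end, KB id)
    for start, end, gold_kb in gold_links:
        text, label = ents_by_offset[str(start) + "_" + str(end)]
        print(text, "->", gold_kb, "| NER type:", label)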
@@ -1147,6 +1160,7 @@ class EntityLinker(Pipe):
                     entity_encoding = c.entity_vector
                     entity_encodings.append(entity_encoding)
                     context_docs.append(doc)
+                    type_vectors.append(type_vector)

                     if self.prior_weight > 0:
                         priors.append([c.prior_prob])
@@ -1160,12 +1174,12 @@ class EntityLinker(Pipe):
                         cats.append([0])

         if len(entity_encodings) > 0:
-            assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats)
+            assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors)

             context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
             entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")

-            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i]
+            mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i]
                                  for i in range(len(entity_encodings))]
             pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
             cats = self.model.ops.asarray(cats, dtype="float32")
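
After this change, each per-candidate feature vector is the concatenation of context encoding, entity encoding, prior probability, and the one-hot type vector. A sketch of the layout with illustrative sizes (not from the commit):

    # Sketch: layout of one mention encoding (illustrative sizes).
    context_encoding = [0.0] * 128  # from the tok2vec over the document
    entity_encoding = [0.0] * 64    # candidate vector from the knowledge base
    prior = [0.7]                   # candidate prior probability
    type_vector = [0, 1, 0, 0]      # one-hot NER type of the mention

    mention_encoding = context_encoding + entity_encoding + prior + type_vector
    assert len(mention_encoding) == 128 + 64 + 1 + 4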
@@ -1225,6 +1239,10 @@ class EntityLinker(Pipe):
             if len(doc) > 0:
                 context_encoding = context_encodings[i]
                 for ent in doc.ents:
+                    type_vector = [0 for i in range(len(self.type_to_int))]
+                    if len(self.type_to_int) > 0:
+                        type_vector[self.type_to_int[ent.label_]] = 1
+
                     candidates = self.kb.get_candidates(ent.text)
                     if candidates:
                         random.shuffle(candidates)
@@ -1238,7 +1256,7 @@ class EntityLinker(Pipe):
                         entity_encodings = xp.asarray([c.entity_vector for c in candidates])
                         assert len(entity_encodings) == len(prior_probs)
                         mention_encodings = [list(context_encoding) + list(entity_encodings[i])
-                                             + list(prior_probs[i])
+                                             + list(prior_probs[i]) + type_vector
                                              for i in range(len(entity_encodings))]
                         scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))