diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a61af3660..c0a7e3c66 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,7 +42,7 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.2 +DROPOUT = 0.5 LEARN_RATE = 0.005 L2 = 1e-6 CONTEXT_WIDTH = 128 @@ -73,10 +73,10 @@ def run_pipeline(): measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again - to_write_nlp = True + to_write_nlp = False to_read_nlp = False test_from_file = False @@ -138,9 +138,12 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} + print(" -analysing", len(type_to_int), "different entity types") el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH, - "pretrained_vectors": nlp_2.vocab.vectors.name}) + "pretrained_vectors": nlp_2.vocab.vectors.name, + "type_to_int": type_to_int}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -151,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 500000 - dev_limit = 5000 + train_limit = 50000 + dev_limit = 50000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -219,7 +222,7 @@ def run_pipeline(): # measuring combined accuracy (prior + context) el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -264,7 +267,7 @@ def run_pipeline(): nlp_3 = spacy.load(NLP_2_DIR) el_pipe = nlp_3.get_pipe("entity_linker") - dev_limit = 10000 + dev_limit = 5000 dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, @@ -273,7 +276,7 @@ def run_pipeline(): print("Dev testing from file on", len(dev_data), "articles") print() - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -281,7 +284,7 @@ def run_pipeline(): print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, el_pipe=None): +def _measure_accuracy(data, el_pipe=None, error_analysis=False): # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() incorrect_by_label = dict() @@ -312,6 +315,10 @@ def _measure_accuracy(data, el_pipe=None): else: incorrect = incorrect_by_label.get(ent_label, 0) incorrect_by_label[ent_label] = incorrect + 1 + if error_analysis: + print(ent.text, "in", doc) + print("Predicted", pred_entity, "should have been", gold_entity) + print() except Exception as e: print("Error assessing accuracy", e) diff --git a/spacy/_ml.py b/spacy/_ml.py index 07037f653..cca324b45 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,7 +652,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(embed_width, hidden_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): # TODO proper error if "entity_width" not in cfg: raise ValueError("entity_width not found") @@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1)\ + model = Affine(entity_width, entity_width+context_width+1+ner_types)\ >> Affine(1, entity_width, drop_factor=0.0)\ >> logistic diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d3f6fa776..f1a864fcf 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1074,8 +1074,9 @@ class EntityLinker(Pipe): def Model(cls, **cfg): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) + type_to_int = cfg.get("type_to_int", dict()) - model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) + model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) return model def __init__(self, **cfg): @@ -1086,6 +1087,7 @@ class EntityLinker(Pipe): self.context_weight = cfg.get("context_weight", 1) self.prior_weight = cfg.get("prior_weight", 1) self.context_width = cfg.get("context_width") + self.type_to_int = cfg.get("type_to_int", dict()) def set_kb(self, kb): self.kb = kb @@ -1134,11 +1136,22 @@ class EntityLinker(Pipe): entity_encodings = [] cats = [] priors = [] + type_vectors = [] for doc, gold in zip(docs, golds): + ents_by_offset = dict() + for ent in doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] + + gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] + assert gold_ent is not None + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[gold_ent.label_]] = 1 + candidates = self.kb.get_candidates(mention) random.shuffle(candidates) nr_neg = 0 @@ -1147,6 +1160,7 @@ class EntityLinker(Pipe): entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) context_docs.append(doc) + type_vectors.append(type_vector) if self.prior_weight > 0: priors.append([c.prior_prob]) @@ -1160,12 +1174,12 @@ class EntityLinker(Pipe): cats.append([0]) if len(entity_encodings) > 0: - assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) + assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] for i in range(len(entity_encodings))] pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) cats = self.model.ops.asarray(cats, dtype="float32") @@ -1225,6 +1239,10 @@ class EntityLinker(Pipe): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[ent.label_]] = 1 + candidates = self.kb.get_candidates(ent.text) if candidates: random.shuffle(candidates) @@ -1238,7 +1256,7 @@ class EntityLinker(Pipe): entity_encodings = xp.asarray([c.entity_vector for c in candidates]) assert len(entity_encodings) == len(prior_probs) mention_encodings = [list(context_encoding) + list(entity_encodings[i]) - + list(prior_probs[i]) + + list(prior_probs[i]) + type_vector for i in range(len(entity_encodings))] scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))