adding prior probability as a feature in the model

svlandeg 2019-06-28 16:22:58 +02:00
parent 1c80b85241
commit c664f58246
3 changed files with 40 additions and 24 deletions

View File

@@ -45,7 +45,7 @@ EPOCHS = 10
DROPOUT = 0.2
LEARN_RATE = 0.005
L2 = 1e-6
CONTEXT_WIDTH=128
CONTEXT_WIDTH = 128
def run_pipeline():
@@ -138,7 +138,9 @@ def run_pipeline():
# STEP 6: create and train the entity linking pipe
if train_pipe:
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH})
el_pipe = nlp_2.create_pipe(name='entity_linker',
config={"context_width": CONTEXT_WIDTH,
"pretrained_vectors": nlp_2.vocab.vectors.name})
el_pipe.set_kb(kb_2)
nlp_2.add_pipe(el_pipe, last=True)
@@ -195,11 +197,11 @@ def run_pipeline():
if batchnr > 0:
with el_pipe.model.use_params(optimizer.averages):
el_pipe.context_weight = 1
el_pipe.prior_weight = 0
el_pipe.prior_weight = 1
dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe)
losses['entity_linker'] = losses['entity_linker'] / batchnr
print("Epoch, train loss", itn, round(losses['entity_linker'], 2),
" / dev acc context avg", round(dev_acc_context, 3))
" / dev acc avg", round(dev_acc_context, 3))
# STEP 7: measure the performance of our trained pipe on an independent dev set
if len(dev_data) and measure_performance:
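
The dev-set evaluation above now runs with el_pipe.prior_weight = 1, because the prior probability is fed into the network as a feature rather than mixed into the scores afterwards. As a minimal sketch (the helper itself is hypothetical; it assumes a callable that behaves like this script's _measure_accuracy(dev_data, el_pipe)), one could still compare prior-only, context-only and combined settings by toggling the two weight attributes:

    def compare_settings(el_pipe, dev_data, measure_fn):
        # Hypothetical ablation helper: measure_fn is expected to return
        # (accuracy, accuracy_by_label), like _measure_accuracy in this script.
        settings = {"prior only": (1, 0), "context only": (0, 1), "combined": (1, 1)}
        results = {}
        for name, (prior_w, context_w) in settings.items():
            el_pipe.prior_weight = prior_w
            el_pipe.context_weight = context_w
            accuracy, _ = measure_fn(dev_data, el_pipe)
            results[name] = round(accuracy, 3)
        return results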

View File

@@ -666,15 +666,16 @@ def build_nel_encoder(embed_width, hidden_width, **cfg):
entity_width = cfg.get("entity_width")
with Model.define_operators({">>": chain, "**": clone}):
model = Affine(1, entity_width+context_width, drop_factor=0.0)\
model = Affine(entity_width, entity_width+context_width+1)\
>> Affine(1, entity_width, drop_factor=0.0)\
>> logistic
# context encoder
tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth,
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth,
bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
>> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
>> zero_init(Affine(context_width, hidden_width))
model.tok2vec = tok2vec
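
With this change the feed-forward head expects vectors of length entity_width + context_width + 1 (pooled context encoding, candidate entity vector, scalar prior probability) and routes them through a hidden Affine layer before the final logistic output. A small NumPy sketch of the feature layout; the concrete widths are assumptions (CONTEXT_WIDTH = 128 comes from the example script, entity_width is whatever cfg.get("entity_width") supplies):

    import numpy as np

    context_width = 128   # CONTEXT_WIDTH from the example script
    entity_width = 64     # assumption: value passed via cfg.get("entity_width")

    context_encoding = np.zeros(context_width, dtype="float32")  # pooled Tok2Vec output
    entity_vector = np.zeros(entity_width, dtype="float32")      # candidate vector from the KB
    prior_prob = np.asarray([0.7], dtype="float32")              # candidate's prior probability

    # Concatenation order matches the EntityLinker code: context, entity, prior.
    mention_encoding = np.concatenate([context_encoding, entity_vector, prior_prob])
    assert mention_encoding.shape[0] == entity_width + context_width + 1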

View File

@@ -1132,7 +1132,8 @@ class EntityLinker(Pipe):
context_docs = []
entity_encodings = []
labels = []
cats = []
priors = []
for doc, gold in zip(docs, golds):
for entity in gold.links:
@@ -1143,27 +1144,33 @@
nr_neg = 0
for c in candidates:
kb_id = c.entity_
entity_encoding = c.entity_vector
entity_encodings.append(entity_encoding)
context_docs.append(doc)
if self.prior_weight > 0:
priors.append([c.prior_prob])
else:
priors.append([0])
if kb_id == gold_kb:
entity_encoding = c.entity_vector
entity_encodings.append(entity_encoding)
context_docs.append(doc)
labels.append([1])
else: # elif nr_neg < 1:
cats.append([1])
else:
nr_neg += 1
entity_encoding = c.entity_vector
entity_encodings.append(entity_encoding)
context_docs.append(doc)
labels.append([0])
cats.append([0])
if len(entity_encodings) > 0:
assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats)
context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i]
for i in range(len(entity_encodings))]
pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
labels = self.model.ops.asarray(labels, dtype="float32")
cats = self.model.ops.asarray(cats, dtype="float32")
loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None)
loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None)
mention_gradient = bp_mention(d_scores, sgd=sgd)
context_gradients = [list(x[0:self.context_width]) for x in mention_gradient]
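
During training, every candidate for a gold entity now becomes one example: the context document, the candidate's entity vector and its prior probability (zeroed when prior_weight is 0) form the features, and the binary cat records whether the candidate matches the gold KB id. A framework-free sketch of that expansion; the helper is hypothetical, while the candidate attributes entity_, entity_vector and prior_prob mirror the diff:

    def candidate_examples(candidates, gold_kb, context_encoding, prior_weight=1.0):
        # One (features, label) pair per candidate, as in EntityLinker.update().
        examples = []
        for c in candidates:
            prior = [c.prior_prob] if prior_weight > 0 else [0]
            features = list(context_encoding) + list(c.entity_vector) + prior
            label = [1] if c.entity_ == gold_kb else [0]
            examples.append((features, label))
        return examples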
@@ -1221,13 +1228,19 @@
candidates = self.kb.get_candidates(ent.text)
if candidates:
random.shuffle(candidates)
# this will set the prior probabilities to 0 (just like in training) if their weight is 0
prior_probs = xp.asarray([[c.prior_prob] for c in candidates])
prior_probs *= self.prior_weight
scores = prior_probs
entity_encodings = xp.asarray([c.entity_vector for c in candidates])
mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions)))
if self.context_weight > 0:
entity_encodings = xp.asarray([c.entity_vector for c in candidates])
assert len(entity_encodings) == len(prior_probs)
mention_encodings = [list(context_encoding) + list(entity_encodings[i])
+ list(prior_probs[i])
for i in range(len(entity_encodings))]
scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
# TODO: thresholding
best_index = scores.argmax()
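
At prediction time the old mixing rule, prior_probs + predictions - prior_probs.T·predictions, is dropped: when context_weight > 0 the prior travels into the network alongside the other features and the model output is used directly as the score, and when context_weight is 0 the (weighted) priors alone serve as scores. A rough NumPy sketch of that path; the helper is hypothetical and assumes model maps a batch of [context | entity | prior] rows to one probability each:

    import numpy as np

    def score_candidates(model, context_encoding, candidates,
                         prior_weight=1.0, context_weight=1.0):
        # Zero out the priors (just like in training) if their weight is 0.
        prior_probs = np.asarray([[c.prior_prob] for c in candidates]) * prior_weight
        scores = prior_probs  # fallback: prior-only scoring
        if context_weight > 0:
            entity_encodings = np.asarray([c.entity_vector for c in candidates])
            mention_encodings = [list(context_encoding) + list(entity_encodings[i]) + list(prior_probs[i])
                                 for i in range(len(entity_encodings))]
            scores = model(np.asarray(mention_encodings, dtype="float32"))
        best_candidate = candidates[int(scores.argmax())]
        return best_candidate, scores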