implementing el pipe in pipes.pyx (not tested yet)
This commit is contained in:
parent d83a1e3052
commit fb37cdb2d3
@@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN
 # from spacy.cli.pretrain import get_cossim_loss
 from spacy.matcher import PhraseMatcher

-""" TODO: this code needs to be implemented in pipes.pyx"""
-

 class EL_Model:

@@ -45,7 +43,7 @@ class EL_Model:

     DROP = 0.1
     LEARN_RATE = 0.001
-    EPOCHS = 20
+    EPOCHS = 5
     L2 = 1e-6

     name = "entity_linker"
@@ -213,8 +211,7 @@ class EL_Model:
         if avg:
             with self.article_encoder.use_params(self.sgd_article.averages) \
                  and self.desc_encoder.use_params(self.sgd_desc.averages)\
-                 and self.sent_encoder.use_params(self.sgd_sent.averages)\
-                 and self.cont_encoder.use_params(self.sgd_cont.averages):
+                 and self.sent_encoder.use_params(self.sgd_sent.averages):
                 desc_encodings = self.desc_encoder(desc_docs)
                 doc_encoding = self.article_encoder([article_doc])
                 sent_encoding = self.sent_encoder([sent_doc])
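A side note on the averaging pattern above: chaining several `use_params(...)` calls with `and` inside one `with` statement only enters the last context manager, because `a and b` evaluates to a single object before `with` ever sees it. A minimal, self-contained illustration of that behaviour (nothing here is part of the commit):

from contextlib import contextmanager

@contextmanager
def use_params(name):
    print("enter", name)
    yield
    print("exit", name)

# Prints only "enter b" / "exit b": `use_params("a") and use_params("b")`
# evaluates to the second manager, and `with` enters just that one.
with use_params("a") and use_params("b"):
    pass

contextlib.ExitStack or nested `with` blocks are the usual way to apply several sets of averaged parameters at once.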
@@ -226,7 +223,13 @@ class EL_Model:

         concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]

-        cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+        if avg:
+            with self.cont_encoder.use_params(self.sgd_cont.averages):
+                cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
+        else:
+            cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
         context_enc = np.transpose(cont_encodings)

         highest_sim = -5
@@ -298,8 +301,8 @@ class EL_Model:
         self.sgd_desc.learn_rate = self.LEARN_RATE
         self.sgd_desc.L2 = self.L2

-    def get_loss(self, v1, v2, targets):
-        loss, gradients = self.get_cossim_loss(v1, v2, targets)
+    def get_loss(self, pred, gold, targets):
+        loss, gradients = self.get_cossim_loss(pred, gold, targets)
         return loss, gradients

     def get_cossim_loss(self, yh, y, t):
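The cosine-similarity loss used here compares each predicted vector to its gold vector and returns the loss together with a negated gradient, which is what the `return loss, -inverse` line in the next hunk reflects (its name echoes the helper in spacy.cli.pretrain, whose import is commented out at the top of the file). A rough NumPy sketch of that kind of loss, ignoring the `targets` argument and meant purely as illustration:

import numpy as np

def cossim_loss_sketch(yh, y, eps=1e-12):
    # yh: predicted vectors (n, d); y: gold vectors (n, d)
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True) + eps
    norm_y = np.linalg.norm(y, axis=1, keepdims=True) + eps
    cosine = (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)
    loss = np.abs(cosine - 1).sum()  # zero when every pair is perfectly aligned
    # standard derivative of cosine similarity with respect to yh
    d_yh = (y / (norm_yh * norm_y)) - (cosine * yh / norm_yh ** 2)
    return loss, -d_yh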
@@ -327,8 +330,6 @@ class EL_Model:
         return loss, -inverse

     def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
-        all_clusters = list(entity_clusters.keys())
-
         arts_list = list()
         sents_list = list()
         descs_list = list()
@@ -111,7 +111,7 @@ if __name__ == "__main__":
     print("STEP 6: training", datetime.datetime.now())
     my_nlp = spacy.load('en_core_web_md')
     trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
+    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20)
     print()

     # STEP 7: apply the EL algorithm on the dev dataset
spacy/_ml.py | 22
@@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     return model


+def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
+    conv_depth = cfg.get("conv_depth", 2)
+    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
+
+    with Model.define_operators({">>": chain, "**": clone}):
+        convolution = Residual((ExtractWindow(nW=1) >>
+                                LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
+
+        encoder = SpacyVectors \
+            >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \
+            >> flatten_add_lengths \
+            >> ParametricAttention(hidden_width) \
+            >> Pooling(mean_pool) \
+            >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
+            >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
+
+        # TODO: ReLu or LN(Maxout) ?
+        # sum_pool or mean_pool ?
+
+    encoder.nO = end_width
+    return encoder
+
 @layerize
 def flatten(seqs, drop=0.0):
     ops = Model.ops
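Taken on its own, the new builder turns a list of Doc objects into one fixed-size vector per document. A hypothetical smoke test of the layer (the loaded pipeline and the chosen widths are assumptions, not part of the commit):

import spacy
from spacy._ml import build_nel_encoder

nlp = spacy.load("en_core_web_md")  # SpacyVectors needs a model with pretrained vectors
encoder = build_nel_encoder(in_width=300, hidden_width=32, end_width=64)

docs = [nlp("Douglas Adams was an English author."),
        nlp("The Hitchhiker's Guide to the Galaxy is a comedy series.")]
encodings = encoder(docs)  # expected: one row of width end_width per Doc
assert encoder.nO == 64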
spacy/pipeline/pipes.pyx

@@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical, copy_array

+from spacy.cli.pretrain import get_cossim_loss
+
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -23,14 +25,17 @@ from ..vocab cimport Vocab
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from .._ml import Tok2Vec, build_tagger_model
+from .._ml import Tok2Vec, build_tagger_model, cosine
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier
+from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util

+# TODO: remove
+from examples.pipeline.wiki_entity_linking import kb_creator
+

 def _load_cfg(path):
     if path.exists():
@@ -1065,50 +1070,141 @@ class EntityLinker(Pipe):
     name = 'entity_linker'

     @classmethod
-    def Model(cls, nr_class=1, **cfg):
-        # TODO: non-dummy EL implementation
-        return None
-
-    def __init__(self, model=True, **cfg):
-        self.model = False
+    def Model(cls, **cfg):
+        embed_width = cfg.get("embed_width", 300)
+        hidden_width = cfg.get("hidden_width", 32)
+        entity_width = cfg.get("entity_width", 64)
+        article_width = cfg.get("article_width", 128)
+        sent_width = cfg.get("sent_width", 64)
+
+        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+
+        article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
+        sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
+
+        # dimension of the mention encoder needs to match the dimension of the entity encoder
+        mention_width = entity_encoder.nO
+        mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
+
+        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+
+    def __init__(self, **cfg):
+        # TODO: bring-your-own-model
+        self.mention_encoder = True
+
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]

+        # TODO: fix this. store entity vectors in the KB ?
+        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
+
+    def use_avg_params(self):
+        """Modify the pipe's encoders/models, to use their average parameter values."""
+        with self.article_encoder.use_params(self.sgd_article.averages) \
+             and self.entity_encoder.use_params(self.sgd_entity.averages)\
+             and self.sent_encoder.use_params(self.sgd_sent.averages) \
+             and self.mention_encoder.use_params(self.sgd_mention.averages):
+            yield
+
+    def require_model(self):
+        """Raise an error if the component's model is not initialized."""
+        if getattr(self, "mention_encoder", None) in (None, True, False):
+            raise ValueError(Errors.E109.format(name=self.name))
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
+        if self.mention_encoder is True:
+            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.sgd_article = create_default_optimizer(self.article_encoder.ops)
+            self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
+            self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
+            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
+
+    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        self.require_model()
+
+        entity_docs, article_docs, sentence_docs = docs
+        assert len(entity_docs) == len(article_docs) == len(sentence_docs)
+
+        if isinstance(entity_docs, Doc):
+            entity_docs = [entity_docs]
+            article_docs = [article_docs]
+            sentence_docs = [sentence_docs]
+
+        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
+        sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
+
+        concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
+                            range(len(article_docs))]
+        mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP)
+
+        loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+
+        mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont)
+
+        # gradient : concat (doc+sent) vs. desc
+        sent_start = self.article_encoder.nO
+        sent_gradients = list()
+        doc_gradients = list()
+        for x in mention_gradient:
+            doc_gradients.append(list(x[0:sent_start]))
+            sent_gradients.append(list(x[sent_start:]))
+
+        bp_doc(doc_gradients, sgd=self.sgd_article)
+        bp_sent(sent_gradients, sgd=self.sgd_sent)
+
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+        return loss
+
+    def get_loss(self, docs, golds, scores):
+        loss, gradients = get_cossim_loss(scores, golds)
+        return loss, gradients
+
     def __call__(self, doc):
-        self.set_annotations([doc], scores=None, tensors=None)
+        entities, kb_ids = self.predict([doc])
+        self.set_annotations([doc], entities, kb_ids)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        """Apply the pipe to a stream of documents.
-        Both __call__ and pipe should delegate to the `predict()`
-        and `set_annotations()` methods.
-        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            self.set_annotations(docs, scores=None, tensors=None)
+            entities, kb_ids = self.predict(docs)
+            self.set_annotations(docs, entities, kb_ids)
             yield from docs

-    def set_annotations(self, docs, scores, tensors=None):
-        """
-        Currently implemented as taking the KB entry with highest prior probability for each named entity
-        TODO: actually use context etc
-        """
-        for i, doc in enumerate(docs):
-            for ent in doc.ents:
+    def predict(self, docs):
+        self.require_model()
+        for i, article_doc in enumerate(docs):
+            doc_encoding = self.article_encoder([article_doc])
+            for ent in article_doc.ents:
+                sent_doc = ent.sent.as_doc()
+                sent_encoding = self.sent_encoder([sent_doc])
+                concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
+                mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]]))
+                mention_enc_t = np.transpose(mention_encoding)
+
                 candidates = self.kb.get_candidates(ent.text)
                 if candidates:
-                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
-                    for token in ent:
-                        token.ent_kb_id_ = best_candidate.entity_
-
-    def get_loss(self, docs, golds, scores):
-        # TODO
-        pass
-
-    def add_label(self, label):
-        # TODO
-        pass
+                    highest_sim = -5
+                    best_i = -1
+                    with self.use_avg_params:
+                        for c in candidates:
+                            kb_id = c.entity_
+                            description = self.id_to_descr.get(kb_id)
+                            entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
+                            sim = cosine(entity_encodings, mention_enc_t)
+                            if sim >= highest_sim:
+                                best_i = i
+                                highest_sim = sim
+
+                    # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob)
+
+    def set_annotations(self, docs, entities, kb_ids=None):
+        for token, kb_id in zip(entities, kb_ids):
+            token.ent_kb_id_ = kb_id


 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
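One detail of the new `update()` worth spelling out: the mention encoder consumes the concatenation [article encoding | sentence encoding], so its input gradient has to be sliced back into an article part and a sentence part before `bp_doc` and `bp_sent` can be called. A small, self-contained illustration of that slicing (the widths are made up; in the diff, `sent_start` is `self.article_encoder.nO`):

import numpy as np

article_width, sent_width = 128, 64
mention_gradient = np.random.rand(2, article_width + sent_width)  # one row per mention

sent_start = article_width
doc_gradients = [list(x[0:sent_start]) for x in mention_gradient]
sent_gradients = [list(x[sent_start:]) for x in mention_gradient]

assert len(doc_gradients[0]) == article_width
assert len(sent_gradients[0]) == sent_width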