minimal EL pipe

2025-07-07 05:13:18 +03:00 · 2019-03-21 17:33:25 +01:00 · 2019-03-21 17:33:25 +01:00 · d0c763ba44
commit d0c763ba44
parent 26afa4800f
5 changed files with 37 additions and 101 deletions
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@ -109,7 +109,7 @@ cdef class KnowledgeBase:
        """Add an entry to the knowledge base."""
        # This is what we'll map the hash key to. It's where the entry will sit
        # in the vector of entries, so we can get it later.
-        cdef int64_t entity_index = self._entries.size()
+        cdef int64_t new_index = self._entries.size()
        self._entries.push_back(
            _EntryC(
                entity_hash=entity_hash,
@ -117,22 +117,22 @@ cdef class KnowledgeBase:
                feats_row=feats_row,
                prob=prob
            ))
-        self._entry_index[entity_hash] = entity_index
+        self._entry_index[entity_hash] = new_index
-        return entity_index
+        return new_index
    cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
        """Connect a mention to a list of potential entities with their prior probabilities ."""
-        cdef int64_t alias_index = self._aliases_table.size()
+        cdef int64_t new_index = self._aliases_table.size()
        self._aliases_table.push_back(
            _AliasC(
                entry_indices=entry_indices,
                probs=probs
            ))
-        self._alias_index[alias_hash] = alias_index
+        self._alias_index[alias_hash] = new_index
-        return alias_index
+        return new_index
-    cdef inline create_empty_vectors(self):
+    cdef inline _create_empty_vectors(self):
        """ 
        Making sure the first element of each vector is a dummy,
        because the PreshMap maps pointing to indices in these vectors cannot contain 0 as a value
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -65,7 +65,7 @@ cdef class KnowledgeBase:
        self._alias_index = PreshMap()
        self.mem = Pool()
        self.strings = StringStore()
-        self.create_empty_vectors()
+        self._create_empty_vectors()
    def __len__(self):
        return self.get_size_entities()
@ -151,4 +151,3 @@ cdef class KnowledgeBase:
                          prior_prob=prob)
                for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
                if entry_index != 0]
--- a/spacy/language.py
+++ b/spacy/language.py
@ -209,6 +209,10 @@ class Language(object):
    def entity(self):
        return self.get_pipe("ner")
    @property
    def linker(self):
        return self.get_pipe("el")
    @property
    def matcher(self):
        return self.get_pipe("matcher")
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
    @classmethod
    def Model(cls, nr_class=1, **cfg):
-        embed_size = util.env_opt("embed_size", 2000)
+        # TODO: non-dummy EL implementation
-        if "token_vector_width" in cfg:
+        return None
            token_vector_width = cfg["token_vector_width"]
        else:
            token_vector_width = util.env_opt("token_vector_width", 96)
        if cfg.get('architecture') == 'simple_cnn':
            tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
            return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
        else:
            return None # build_text_classifier(nr_class, **cfg)
-
+    def __init__(self, model=True, **cfg):
-    def __init__(self, vocab, model=True, **cfg):
+        self.model = False
        self.vocab = vocab
        self.model = model
        self._rehearsal_model = None
        self.cfg = dict(cfg)
        self.kb = self.cfg["kb"]
    def __call__(self, doc):
-        # scores, tensors = self.predict([doc])
+        self.set_annotations([doc], scores=None, tensors=None)
        scores, tensors = None, None
        self.set_annotations([doc], scores, tensors=tensors)
        return doc
    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Apply the pipe to a stream of documents.
        Both __call__ and pipe should delegate to the `predict()`
        and `set_annotations()` methods.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
-            scores, tensors = self.predict(docs)
+            self.set_annotations(docs, scores=None, tensors=None)
            self.set_annotations(docs, scores, tensors=tensors)
            yield from docs
    def predict(self, docs):
        # self.require_model()
        scores = self.model(docs)
        scores = self.model.ops.asarray(scores)
        tensors = [doc.tensor for doc in docs]
        return scores, tensors
    def set_annotations(self, docs, scores, tensors=None):
        # TODO Sofie: actually implement this class instead of dummy implementation
        for i, doc in enumerate(docs):
@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
                    for token in ent:
                        token.ent_kb_id_ = "Q42"
    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        loss, d_scores = self.get_loss(docs, golds, scores)
        bp_scores(d_scores, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
    def rehearse(self, docs, drop=0., sgd=None, losses=None):
        if self._rehearsal_model is None:
            return
        scores, bp_scores = self.model.begin_update(docs, drop=drop)
        target = self._rehearsal_model(docs)
        gradient = scores - target
        bp_scores(gradient, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.0)
            losses[self.name] += (gradient**2).sum()
    def get_loss(self, docs, golds, scores):
-        truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
+        # TODO
-        not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
+        pass
        for i, gold in enumerate(golds):
            for j, label in enumerate(self.labels):
                if label in gold.cats:
                    truths[i, j] = gold.cats[label]
                else:
                    not_missing[i, j] = 0.
        truths = self.model.ops.asarray(truths)
        not_missing = self.model.ops.asarray(not_missing)
        d_scores = (scores-truths) / scores.shape[0]
        d_scores *= not_missing
        mean_square_error = (d_scores**2).sum(axis=1).mean()
        return float(mean_square_error), d_scores
    def add_label(self, label):
-        if label in self.labels:
+        # TODO
-            return 0
+        pass
        if self.model not in (None, True, False):
            # This functionality was available previously, but was broken.
            # The problem is that we resize the last layer, but the last layer
            # is actually just an ensemble. We're not resizing the child layers
            # -- a huge problem.
            raise ValueError(Errors.E116)
            #smaller = self.model._layers[-1]
            #larger = Affine(len(self.labels)+1, smaller.nI)
            #copy_array(larger.W[:smaller.nO], smaller.W)
            #copy_array(larger.b[:smaller.nO], smaller.b)
            #self.model._layers[-1] = larger
        self.labels = tuple(list(self.labels) + [label])
        return 1
    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
        if self.model is True:
            self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
            self.model = self.Model(len(self.labels), **self.cfg)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
--- a/spacy/sandbox_test_sofie/testing_el.py
+++ b/spacy/sandbox_test_sofie/testing_el.py
@ -37,16 +37,14 @@ def create_kb():
    print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
    print()
-    for alias in [alias1, "rubbish", alias3]:
+    return mykb
        candidates = mykb.get_candidates(alias)
        print(len(candidates), "candidates for", alias)
-def add_el():
+def add_el(kb):
    nlp = spacy.load('en_core_web_sm')
    print("pipes before:", nlp.pipe_names)
-    el_pipe = nlp.create_pipe(name='el')
+    el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
    nlp.add_pipe(el_pipe, last=True)
    print("pipes after:", nlp.pipe_names)
@ -62,7 +60,12 @@ def add_el():
    for ent in doc.ents:
        print("ent", ent.text, ent.label_, ent.kb_id_)
    print()
    for alias in ["douglassss", "rubbish", "adam"]:
        candidates = nlp.linker.kb.get_candidates(alias)
        print(len(candidates), "candidates for", alias)
 if __name__ == "__main__":
-    # add_el()
+    mykb = create_kb()
-    create_kb()
+    add_el(mykb)