From d0c763ba447282d53ac7d25354afde468f0e4a73 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 21 Mar 2019 17:33:25 +0100 Subject: [PATCH] minimal EL pipe --- spacy/kb.pxd | 14 ++-- spacy/kb.pyx | 3 +- spacy/language.py | 4 + spacy/pipeline/pipes.pyx | 100 ++++--------------------- spacy/sandbox_test_sofie/testing_el.py | 17 +++-- 5 files changed, 37 insertions(+), 101 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4ae34bfa7..5fd239998 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -109,7 +109,7 @@ cdef class KnowledgeBase: """Add an entry to the knowledge base.""" # This is what we'll map the hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. - cdef int64_t entity_index = self._entries.size() + cdef int64_t new_index = self._entries.size() self._entries.push_back( _EntryC( entity_hash=entity_hash, @@ -117,22 +117,22 @@ cdef class KnowledgeBase: feats_row=feats_row, prob=prob )) - self._entry_index[entity_hash] = entity_index - return entity_index + self._entry_index[entity_hash] = new_index + return new_index cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" - cdef int64_t alias_index = self._aliases_table.size() + cdef int64_t new_index = self._aliases_table.size() self._aliases_table.push_back( _AliasC( entry_indices=entry_indices, probs=probs )) - self._alias_index[alias_hash] = alias_index - return alias_index + self._alias_index[alias_hash] = new_index + return new_index - cdef inline create_empty_vectors(self): + cdef inline _create_empty_vectors(self): """ Making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 62080e1be..33a79da04 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -65,7 +65,7 @@ cdef class KnowledgeBase: self._alias_index = PreshMap() self.mem = Pool() self.strings = StringStore() - self.create_empty_vectors() + self._create_empty_vectors() def __len__(self): return self.get_size_entities() @@ -151,4 +151,3 @@ cdef class KnowledgeBase: prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] - diff --git a/spacy/language.py b/spacy/language.py index 736899341..f80d8699d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -209,6 +209,10 @@ class Language(object): def entity(self): return self.get_pipe("ner") + @property + def linker(self): + return self.get_pipe("el") + @property def matcher(self): return self.get_pipe("matcher") diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e1e5471be..5866518a7 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1045,44 +1045,28 @@ class EntityLinker(Pipe): @classmethod def Model(cls, nr_class=1, **cfg): - embed_size = util.env_opt("embed_size", 2000) - if "token_vector_width" in cfg: - token_vector_width = cfg["token_vector_width"] - else: - token_vector_width = util.env_opt("token_vector_width", 96) - if cfg.get('architecture') == 'simple_cnn': - tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg) - return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg) - else: - return None # build_text_classifier(nr_class, **cfg) + # TODO: non-dummy EL implementation + return None - - def __init__(self, vocab, model=True, **cfg): - self.vocab = vocab - self.model = model - self._rehearsal_model = None + def __init__(self, model=True, **cfg): + self.model = False self.cfg = dict(cfg) + self.kb = self.cfg["kb"] def __call__(self, doc): - # scores, tensors = self.predict([doc]) - scores, tensors = None, None - self.set_annotations([doc], scores, tensors=tensors) + self.set_annotations([doc], scores=None, tensors=None) return doc def pipe(self, stream, batch_size=128, n_threads=-1): + """Apply the pipe to a stream of documents. + Both __call__ and pipe should delegate to the `predict()` + and `set_annotations()` methods. + """ for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - scores, tensors = self.predict(docs) - self.set_annotations(docs, scores, tensors=tensors) + self.set_annotations(docs, scores=None, tensors=None) yield from docs - def predict(self, docs): - # self.require_model() - scores = self.model(docs) - scores = self.model.ops.asarray(scores) - tensors = [doc.tensor for doc in docs] - return scores, tensors - def set_annotations(self, docs, scores, tensors=None): # TODO Sofie: actually implement this class instead of dummy implementation for i, doc in enumerate(docs): @@ -1091,67 +1075,13 @@ class EntityLinker(Pipe): for token in ent: token.ent_kb_id_ = "Q42" - def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): - scores, bp_scores = self.model.begin_update(docs, drop=drop) - loss, d_scores = self.get_loss(docs, golds, scores) - bp_scores(d_scores, sgd=sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - - def rehearse(self, docs, drop=0., sgd=None, losses=None): - if self._rehearsal_model is None: - return - scores, bp_scores = self.model.begin_update(docs, drop=drop) - target = self._rehearsal_model(docs) - gradient = scores - target - bp_scores(gradient, sgd=sgd) - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += (gradient**2).sum() - def get_loss(self, docs, golds, scores): - truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') - not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') - for i, gold in enumerate(golds): - for j, label in enumerate(self.labels): - if label in gold.cats: - truths[i, j] = gold.cats[label] - else: - not_missing[i, j] = 0. - truths = self.model.ops.asarray(truths) - not_missing = self.model.ops.asarray(not_missing) - d_scores = (scores-truths) / scores.shape[0] - d_scores *= not_missing - mean_square_error = (d_scores**2).sum(axis=1).mean() - return float(mean_square_error), d_scores + # TODO + pass def add_label(self, label): - if label in self.labels: - return 0 - if self.model not in (None, True, False): - # This functionality was available previously, but was broken. - # The problem is that we resize the last layer, but the last layer - # is actually just an ensemble. We're not resizing the child layers - # -- a huge problem. - raise ValueError(Errors.E116) - #smaller = self.model._layers[-1] - #larger = Affine(len(self.labels)+1, smaller.nI) - #copy_array(larger.W[:smaller.nO], smaller.W) - #copy_array(larger.b[:smaller.nO], smaller.b) - #self.model._layers[-1] = larger - self.labels = tuple(list(self.labels) + [label]) - return 1 - - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, - **kwargs): - if self.model is True: - self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors') - self.model = self.Model(len(self.labels), **self.cfg) - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd + # TODO + pass __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker'] diff --git a/spacy/sandbox_test_sofie/testing_el.py b/spacy/sandbox_test_sofie/testing_el.py index 03261806b..f6296bf89 100644 --- a/spacy/sandbox_test_sofie/testing_el.py +++ b/spacy/sandbox_test_sofie/testing_el.py @@ -37,16 +37,14 @@ def create_kb(): print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print() - for alias in [alias1, "rubbish", alias3]: - candidates = mykb.get_candidates(alias) - print(len(candidates), "candidates for", alias) + return mykb -def add_el(): +def add_el(kb): nlp = spacy.load('en_core_web_sm') print("pipes before:", nlp.pipe_names) - el_pipe = nlp.create_pipe(name='el') + el_pipe = nlp.create_pipe(name='el', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) print("pipes after:", nlp.pipe_names) @@ -62,7 +60,12 @@ def add_el(): for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) + print() + for alias in ["douglassss", "rubbish", "adam"]: + candidates = nlp.linker.kb.get_candidates(alias) + print(len(candidates), "candidates for", alias) + if __name__ == "__main__": - # add_el() - create_kb() + mykb = create_kb() + add_el(mykb)