minimal EL pipe

This commit is contained in:
svlandeg 2019-03-21 17:33:25 +01:00
parent 26afa4800f
commit d0c763ba44
5 changed files with 37 additions and 101 deletions

View File

@ -109,7 +109,7 @@ cdef class KnowledgeBase:
"""Add an entry to the knowledge base.""" """Add an entry to the knowledge base."""
# This is what we'll map the hash key to. It's where the entry will sit # This is what we'll map the hash key to. It's where the entry will sit
# in the vector of entries, so we can get it later. # in the vector of entries, so we can get it later.
cdef int64_t entity_index = self._entries.size() cdef int64_t new_index = self._entries.size()
self._entries.push_back( self._entries.push_back(
_EntryC( _EntryC(
entity_hash=entity_hash, entity_hash=entity_hash,
@ -117,22 +117,22 @@ cdef class KnowledgeBase:
feats_row=feats_row, feats_row=feats_row,
prob=prob prob=prob
)) ))
self._entry_index[entity_hash] = entity_index self._entry_index[entity_hash] = new_index
return entity_index return new_index
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
"""Connect a mention to a list of potential entities with their prior probabilities .""" """Connect a mention to a list of potential entities with their prior probabilities ."""
cdef int64_t alias_index = self._aliases_table.size() cdef int64_t new_index = self._aliases_table.size()
self._aliases_table.push_back( self._aliases_table.push_back(
_AliasC( _AliasC(
entry_indices=entry_indices, entry_indices=entry_indices,
probs=probs probs=probs
)) ))
self._alias_index[alias_hash] = alias_index self._alias_index[alias_hash] = new_index
return alias_index return new_index
cdef inline create_empty_vectors(self): cdef inline _create_empty_vectors(self):
""" """
Making sure the first element of each vector is a dummy, Making sure the first element of each vector is a dummy,
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value because the PreshMap maps pointing to indices in these vectors can not contain 0 as value

View File

@ -65,7 +65,7 @@ cdef class KnowledgeBase:
self._alias_index = PreshMap() self._alias_index = PreshMap()
self.mem = Pool() self.mem = Pool()
self.strings = StringStore() self.strings = StringStore()
self.create_empty_vectors() self._create_empty_vectors()
def __len__(self): def __len__(self):
return self.get_size_entities() return self.get_size_entities()
@ -151,4 +151,3 @@ cdef class KnowledgeBase:
prior_prob=prob) prior_prob=prob)
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
if entry_index != 0] if entry_index != 0]

View File

@ -209,6 +209,10 @@ class Language(object):
def entity(self): def entity(self):
return self.get_pipe("ner") return self.get_pipe("ner")
@property
def linker(self):
return self.get_pipe("el")
@property @property
def matcher(self): def matcher(self):
return self.get_pipe("matcher") return self.get_pipe("matcher")

View File

@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
@classmethod @classmethod
def Model(cls, nr_class=1, **cfg): def Model(cls, nr_class=1, **cfg):
embed_size = util.env_opt("embed_size", 2000) # TODO: non-dummy EL implementation
if "token_vector_width" in cfg: return None
token_vector_width = cfg["token_vector_width"]
else:
token_vector_width = util.env_opt("token_vector_width", 96)
if cfg.get('architecture') == 'simple_cnn':
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
else:
return None # build_text_classifier(nr_class, **cfg)
def __init__(self, model=True, **cfg):
def __init__(self, vocab, model=True, **cfg): self.model = False
self.vocab = vocab
self.model = model
self._rehearsal_model = None
self.cfg = dict(cfg) self.cfg = dict(cfg)
self.kb = self.cfg["kb"]
def __call__(self, doc): def __call__(self, doc):
# scores, tensors = self.predict([doc]) self.set_annotations([doc], scores=None, tensors=None)
scores, tensors = None, None
self.set_annotations([doc], scores, tensors=tensors)
return doc return doc
def pipe(self, stream, batch_size=128, n_threads=-1): def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in util.minibatch(stream, size=batch_size): for docs in util.minibatch(stream, size=batch_size):
docs = list(docs) docs = list(docs)
scores, tensors = self.predict(docs) self.set_annotations(docs, scores=None, tensors=None)
self.set_annotations(docs, scores, tensors=tensors)
yield from docs yield from docs
def predict(self, docs):
# self.require_model()
scores = self.model(docs)
scores = self.model.ops.asarray(scores)
tensors = [doc.tensor for doc in docs]
return scores, tensors
def set_annotations(self, docs, scores, tensors=None): def set_annotations(self, docs, scores, tensors=None):
# TODO Sofie: actually implement this class instead of dummy implementation # TODO Sofie: actually implement this class instead of dummy implementation
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
for token in ent: for token in ent:
token.ent_kb_id_ = "Q42" token.ent_kb_id_ = "Q42"
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
scores, bp_scores = self.model.begin_update(docs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
bp_scores(d_scores, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
def rehearse(self, docs, drop=0., sgd=None, losses=None):
if self._rehearsal_model is None:
return
scores, bp_scores = self.model.begin_update(docs, drop=drop)
target = self._rehearsal_model(docs)
gradient = scores - target
bp_scores(gradient, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += (gradient**2).sum()
def get_loss(self, docs, golds, scores): def get_loss(self, docs, golds, scores):
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f') # TODO
not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f') pass
for i, gold in enumerate(golds):
for j, label in enumerate(self.labels):
if label in gold.cats:
truths[i, j] = gold.cats[label]
else:
not_missing[i, j] = 0.
truths = self.model.ops.asarray(truths)
not_missing = self.model.ops.asarray(not_missing)
d_scores = (scores-truths) / scores.shape[0]
d_scores *= not_missing
mean_square_error = (d_scores**2).sum(axis=1).mean()
return float(mean_square_error), d_scores
def add_label(self, label): def add_label(self, label):
if label in self.labels: # TODO
return 0 pass
if self.model not in (None, True, False):
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# -- a huge problem.
raise ValueError(Errors.E116)
#smaller = self.model._layers[-1]
#larger = Affine(len(self.labels)+1, smaller.nI)
#copy_array(larger.W[:smaller.nO], smaller.W)
#copy_array(larger.b[:smaller.nO], smaller.b)
#self.model._layers[-1] = larger
self.labels = tuple(list(self.labels) + [label])
return 1
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
**kwargs):
if self.model is True:
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
self.model = self.Model(len(self.labels), **self.cfg)
link_vectors_to_models(self.vocab)
if sgd is None:
sgd = self.create_optimizer()
return sgd
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker'] __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']

View File

@ -37,16 +37,14 @@ def create_kb():
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases()) print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
print() print()
for alias in [alias1, "rubbish", alias3]: return mykb
candidates = mykb.get_candidates(alias)
print(len(candidates), "candidates for", alias)
def add_el(): def add_el(kb):
nlp = spacy.load('en_core_web_sm') nlp = spacy.load('en_core_web_sm')
print("pipes before:", nlp.pipe_names) print("pipes before:", nlp.pipe_names)
el_pipe = nlp.create_pipe(name='el') el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
nlp.add_pipe(el_pipe, last=True) nlp.add_pipe(el_pipe, last=True)
print("pipes after:", nlp.pipe_names) print("pipes after:", nlp.pipe_names)
@ -62,7 +60,12 @@ def add_el():
for ent in doc.ents: for ent in doc.ents:
print("ent", ent.text, ent.label_, ent.kb_id_) print("ent", ent.text, ent.label_, ent.kb_id_)
print()
for alias in ["douglassss", "rubbish", "adam"]:
candidates = nlp.linker.kb.get_candidates(alias)
print(len(candidates), "candidates for", alias)
if __name__ == "__main__": if __name__ == "__main__":
# add_el() mykb = create_kb()
create_kb() add_el(mykb)