mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-14 03:26:24 +03:00
minimal EL pipe
This commit is contained in:
parent
26afa4800f
commit
d0c763ba44
14
spacy/kb.pxd
14
spacy/kb.pxd
|
@ -109,7 +109,7 @@ cdef class KnowledgeBase:
|
||||||
"""Add an entry to the knowledge base."""
|
"""Add an entry to the knowledge base."""
|
||||||
# This is what we'll map the hash key to. It's where the entry will sit
|
# This is what we'll map the hash key to. It's where the entry will sit
|
||||||
# in the vector of entries, so we can get it later.
|
# in the vector of entries, so we can get it later.
|
||||||
cdef int64_t entity_index = self._entries.size()
|
cdef int64_t new_index = self._entries.size()
|
||||||
self._entries.push_back(
|
self._entries.push_back(
|
||||||
_EntryC(
|
_EntryC(
|
||||||
entity_hash=entity_hash,
|
entity_hash=entity_hash,
|
||||||
|
@ -117,22 +117,22 @@ cdef class KnowledgeBase:
|
||||||
feats_row=feats_row,
|
feats_row=feats_row,
|
||||||
prob=prob
|
prob=prob
|
||||||
))
|
))
|
||||||
self._entry_index[entity_hash] = entity_index
|
self._entry_index[entity_hash] = new_index
|
||||||
return entity_index
|
return new_index
|
||||||
|
|
||||||
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
|
cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs):
|
||||||
"""Connect a mention to a list of potential entities with their prior probabilities ."""
|
"""Connect a mention to a list of potential entities with their prior probabilities ."""
|
||||||
cdef int64_t alias_index = self._aliases_table.size()
|
cdef int64_t new_index = self._aliases_table.size()
|
||||||
|
|
||||||
self._aliases_table.push_back(
|
self._aliases_table.push_back(
|
||||||
_AliasC(
|
_AliasC(
|
||||||
entry_indices=entry_indices,
|
entry_indices=entry_indices,
|
||||||
probs=probs
|
probs=probs
|
||||||
))
|
))
|
||||||
self._alias_index[alias_hash] = alias_index
|
self._alias_index[alias_hash] = new_index
|
||||||
return alias_index
|
return new_index
|
||||||
|
|
||||||
cdef inline create_empty_vectors(self):
|
cdef inline _create_empty_vectors(self):
|
||||||
"""
|
"""
|
||||||
Making sure the first element of each vector is a dummy,
|
Making sure the first element of each vector is a dummy,
|
||||||
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
because the PreshMap maps pointing to indices in these vectors can not contain 0 as value
|
||||||
|
|
|
@ -65,7 +65,7 @@ cdef class KnowledgeBase:
|
||||||
self._alias_index = PreshMap()
|
self._alias_index = PreshMap()
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.strings = StringStore()
|
self.strings = StringStore()
|
||||||
self.create_empty_vectors()
|
self._create_empty_vectors()
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.get_size_entities()
|
return self.get_size_entities()
|
||||||
|
@ -151,4 +151,3 @@ cdef class KnowledgeBase:
|
||||||
prior_prob=prob)
|
prior_prob=prob)
|
||||||
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs)
|
||||||
if entry_index != 0]
|
if entry_index != 0]
|
||||||
|
|
||||||
|
|
|
@ -209,6 +209,10 @@ class Language(object):
|
||||||
def entity(self):
|
def entity(self):
|
||||||
return self.get_pipe("ner")
|
return self.get_pipe("ner")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def linker(self):
|
||||||
|
return self.get_pipe("el")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def matcher(self):
|
def matcher(self):
|
||||||
return self.get_pipe("matcher")
|
return self.get_pipe("matcher")
|
||||||
|
|
|
@ -1045,44 +1045,28 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, nr_class=1, **cfg):
|
def Model(cls, nr_class=1, **cfg):
|
||||||
embed_size = util.env_opt("embed_size", 2000)
|
# TODO: non-dummy EL implementation
|
||||||
if "token_vector_width" in cfg:
|
return None
|
||||||
token_vector_width = cfg["token_vector_width"]
|
|
||||||
else:
|
|
||||||
token_vector_width = util.env_opt("token_vector_width", 96)
|
|
||||||
if cfg.get('architecture') == 'simple_cnn':
|
|
||||||
tok2vec = Tok2Vec(token_vector_width, embed_size, **cfg)
|
|
||||||
return None # build_simple_cnn_text_classifier(tok2vec, nr_class, **cfg)
|
|
||||||
else:
|
|
||||||
return None # build_text_classifier(nr_class, **cfg)
|
|
||||||
|
|
||||||
|
def __init__(self, model=True, **cfg):
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
self.model = False
|
||||||
self.vocab = vocab
|
|
||||||
self.model = model
|
|
||||||
self._rehearsal_model = None
|
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
|
self.kb = self.cfg["kb"]
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
# scores, tensors = self.predict([doc])
|
self.set_annotations([doc], scores=None, tensors=None)
|
||||||
scores, tensors = None, None
|
|
||||||
self.set_annotations([doc], scores, tensors=tensors)
|
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
"""Apply the pipe to a stream of documents.
|
||||||
|
Both __call__ and pipe should delegate to the `predict()`
|
||||||
|
and `set_annotations()` methods.
|
||||||
|
"""
|
||||||
for docs in util.minibatch(stream, size=batch_size):
|
for docs in util.minibatch(stream, size=batch_size):
|
||||||
docs = list(docs)
|
docs = list(docs)
|
||||||
scores, tensors = self.predict(docs)
|
self.set_annotations(docs, scores=None, tensors=None)
|
||||||
self.set_annotations(docs, scores, tensors=tensors)
|
|
||||||
yield from docs
|
yield from docs
|
||||||
|
|
||||||
def predict(self, docs):
|
|
||||||
# self.require_model()
|
|
||||||
scores = self.model(docs)
|
|
||||||
scores = self.model.ops.asarray(scores)
|
|
||||||
tensors = [doc.tensor for doc in docs]
|
|
||||||
return scores, tensors
|
|
||||||
|
|
||||||
def set_annotations(self, docs, scores, tensors=None):
|
def set_annotations(self, docs, scores, tensors=None):
|
||||||
# TODO Sofie: actually implement this class instead of dummy implementation
|
# TODO Sofie: actually implement this class instead of dummy implementation
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
|
@ -1091,67 +1075,13 @@ class EntityLinker(Pipe):
|
||||||
for token in ent:
|
for token in ent:
|
||||||
token.ent_kb_id_ = "Q42"
|
token.ent_kb_id_ = "Q42"
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
|
||||||
loss, d_scores = self.get_loss(docs, golds, scores)
|
|
||||||
bp_scores(d_scores, sgd=sgd)
|
|
||||||
if losses is not None:
|
|
||||||
losses.setdefault(self.name, 0.0)
|
|
||||||
losses[self.name] += loss
|
|
||||||
|
|
||||||
def rehearse(self, docs, drop=0., sgd=None, losses=None):
|
|
||||||
if self._rehearsal_model is None:
|
|
||||||
return
|
|
||||||
scores, bp_scores = self.model.begin_update(docs, drop=drop)
|
|
||||||
target = self._rehearsal_model(docs)
|
|
||||||
gradient = scores - target
|
|
||||||
bp_scores(gradient, sgd=sgd)
|
|
||||||
if losses is not None:
|
|
||||||
losses.setdefault(self.name, 0.0)
|
|
||||||
losses[self.name] += (gradient**2).sum()
|
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, scores):
|
||||||
truths = numpy.zeros((len(golds), len(self.labels)), dtype='f')
|
# TODO
|
||||||
not_missing = numpy.ones((len(golds), len(self.labels)), dtype='f')
|
pass
|
||||||
for i, gold in enumerate(golds):
|
|
||||||
for j, label in enumerate(self.labels):
|
|
||||||
if label in gold.cats:
|
|
||||||
truths[i, j] = gold.cats[label]
|
|
||||||
else:
|
|
||||||
not_missing[i, j] = 0.
|
|
||||||
truths = self.model.ops.asarray(truths)
|
|
||||||
not_missing = self.model.ops.asarray(not_missing)
|
|
||||||
d_scores = (scores-truths) / scores.shape[0]
|
|
||||||
d_scores *= not_missing
|
|
||||||
mean_square_error = (d_scores**2).sum(axis=1).mean()
|
|
||||||
return float(mean_square_error), d_scores
|
|
||||||
|
|
||||||
def add_label(self, label):
|
def add_label(self, label):
|
||||||
if label in self.labels:
|
# TODO
|
||||||
return 0
|
pass
|
||||||
if self.model not in (None, True, False):
|
|
||||||
# This functionality was available previously, but was broken.
|
|
||||||
# The problem is that we resize the last layer, but the last layer
|
|
||||||
# is actually just an ensemble. We're not resizing the child layers
|
|
||||||
# -- a huge problem.
|
|
||||||
raise ValueError(Errors.E116)
|
|
||||||
#smaller = self.model._layers[-1]
|
|
||||||
#larger = Affine(len(self.labels)+1, smaller.nI)
|
|
||||||
#copy_array(larger.W[:smaller.nO], smaller.W)
|
|
||||||
#copy_array(larger.b[:smaller.nO], smaller.b)
|
|
||||||
#self.model._layers[-1] = larger
|
|
||||||
self.labels = tuple(list(self.labels) + [label])
|
|
||||||
return 1
|
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
|
||||||
**kwargs):
|
|
||||||
if self.model is True:
|
|
||||||
self.cfg['pretrained_vectors'] = kwargs.get('pretrained_vectors')
|
|
||||||
self.model = self.Model(len(self.labels), **self.cfg)
|
|
||||||
link_vectors_to_models(self.vocab)
|
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
|
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer', 'EntityLinker']
|
||||||
|
|
|
@ -37,16 +37,14 @@ def create_kb():
|
||||||
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
|
print("kb size", len(mykb), mykb.get_size_entities(), mykb.get_size_aliases())
|
||||||
print()
|
print()
|
||||||
|
|
||||||
for alias in [alias1, "rubbish", alias3]:
|
return mykb
|
||||||
candidates = mykb.get_candidates(alias)
|
|
||||||
print(len(candidates), "candidates for", alias)
|
|
||||||
|
|
||||||
|
|
||||||
def add_el():
|
def add_el(kb):
|
||||||
nlp = spacy.load('en_core_web_sm')
|
nlp = spacy.load('en_core_web_sm')
|
||||||
print("pipes before:", nlp.pipe_names)
|
print("pipes before:", nlp.pipe_names)
|
||||||
|
|
||||||
el_pipe = nlp.create_pipe(name='el')
|
el_pipe = nlp.create_pipe(name='el', config={"kb": kb})
|
||||||
nlp.add_pipe(el_pipe, last=True)
|
nlp.add_pipe(el_pipe, last=True)
|
||||||
|
|
||||||
print("pipes after:", nlp.pipe_names)
|
print("pipes after:", nlp.pipe_names)
|
||||||
|
@ -62,7 +60,12 @@ def add_el():
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
print("ent", ent.text, ent.label_, ent.kb_id_)
|
print("ent", ent.text, ent.label_, ent.kb_id_)
|
||||||
|
|
||||||
|
print()
|
||||||
|
for alias in ["douglassss", "rubbish", "adam"]:
|
||||||
|
candidates = nlp.linker.kb.get_candidates(alias)
|
||||||
|
print(len(candidates), "candidates for", alias)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# add_el()
|
mykb = create_kb()
|
||||||
create_kb()
|
add_el(mykb)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user