implementing el pipe in pipes.pyx (not tested yet)

svlandeg 2019-06-03 21:32:54 +02:00
parent d83a1e3052
commit fb37cdb2d3
4 changed files with 160 additions and 41 deletions

View File

@@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN
# from spacy.cli.pretrain import get_cossim_loss
from spacy.matcher import PhraseMatcher
""" TODO: this code needs to be implemented in pipes.pyx"""
class EL_Model:
@@ -45,7 +43,7 @@ class EL_Model:
DROP = 0.1
LEARN_RATE = 0.001
EPOCHS = 20
EPOCHS = 5
L2 = 1e-6
name = "entity_linker"
@@ -213,8 +211,7 @@ class EL_Model:
if avg:
with self.article_encoder.use_params(self.sgd_article.averages) \
and self.desc_encoder.use_params(self.sgd_desc.averages)\
and self.sent_encoder.use_params(self.sgd_sent.averages)\
and self.cont_encoder.use_params(self.sgd_cont.averages):
and self.sent_encoder.use_params(self.sgd_sent.averages):
desc_encodings = self.desc_encoder(desc_docs)
doc_encoding = self.article_encoder([article_doc])
sent_encoding = self.sent_encoder([sent_doc])
@@ -226,7 +223,13 @@ class EL_Model:
concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
if avg:
with self.cont_encoder.use_params(self.sgd_cont.averages):
cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
else:
cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
context_enc = np.transpose(cont_encodings)
highest_sim = -5
@@ -298,8 +301,8 @@ class EL_Model:
self.sgd_desc.learn_rate = self.LEARN_RATE
self.sgd_desc.L2 = self.L2
def get_loss(self, v1, v2, targets):
loss, gradients = self.get_cossim_loss(v1, v2, targets)
def get_loss(self, pred, gold, targets):
loss, gradients = self.get_cossim_loss(pred, gold, targets)
return loss, gradients
def get_cossim_loss(self, yh, y, t):
@@ -327,8 +330,6 @@ class EL_Model:
return loss, -inverse
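Note: for readers without spacy.cli.pretrain at hand, a minimal numpy sketch of a cosine-similarity loss of this shape (names and details are illustrative, not the exact pretrain implementation):

import numpy as np

def cossim_loss_sketch(yh, y):
    # row-wise cosine similarity: (yh . y) / (|yh| * |y|)
    norm_yh = np.linalg.norm(yh, axis=-1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=-1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cosine = (yh * y).sum(axis=-1, keepdims=True) / mul_norms
    # derivative of the cosine w.r.t. the prediction yh
    d_yh = (y / mul_norms) - (cosine * (yh / norm_yh ** 2))
    # loss is 0 when prediction and target point in the same direction
    loss = np.abs(cosine - 1).sum()
    return loss, -d_yh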
def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
all_clusters = list(entity_clusters.keys())
arts_list = list()
sents_list = list()
descs_list = list()

View File

@@ -111,7 +111,7 @@ if __name__ == "__main__":
print("STEP 6: training", datetime.datetime.now())
my_nlp = spacy.load('en_core_web_md')
trainer = EL_Model(kb=my_kb, nlp=my_nlp)
trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20)
print()
# STEP 7: apply the EL algorithm on the dev dataset

View File

@@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
return model
def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
conv_depth = cfg.get("conv_depth", 2)
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
with Model.define_operators({">>": chain, "**": clone}):
convolution = Residual((ExtractWindow(nW=1) >>
LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
encoder = SpacyVectors \
>> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \
>> flatten_add_lengths \
>> ParametricAttention(hidden_width) \
>> Pooling(mean_pool) \
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
>> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
# TODO: ReLU or LN(Maxout) ?
# sum_pool or mean_pool ?
encoder.nO = end_width
return encoder
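Note: ParametricAttention followed by Pooling(mean_pool) squashes a variable-length sequence of token vectors into one fixed-width vector per doc. A toy numpy sketch of attention-weighted pooling (illustrative only, not thinc's exact computation):

import numpy as np

def attention_pool(token_vecs, attn_w):
    # token_vecs: (n_tokens, width); attn_w: (width,) learned attention vector
    scores = token_vecs @ attn_w          # one relevance score per token
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()              # softmax over the tokens
    return weights @ token_vecs           # weighted average, shape (width,)

pooled = attention_pool(np.random.rand(7, 32), np.random.rand(32))
assert pooled.shape == (32,)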
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops

View File

@@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical, copy_array
from spacy.cli.pretrain import get_cossim_loss
from contextlib import contextmanager
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
@@ -23,14 +25,17 @@ from ..vocab cimport Vocab
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
from .._ml import Tok2Vec, build_tagger_model
from .._ml import Tok2Vec, build_tagger_model, cosine
from .._ml import build_text_classifier, build_simple_cnn_text_classifier
from .._ml import build_bow_text_classifier
from .._ml import build_bow_text_classifier, build_nel_encoder
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer
from ..errors import Errors, TempErrors
from .. import util
# TODO: remove
from examples.pipeline.wiki_entity_linking import kb_creator
def _load_cfg(path):
if path.exists():
@@ -1065,50 +1070,141 @@ class EntityLinker(Pipe):
name = 'entity_linker'
@classmethod
def Model(cls, nr_class=1, **cfg):
# TODO: non-dummy EL implementation
return None
def Model(cls, **cfg):
embed_width = cfg.get("embed_width", 300)
hidden_width = cfg.get("hidden_width", 32)
entity_width = cfg.get("entity_width", 64)
article_width = cfg.get("article_width", 128)
sent_width = cfg.get("sent_width", 64)
entity_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width)
article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width)
sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width)
# the mention encoder's output width needs to match the entity encoder's,
# and its input is the concatenated article + sentence encoding
mention_width = entity_encoder.nO
mention_encoder = Affine(mention_width, article_width + sent_width, drop_factor=0.0)
return entity_encoder, article_encoder, sent_encoder, mention_encoder
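Note: a quick sanity check of the width bookkeeping with the defaults above. The mention encoder consumes the concatenated article + sentence encoding and must emit entity_width columns so its output can be compared against the entity encodings:

import numpy as np

article_width, sent_width, entity_width = 128, 64, 64
doc_enc = np.zeros((1, article_width))
sent_enc = np.zeros((1, sent_width))
concat = np.hstack([doc_enc, sent_enc])
assert concat.shape == (1, article_width + sent_width)  # mention encoder input: (1, 192)
# the mention encoder then maps (1, 192) -> (1, entity_width) for the cosine comparison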
def __init__(self, **cfg):
# TODO: bring-your-own-model
self.mention_encoder = True
def __init__(self, model=True, **cfg):
self.model = False
self.cfg = dict(cfg)
self.kb = self.cfg["kb"]
# TODO: fix this. store entity vectors in the KB ?
self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
@contextmanager
def use_avg_params(self):
    """Modify the pipe's encoders/models, to use their average parameter values."""
    with self.article_encoder.use_params(self.sgd_article.averages), \
            self.entity_encoder.use_params(self.sgd_entity.averages), \
            self.sent_encoder.use_params(self.sgd_sent.averages), \
            self.mention_encoder.use_params(self.sgd_mention.averages):
        yield
def require_model(self):
"""Raise an error if the component's model is not initialized."""
if getattr(self, "mention_encoder", None) in (None, True, False):
raise ValueError(Errors.E109.format(name=self.name))
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
if self.mention_encoder is True:
self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
self.sgd_article = create_default_optimizer(self.article_encoder.ops)
self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
""" docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
self.require_model()
entity_docs, article_docs, sentence_docs = docs
if isinstance(entity_docs, Doc):
    entity_docs = [entity_docs]
    article_docs = [article_docs]
    sentence_docs = [sentence_docs]
assert len(entity_docs) == len(article_docs) == len(sentence_docs)
entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
range(len(article_docs))]
mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop)
loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
mention_gradient = bp_cont(d_scores, sgd=self.sgd_mention)
# gradient : concat (doc+sent) vs. entity
sent_start = self.article_encoder.nO
sent_gradients = list()
doc_gradients = list()
for x in mention_gradient:
doc_gradients.append(list(x[0:sent_start]))
sent_gradients.append(list(x[sent_start:]))
bp_doc(doc_gradients, sgd=self.sgd_article)
bp_sent(sent_gradients, sgd=self.sgd_sent)
if losses is not None:
losses.setdefault(self.name, 0.0)
losses[self.name] += loss
return loss
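Note: the loop above splits each row of the mention gradient back into its article and sentence parts; the same slicing in vectorized numpy form (toy widths):

import numpy as np

mention_gradient = np.arange(14.0).reshape(2, 7)  # 2 mentions, article_width=4 + sent_width=3
sent_start = 4                                    # = self.article_encoder.nO
doc_gradients = mention_gradient[:, :sent_start]  # columns for the article encoder
sent_gradients = mention_gradient[:, sent_start:] # columns for the sentence encoder
assert doc_gradients.shape == (2, 4) and sent_gradients.shape == (2, 3)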
def get_loss(self, docs, golds, scores):
loss, gradients = get_cossim_loss(scores, golds)
return loss, gradients
def __call__(self, doc):
self.set_annotations([doc], scores=None, tensors=None)
entities, kb_ids = self.predict([doc])
self.set_annotations([doc], entities, kb_ids)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
"""Apply the pipe to a stream of documents.
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
for docs in util.minibatch(stream, size=batch_size):
docs = list(docs)
self.set_annotations(docs, scores=None, tensors=None)
entities, kb_ids = self.predict(docs)
self.set_annotations(docs, entities, kb_ids)
yield from docs
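Note: util.minibatch chunks the stream so that predict() and set_annotations() see fixed-size batches; a stand-in with the same contract (hypothetical helper, spaCy's own util.minibatch is more featureful):

def minibatch_sketch(items, size):
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == size:
            yield batch
            batch = []
    if batch:              # don't drop the trailing partial batch
        yield batch

assert list(minibatch_sketch(range(5), 2)) == [[0, 1], [2, 3], [4]]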
def set_annotations(self, docs, scores, tensors=None):
"""
Currently implemented as taking the KB entry with highest prior probability for each named entity
TODO: actually use context etc
"""
for i, doc in enumerate(docs):
for ent in doc.ents:
def predict(self, docs):
    self.require_model()
    final_entities = list()
    final_kb_ids = list()
    for article_doc in docs:
doc_encoding = self.article_encoder([article_doc])
for ent in article_doc.ents:
sent_doc = ent.sent.as_doc()
sent_encoding = self.sent_encoder([sent_doc])
concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]]))
mention_enc_t = np.transpose(mention_encoding)
candidates = self.kb.get_candidates(ent.text)
if candidates:
best_candidate = max(candidates, key=lambda c: c.prior_prob)
for token in ent:
token.ent_kb_id_ = best_candidate.entity_
highest_sim = -5  # sentinel below the cosine minimum of -1
best_candidate = None
with self.use_avg_params():
    for c in candidates:
        kb_id = c.entity_
        description = self.id_to_descr.get(kb_id)
        if description is None:
            continue  # no stored description for this entity
        entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
        sim = cosine(entity_encodings, mention_enc_t)
        if sim >= highest_sim:
            best_candidate = c
            highest_sim = sim
if best_candidate is not None:
    final_entities.append(ent)
    final_kb_ids.append(best_candidate.entity_)
return final_entities, final_kb_ids  # method level: runs after all docs are processed
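Note: the selection above is an argmax over cosine similarities between the mention encoding and each candidate's description encoding; a self-contained toy version of that ranking (the kb_ids Q1/Q2 are hypothetical):

import numpy as np

def cosine_sim(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

mention = np.array([0.9, 0.1, 0.0])
candidates = {"Q1": np.array([1.0, 0.0, 0.0]), "Q2": np.array([0.0, 1.0, 0.0])}
best_id = max(candidates, key=lambda kb_id: cosine_sim(candidates[kb_id], mention))
assert best_id == "Q1"  # Q1's description vector is most similar to the mention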
def get_loss(self, docs, golds, scores):
# TODO
pass
def add_label(self, label):
# TODO
pass
# TODO best_candidate = max(candidates, key=lambda c: c.prior_prob)
def set_annotations(self, docs, entities, kb_ids=None):
    for entity, kb_id in zip(entities, kb_ids):
        for token in entity:
            token.ent_kb_id_ = kb_id
class Sentencizer(object):
"""Segment the Doc into sentences using a rule-based strategy.