implementing el pipe in pipes.pyx (not tested yet)
This commit is contained in:
parent d83a1e3052
commit fb37cdb2d3
@@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN
 # from spacy.cli.pretrain import get_cossim_loss
 from spacy.matcher import PhraseMatcher

-""" TODO: this code needs to be implemented in pipes.pyx"""
-

 class EL_Model:

@@ -45,7 +43,7 @@ class EL_Model:

     DROP = 0.1
     LEARN_RATE = 0.001
-    EPOCHS = 20
+    EPOCHS = 5
     L2 = 1e-6

     name = "entity_linker"
@@ -213,8 +211,7 @@ class EL_Model:
         if avg:
             with self.article_encoder.use_params(self.sgd_article.averages) \
                  and self.desc_encoder.use_params(self.sgd_desc.averages)\
-                 and self.sent_encoder.use_params(self.sgd_sent.averages)\
-                 and self.cont_encoder.use_params(self.sgd_cont.averages):
+                 and self.sent_encoder.use_params(self.sgd_sent.averages):
                 desc_encodings = self.desc_encoder(desc_docs)
                 doc_encoding = self.article_encoder([article_doc])
                 sent_encoding = self.sent_encoder([sent_doc])
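A side note on the averaging pattern above: chaining several `use_params(...)` calls with `and` inside one `with` statement only enters the last context manager, because `a and b` evaluates to a single object before `with` ever sees it. A minimal, self-contained illustration of that behaviour (nothing here is part of the commit):

from contextlib import contextmanager

@contextmanager
def use_params(name):
    print("enter", name)
    yield
    print("exit", name)

# Prints only "enter b" / "exit b": `use_params("a") and use_params("b")`
# evaluates to the second manager, and `with` enters just that one.
with use_params("a") and use_params("b"):
    pass

contextlib.ExitStack or nested `with` blocks are the usual way to apply several sets of averaged parameters at once.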
@@ -226,7 +223,13 @@ class EL_Model:

         concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]

-        cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+        if avg:
+            with self.cont_encoder.use_params(self.sgd_cont.averages):
+                cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
+        else:
+            cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]]))
+
         context_enc = np.transpose(cont_encodings)

         highest_sim = -5
@@ -298,8 +301,8 @@ class EL_Model:
         self.sgd_desc.learn_rate = self.LEARN_RATE
         self.sgd_desc.L2 = self.L2

-    def get_loss(self, v1, v2, targets):
-        loss, gradients = self.get_cossim_loss(v1, v2, targets)
+    def get_loss(self, pred, gold, targets):
+        loss, gradients = self.get_cossim_loss(pred, gold, targets)
         return loss, gradients

     def get_cossim_loss(self, yh, y, t):
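The cosine-similarity loss used here compares each predicted vector to its gold vector and returns the loss together with a negated gradient, which is what the `return loss, -inverse` line in the next hunk reflects (its name echoes the helper in spacy.cli.pretrain, whose import is commented out at the top of the file). A rough NumPy sketch of that kind of loss, ignoring the `targets` argument and meant purely as illustration:

import numpy as np

def cossim_loss_sketch(yh, y, eps=1e-12):
    # yh: predicted vectors (n, d); y: gold vectors (n, d)
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True) + eps
    norm_y = np.linalg.norm(y, axis=1, keepdims=True) + eps
    cosine = (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)
    loss = np.abs(cosine - 1).sum()  # zero when every pair is perfectly aligned
    # standard derivative of cosine similarity with respect to yh
    d_yh = (y / (norm_yh * norm_y)) - (cosine * yh / norm_yh ** 2)
    return loss, -d_yh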
@@ -327,8 +330,6 @@ class EL_Model:
         return loss, -inverse

     def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
-        all_clusters = list(entity_clusters.keys())
-
         arts_list = list()
         sents_list = list()
         descs_list = list()
@@ -111,7 +111,7 @@ if __name__ == "__main__":
     print("STEP 6: training", datetime.datetime.now())
     my_nlp = spacy.load('en_core_web_md')
     trainer = EL_Model(kb=my_kb, nlp=my_nlp)
-    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500)
+    trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20)
     print()

     # STEP 7: apply the EL algorithm on the dev dataset
spacy/_ml.py | 22
@@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
     return model


+def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
+    conv_depth = cfg.get("conv_depth", 2)
+    cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
+
+    with Model.define_operators({">>": chain, "**": clone}):
+        convolution = Residual((ExtractWindow(nW=1) >>
+                                LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
+
+        encoder = SpacyVectors \
+            >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \
+            >> flatten_add_lengths \
+            >> ParametricAttention(hidden_width) \
+            >> Pooling(mean_pool) \
+            >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
+            >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
+
+        # TODO: ReLu or LN(Maxout) ?
+        # sum_pool or mean_pool ?
+
+    encoder.nO = end_width
+    return encoder
+
 @layerize
 def flatten(seqs, drop=0.0):
     ops = Model.ops
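Taken on its own, the new builder turns a list of Doc objects into one fixed-size vector per document. A hypothetical smoke test of the layer (the loaded pipeline and the chosen widths are assumptions, not part of the commit):

import spacy
from spacy._ml import build_nel_encoder

nlp = spacy.load("en_core_web_md")  # SpacyVectors needs a model with pretrained vectors
encoder = build_nel_encoder(in_width=300, hidden_width=32, end_width=64)

docs = [nlp("Douglas Adams was an English author."),
        nlp("The Hitchhiker's Guide to the Galaxy is a comedy series.")]
encodings = encoder(docs)  # expected: one row of width end_width per Doc
assert encoder.nO == 64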
spacy/pipeline/pipes.pyx

@@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
 from thinc.neural.util import to_categorical, copy_array

+from spacy.cli.pretrain import get_cossim_loss
+
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
 from ..syntax.ner cimport BiluoPushDown
@@ -23,14 +25,17 @@ from ..vocab cimport Vocab
 from ..syntax import nonproj
 from ..attrs import POS, ID
 from ..parts_of_speech import X
-from .._ml import Tok2Vec, build_tagger_model
+from .._ml import Tok2Vec, build_tagger_model, cosine
 from .._ml import build_text_classifier, build_simple_cnn_text_classifier
-from .._ml import build_bow_text_classifier
+from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util

+# TODO: remove
+from examples.pipeline.wiki_entity_linking import kb_creator
+

 def _load_cfg(path):
     if path.exists():
@@ -1065,50 +1070,141 @@ class EntityLinker(Pipe):
     name = 'entity_linker'

     @classmethod
-    def Model(cls, nr_class=1, **cfg):
-        # TODO: non-dummy EL implementation
-        return None
-
-    def __init__(self, model=True, **cfg):
-        self.model = False
+    def Model(cls, **cfg):
+        embed_width = cfg.get("embed_width", 300)
+        hidden_width = cfg.get("hidden_width", 32)
+        entity_width = cfg.get("entity_width", 64)
+        article_width = cfg.get("article_width", 128)
+        sent_width = cfg.get("sent_width", 64)
+
+        entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width)
+
+        article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width)
+        sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width)
+
+        # dimension of the mention encoder needs to match the dimension of the entity encoder
+        mention_width = entity_encoder.nO
+        mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0)
+
+        return entity_encoder, article_encoder, sent_encoder, mention_encoder
+
+    def __init__(self, **cfg):
+        # TODO: bring-your-own-model
+        self.mention_encoder = True
+
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]

+        # TODO: fix this. store entity vectors in the KB ?
+        self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv')
+
+    def use_avg_params(self):
+        """Modify the pipe's encoders/models, to use their average parameter values."""
+        with self.article_encoder.use_params(self.sgd_article.averages) \
+             and self.entity_encoder.use_params(self.sgd_entity.averages)\
+             and self.sent_encoder.use_params(self.sgd_sent.averages) \
+             and self.mention_encoder.use_params(self.sgd_mention.averages):
+            yield
+
+    def require_model(self):
+        """Raise an error if the component's model is not initialized."""
+        if getattr(self, "mention_encoder", None) in (None, True, False):
+            raise ValueError(Errors.E109.format(name=self.name))
+
+    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
+        if self.mention_encoder is True:
+            self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg)
+            self.sgd_article = create_default_optimizer(self.article_encoder.ops)
+            self.sgd_sent = create_default_optimizer(self.sent_encoder.ops)
+            self.sgd_mention = create_default_optimizer(self.mention_encoder.ops)
+            self.sgd_entity = create_default_optimizer(self.entity_encoder.ops)
+
+    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
+        """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """
+        self.require_model()
+
+        entity_docs, article_docs, sentence_docs = docs
+        assert len(entity_docs) == len(article_docs) == len(sentence_docs)
+
+        if isinstance(entity_docs, Doc):
+            entity_docs = [entity_docs]
+            article_docs = [article_docs]
+            sentence_docs = [sentence_docs]
+
+        entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop)
+        doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop)
+        sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop)
+
+        concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in
+                            range(len(article_docs))]
+        mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP)
+
+        loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+
+        mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont)
+
+        # gradient : concat (doc+sent) vs. desc
+        sent_start = self.article_encoder.nO
+        sent_gradients = list()
+        doc_gradients = list()
+        for x in mention_gradient:
+            doc_gradients.append(list(x[0:sent_start]))
+            sent_gradients.append(list(x[sent_start:]))
+
+        bp_doc(doc_gradients, sgd=self.sgd_article)
+        bp_sent(sent_gradients, sgd=self.sgd_sent)
+
+        if losses is not None:
+            losses.setdefault(self.name, 0.0)
+            losses[self.name] += loss
+        return loss
+
+    def get_loss(self, docs, golds, scores):
+        loss, gradients = get_cossim_loss(scores, golds)
+        return loss, gradients
+
     def __call__(self, doc):
-        self.set_annotations([doc], scores=None, tensors=None)
+        entities, kb_ids = self.predict([doc])
+        self.set_annotations([doc], entities, kb_ids)
         return doc

     def pipe(self, stream, batch_size=128, n_threads=-1):
-        """Apply the pipe to a stream of documents.
-        Both __call__ and pipe should delegate to the `predict()`
-        and `set_annotations()` methods.
-        """
         for docs in util.minibatch(stream, size=batch_size):
             docs = list(docs)
-            self.set_annotations(docs, scores=None, tensors=None)
+            entities, kb_ids = self.predict(docs)
+            self.set_annotations(docs, entities, kb_ids)
             yield from docs

-    def set_annotations(self, docs, scores, tensors=None):
-        """
-        Currently implemented as taking the KB entry with highest prior probability for each named entity
-        TODO: actually use context etc
-        """
-        for i, doc in enumerate(docs):
-            for ent in doc.ents:
+    def predict(self, docs):
+        self.require_model()
+        for i, article_doc in enumerate(docs):
+            doc_encoding = self.article_encoder([article_doc])
+            for ent in article_doc.ents:
+                sent_doc = ent.sent.as_doc()
+                sent_encoding = self.sent_encoder([sent_doc])
+                concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])]
+                mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]]))
+                mention_enc_t = np.transpose(mention_encoding)
+
                 candidates = self.kb.get_candidates(ent.text)
                 if candidates:
-                    best_candidate = max(candidates, key=lambda c: c.prior_prob)
-                    for token in ent:
-                        token.ent_kb_id_ = best_candidate.entity_
-
-    def get_loss(self, docs, golds, scores):
-        # TODO
-        pass
-
-    def add_label(self, label):
-        # TODO
-        pass
+                    highest_sim = -5
+                    best_i = -1
+                    with self.use_avg_params:
+                        for c in candidates:
+                            kb_id = c.entity_
+                            description = self.id_to_descr.get(kb_id)
+                            entity_encodings = self.entity_encoder([description])  # TODO: static entity vectors ?
+                            sim = cosine(entity_encodings, mention_enc_t)
+                            if sim >= highest_sim:
+                                best_i = i
+                                highest_sim = sim
+
+                    # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob)
+
+    def set_annotations(self, docs, entities, kb_ids=None):
+        for token, kb_id in zip(entities, kb_ids):
+            token.ent_kb_id_ = kb_id


 class Sentencizer(object):
     """Segment the Doc into sentences using a rule-based strategy.
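One detail of the new `update()` worth spelling out: the mention encoder consumes the concatenation [article encoding | sentence encoding], so its input gradient has to be sliced back into an article part and a sentence part before `bp_doc` and `bp_sent` can be called. A small, self-contained illustration of that slicing (the widths are made up; in the diff, `sent_start` is `self.article_encoder.nO`):

import numpy as np

article_width, sent_width = 128, 64
mention_gradient = np.random.rand(2, article_width + sent_width)  # one row per mention

sent_start = article_width
doc_gradients = [list(x[0:sent_start]) for x in mention_gradient]
sent_gradients = [list(x[sent_start:]) for x in mention_gradient]

assert len(doc_gradients[0]) == article_width
assert len(sent_gradients[0]) == sent_width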