mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
context encoder with Tok2Vec + linking model instead of cosine
This commit is contained in:
parent
dbc53b9870
commit
68a0662019
|
@ -33,7 +33,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ,
|
||||||
else:
|
else:
|
||||||
# read the mappings from file
|
# read the mappings from file
|
||||||
title_to_id = get_entity_to_id(entity_def_output)
|
title_to_id = get_entity_to_id(entity_def_output)
|
||||||
id_to_descr = _get_id_to_description(entity_descr_output)
|
id_to_descr = get_id_to_description(entity_descr_output)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print(" * _get_entity_frequencies", datetime.datetime.now())
|
print(" * _get_entity_frequencies", datetime.datetime.now())
|
||||||
|
@ -109,7 +109,7 @@ def get_entity_to_id(entity_def_output):
|
||||||
return entity_to_id
|
return entity_to_id
|
||||||
|
|
||||||
|
|
||||||
def _get_id_to_description(entity_descr_output):
|
def get_id_to_description(entity_descr_output):
|
||||||
id_to_desc = dict()
|
id_to_desc = dict()
|
||||||
with open(entity_descr_output, 'r', encoding='utf8') as csvfile:
|
with open(entity_descr_output, 'r', encoding='utf8') as csvfile:
|
||||||
csvreader = csv.reader(csvfile, delimiter='|')
|
csvreader = csv.reader(csvfile, delimiter='|')
|
||||||
|
|
|
@ -14,7 +14,7 @@ from thinc.neural._classes.affine import Affine
|
||||||
class EntityEncoder:
|
class EntityEncoder:
|
||||||
"""
|
"""
|
||||||
Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
|
Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D).
|
||||||
This entity vector will be stored in the KB, and context vectors will be trained to be similar to them.
|
This entity vector will be stored in the KB, for further downstream use in the entity model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DROP = 0
|
DROP = 0
|
||||||
|
@ -97,7 +97,7 @@ class EntityEncoder:
|
||||||
else:
|
else:
|
||||||
indices[i] = 0
|
indices[i] = 0
|
||||||
word_vectors = doc.vocab.vectors.data[indices]
|
word_vectors = doc.vocab.vectors.data[indices]
|
||||||
doc_vector = np.mean(word_vectors, axis=0) # TODO: min? max?
|
doc_vector = np.mean(word_vectors, axis=0)
|
||||||
return doc_vector
|
return doc_vector
|
||||||
|
|
||||||
def _build_network(self, orig_width, hidden_with):
|
def _build_network(self, orig_width, hidden_with):
|
||||||
|
|
|
@ -14,8 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm
|
||||||
Gold-standard entities are stored in one file in standoff format (by character offset).
|
Gold-standard entities are stored in one file in standoff format (by character offset).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# ENTITY_FILE = "gold_entities.csv"
|
ENTITY_FILE = "gold_entities.csv"
|
||||||
ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing
|
|
||||||
|
|
||||||
|
|
||||||
def create_training(wikipedia_input, entity_def_input, training_output):
|
def create_training(wikipedia_input, entity_def_input, training_output):
|
||||||
|
|
|
@ -42,9 +42,10 @@ MIN_PAIR_OCC = 5
|
||||||
|
|
||||||
# model training parameters
|
# model training parameters
|
||||||
EPOCHS = 10
|
EPOCHS = 10
|
||||||
DROPOUT = 0.1
|
DROPOUT = 0.2
|
||||||
LEARN_RATE = 0.005
|
LEARN_RATE = 0.005
|
||||||
L2 = 1e-6
|
L2 = 1e-6
|
||||||
|
CONTEXT_WIDTH=128
|
||||||
|
|
||||||
|
|
||||||
def run_pipeline():
|
def run_pipeline():
|
||||||
|
@ -136,7 +137,8 @@ def run_pipeline():
|
||||||
|
|
||||||
# STEP 6: create and train the entity linking pipe
|
# STEP 6: create and train the entity linking pipe
|
||||||
if train_pipe:
|
if train_pipe:
|
||||||
el_pipe = nlp_2.create_pipe(name='entity_linker', config={})
|
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
||||||
|
el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH})
|
||||||
el_pipe.set_kb(kb_2)
|
el_pipe.set_kb(kb_2)
|
||||||
nlp_2.add_pipe(el_pipe, last=True)
|
nlp_2.add_pipe(el_pipe, last=True)
|
||||||
|
|
||||||
|
@ -146,9 +148,8 @@ def run_pipeline():
|
||||||
optimizer.learn_rate = LEARN_RATE
|
optimizer.learn_rate = LEARN_RATE
|
||||||
optimizer.L2 = L2
|
optimizer.L2 = L2
|
||||||
|
|
||||||
print("STEP 6: training Entity Linking pipe", datetime.datetime.now())
|
|
||||||
# define the size (nr of entities) of training and dev set
|
# define the size (nr of entities) of training and dev set
|
||||||
train_limit = 5000
|
train_limit = 500000
|
||||||
dev_limit = 5000
|
dev_limit = 5000
|
||||||
|
|
||||||
train_data = training_set_creator.read_training(nlp=nlp_2,
|
train_data = training_set_creator.read_training(nlp=nlp_2,
|
||||||
|
|
45
spacy/_ml.py
45
spacy/_ml.py
|
@ -652,37 +652,36 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def build_nel_encoder(in_width, hidden_width, end_width, **cfg):
|
def build_nel_encoder(embed_width, hidden_width, **cfg):
|
||||||
|
# TODO proper error
|
||||||
|
if "entity_width" not in cfg:
|
||||||
|
raise ValueError("entity_width not found")
|
||||||
|
if "context_width" not in cfg:
|
||||||
|
raise ValueError("context_width not found")
|
||||||
|
|
||||||
conv_depth = cfg.get("conv_depth", 2)
|
conv_depth = cfg.get("conv_depth", 2)
|
||||||
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
|
cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3)
|
||||||
pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name
|
pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name
|
||||||
|
context_width = cfg.get("context_width")
|
||||||
tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors,
|
entity_width = cfg.get("entity_width")
|
||||||
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0)
|
|
||||||
|
|
||||||
with Model.define_operators({">>": chain, "**": clone}):
|
with Model.define_operators({">>": chain, "**": clone}):
|
||||||
# convolution = Residual((ExtractWindow(nW=1) >>
|
model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\
|
||||||
# LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces))))
|
>> logistic
|
||||||
|
|
||||||
# encoder = SpacyVectors \
|
# context encoder
|
||||||
# >> with_flatten(Affine(hidden_width, in_width)) \
|
tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors,
|
||||||
# >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \
|
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth,
|
||||||
# >> flatten_add_lengths \
|
bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\
|
||||||
# >> ParametricAttention(hidden_width) \
|
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
|
||||||
# >> Pooling(sum_pool) \
|
>> zero_init(Affine(context_width, hidden_width, drop_factor=0.0))
|
||||||
# >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
|
|
||||||
# >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
|
|
||||||
|
|
||||||
encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\
|
model.tok2vec = tok2vec
|
||||||
>> Residual(zero_init(Maxout(hidden_width, hidden_width))) \
|
|
||||||
>> zero_init(Affine(end_width, hidden_width, drop_factor=0.0))
|
|
||||||
|
|
||||||
# TODO: ReLu or LN(Maxout) ?
|
model.tok2vec = tok2vec
|
||||||
# sum_pool or mean_pool ?
|
model.tok2vec.nO = context_width
|
||||||
|
model.nO = 1
|
||||||
encoder.tok2vec = tok2vec
|
return model
|
||||||
encoder.nO = end_width
|
|
||||||
return encoder
|
|
||||||
|
|
||||||
@layerize
|
@layerize
|
||||||
def flatten(seqs, drop=0.0):
|
def flatten(seqs, drop=0.0):
|
||||||
|
|
|
@ -5,6 +5,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
|
import random
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from thinc.api import chain
|
from thinc.api import chain
|
||||||
from thinc.v2v import Affine, Maxout, Softmax
|
from thinc.v2v import Affine, Maxout, Softmax
|
||||||
|
@ -229,7 +230,7 @@ class Tensorizer(Pipe):
|
||||||
|
|
||||||
vocab (Vocab): A `Vocab` instance. The model must share the same
|
vocab (Vocab): A `Vocab` instance. The model must share the same
|
||||||
`Vocab` instance with the `Doc` objects it will process.
|
`Vocab` instance with the `Doc` objects it will process.
|
||||||
model (Model): A `Model` instance or `True` allocate one later.
|
model (Model): A `Model` instance or `True` to allocate one later.
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
|
|
||||||
EXAMPLE:
|
EXAMPLE:
|
||||||
|
@ -386,7 +387,7 @@ class Tagger(Pipe):
|
||||||
def predict(self, docs):
|
def predict(self, docs):
|
||||||
self.require_model()
|
self.require_model()
|
||||||
if not any(len(doc) for doc in docs):
|
if not any(len(doc) for doc in docs):
|
||||||
# Handle case where there are no tokens in any docs.
|
# Handle cases where there are no tokens in any docs.
|
||||||
n_labels = len(self.labels)
|
n_labels = len(self.labels)
|
||||||
guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
|
guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
|
||||||
tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
|
tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
|
||||||
|
@ -1071,22 +1072,20 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, **cfg):
|
def Model(cls, **cfg):
|
||||||
if "entity_width" not in cfg:
|
|
||||||
raise ValueError("entity_width not found")
|
|
||||||
|
|
||||||
embed_width = cfg.get("embed_width", 300)
|
embed_width = cfg.get("embed_width", 300)
|
||||||
hidden_width = cfg.get("hidden_width", 128)
|
hidden_width = cfg.get("hidden_width", 128)
|
||||||
entity_width = cfg.get("entity_width") # this needs to correspond with the KB entity length
|
|
||||||
|
|
||||||
model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg)
|
model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
def __init__(self, **cfg):
|
def __init__(self, **cfg):
|
||||||
self.model = True
|
self.model = True
|
||||||
self.kb = None
|
self.kb = None
|
||||||
|
self.sgd_context = None
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.context_weight = cfg.get("context_weight", 1)
|
self.context_weight = cfg.get("context_weight", 1)
|
||||||
self.prior_weight = cfg.get("prior_weight", 1)
|
self.prior_weight = cfg.get("prior_weight", 1)
|
||||||
|
self.context_width = cfg.get("context_width")
|
||||||
|
|
||||||
def set_kb(self, kb):
|
def set_kb(self, kb):
|
||||||
self.kb = kb
|
self.kb = kb
|
||||||
|
@ -1107,6 +1106,7 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
|
self.sgd_context = self.create_optimizer()
|
||||||
|
|
||||||
if sgd is None:
|
if sgd is None:
|
||||||
sgd = self.create_optimizer()
|
sgd = self.create_optimizer()
|
||||||
|
@ -1132,35 +1132,55 @@ class EntityLinker(Pipe):
|
||||||
|
|
||||||
context_docs = []
|
context_docs = []
|
||||||
entity_encodings = []
|
entity_encodings = []
|
||||||
|
labels = []
|
||||||
|
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
for entity in gold.links:
|
for entity in gold.links:
|
||||||
start, end, gold_kb = entity
|
start, end, gold_kb = entity
|
||||||
mention = doc.text[start:end]
|
mention = doc.text[start:end]
|
||||||
|
|
||||||
candidates = self.kb.get_candidates(mention)
|
candidates = self.kb.get_candidates(mention)
|
||||||
|
random.shuffle(candidates)
|
||||||
|
nr_neg = 0
|
||||||
for c in candidates:
|
for c in candidates:
|
||||||
kb_id = c.entity_
|
kb_id = c.entity_
|
||||||
# Currently only training on the positive instances
|
|
||||||
if kb_id == gold_kb:
|
if kb_id == gold_kb:
|
||||||
prior_prob = c.prior_prob
|
|
||||||
entity_encoding = c.entity_vector
|
entity_encoding = c.entity_vector
|
||||||
entity_encodings.append(entity_encoding)
|
entity_encodings.append(entity_encoding)
|
||||||
context_docs.append(doc)
|
context_docs.append(doc)
|
||||||
|
labels.append([1])
|
||||||
|
else: # elif nr_neg < 1:
|
||||||
|
nr_neg += 1
|
||||||
|
entity_encoding = c.entity_vector
|
||||||
|
entity_encodings.append(entity_encoding)
|
||||||
|
context_docs.append(doc)
|
||||||
|
labels.append([0])
|
||||||
|
|
||||||
if len(entity_encodings) > 0:
|
if len(entity_encodings) > 0:
|
||||||
context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
|
context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop)
|
||||||
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32")
|
||||||
|
|
||||||
loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None)
|
mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
|
||||||
bp_context(d_scores, sgd=sgd)
|
pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop)
|
||||||
|
labels = self.model.ops.asarray(labels, dtype="float32")
|
||||||
|
|
||||||
|
loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None)
|
||||||
|
mention_gradient = bp_mention(d_scores, sgd=sgd)
|
||||||
|
|
||||||
|
context_gradients = [list(x[0:self.context_width]) for x in mention_gradient]
|
||||||
|
bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context)
|
||||||
|
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return loss
|
return loss
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def get_loss(self, docs, golds, scores):
|
def get_loss(self, docs, golds, prediction):
|
||||||
|
d_scores = (prediction - golds)
|
||||||
|
loss = (d_scores ** 2).sum()
|
||||||
|
loss = loss / len(golds)
|
||||||
|
return loss, d_scores
|
||||||
|
|
||||||
|
def get_loss_old(self, docs, golds, scores):
|
||||||
# this loss function assumes we're only using positive examples
|
# this loss function assumes we're only using positive examples
|
||||||
loss, gradients = get_cossim_loss(yh=scores, y=golds)
|
loss, gradients = get_cossim_loss(yh=scores, y=golds)
|
||||||
loss = loss / len(golds)
|
loss = loss / len(golds)
|
||||||
|
@ -1191,30 +1211,26 @@ class EntityLinker(Pipe):
|
||||||
if isinstance(docs, Doc):
|
if isinstance(docs, Doc):
|
||||||
docs = [docs]
|
docs = [docs]
|
||||||
|
|
||||||
context_encodings = self.model(docs)
|
context_encodings = self.model.tok2vec(docs)
|
||||||
xp = get_array_module(context_encodings)
|
xp = get_array_module(context_encodings)
|
||||||
|
|
||||||
for i, doc in enumerate(docs):
|
for i, doc in enumerate(docs):
|
||||||
if len(doc) > 0:
|
if len(doc) > 0:
|
||||||
context_encoding = context_encodings[i]
|
context_encoding = context_encodings[i]
|
||||||
context_enc_t = context_encoding.T
|
|
||||||
norm_1 = xp.linalg.norm(context_enc_t)
|
|
||||||
for ent in doc.ents:
|
for ent in doc.ents:
|
||||||
candidates = self.kb.get_candidates(ent.text)
|
candidates = self.kb.get_candidates(ent.text)
|
||||||
if candidates:
|
if candidates:
|
||||||
prior_probs = xp.asarray([c.prior_prob for c in candidates])
|
random.shuffle(candidates)
|
||||||
|
prior_probs = xp.asarray([[c.prior_prob] for c in candidates])
|
||||||
prior_probs *= self.prior_weight
|
prior_probs *= self.prior_weight
|
||||||
|
|
||||||
entity_encodings = xp.asarray([c.entity_vector for c in candidates])
|
entity_encodings = xp.asarray([c.entity_vector for c in candidates])
|
||||||
norm_2 = xp.linalg.norm(entity_encodings, axis=1)
|
mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))]
|
||||||
|
predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32"))
|
||||||
# cosine similarity
|
scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions)))
|
||||||
sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
|
|
||||||
sims *= self.context_weight
|
|
||||||
scores = prior_probs + sims - (prior_probs*sims)
|
|
||||||
best_index = scores.argmax()
|
|
||||||
|
|
||||||
# TODO: thresholding
|
# TODO: thresholding
|
||||||
|
best_index = scores.argmax()
|
||||||
best_candidate = candidates[best_index]
|
best_candidate = candidates[best_index]
|
||||||
final_entities.append(ent)
|
final_entities.append(ent)
|
||||||
final_kb_ids.append(best_candidate.entity_)
|
final_kb_ids.append(best_candidate.entity_)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user