first tests with EL pipe

svlandeg 2019-06-10 21:25:26 +02:00
parent 7de1ee69b8
commit 83dc7b46fd
5 changed files with 43 additions and 24 deletions

View File

@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:34200]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:34200]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
     entity_list = [title_to_id[x] for x in title_list]

View File

@@ -17,7 +17,7 @@ class EntityEncoder:
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.9  # 0.1
+    STOP_THRESHOLD = 0.1
     BATCH_SIZE = 1000
@@ -32,7 +32,7 @@ class EntityEncoder:
        print("Encoding", len(description_list), "entities")
-       batch_size = 10000
+       batch_size = 100000
        start = 0
        stop = min(batch_size, len(description_list))
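The batch_size / start / stop lines above drive a simple sliding window over the description list; a minimal sketch of that pattern, where encode_in_batches and apply_encoder are hypothetical names standing in for code not shown in this hunk:

# Sketch (not part of the commit) of the windowed encoding loop implied by
# batch_size / start / stop above. apply_encoder is a placeholder for the
# real per-batch encoding step.
def encode_in_batches(description_list, apply_encoder, batch_size=100000):
    encodings = []
    start = 0
    stop = min(batch_size, len(description_list))
    while start < len(description_list):
        encodings.extend(apply_encoder(description_list[start:stop]))
        start = stop
        stop = min(stop + batch_size, len(description_list))
    return encodings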

View File

@@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr
     return correct_entries_per_article, incorrect_entries_per_article
-def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print):
+def read_training(nlp, training_dir, dev, limit, to_print):
     correct_entries, incorrect_entries = read_training_entities(training_output=training_dir,
                                                                 collect_correct=True,
                                                                 collect_incorrect=True)
@@ -306,7 +306,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri
     data = []
     cnt = 0
-    next_entity_nr = 1
     files = listdir(training_dir)
     for f in files:
         if not limit or cnt < limit:
@@ -320,7 +319,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri
            with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                text = file.read()
                article_doc = nlp(text)
-               truncated_text = text[0:min(doc_cutoff, len(text))]
                gold_entities = list()

View File

@@ -121,15 +121,16 @@ if __name__ == "__main__":
     if train_pipe:
         id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)
-        train_data = training_set_creator.read_training(nlp=nlp,
-                                                         training_dir=TRAINING_DIR,
-                                                         id_to_descr=id_to_descr,
-                                                         doc_cutoff=DOC_CHAR_CUTOFF,
-                                                         dev=False,
-                                                         limit=100,
-                                                         to_print=False)
-        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        train_limit = 10
+        print("Training on", train_limit, "articles")
+        train_data = training_set_creator.read_training(nlp=nlp,
+                                                         training_dir=TRAINING_DIR,
+                                                         dev=False,
+                                                         limit=train_limit,
+                                                         to_print=False)
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
         nlp.add_pipe(el_pipe, last=True)
         other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -141,7 +142,7 @@ if __name__ == "__main__":
            print("EPOCH", itn)
            random.shuffle(train_data)
            losses = {}
-           batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+           batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(
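For reference, compounding(4.0, 128.0, 1.001) from spacy.util yields batch sizes that start at 4 and grow by 0.1% per batch up to a cap, so the change above only raises that cap from 32 to 128 items. A small illustration, not part of the commit:

from itertools import islice
from spacy.util import compounding, minibatch

# The generator yields 4.0, 4.0*1.001, 4.0*1.001**2, ... capped at 128.0.
print(list(islice(compounding(4.0, 128.0, 1.001), 4)))  # approx [4.0, 4.004, 4.008, 4.012]

# minibatch() floors the current value to draw each batch, so early batches hold 4 items.
dummy_data = list(range(10))
print([len(b) for b in minibatch(dummy_data, size=compounding(4.0, 128.0, 1.001))])  # [4, 4, 2]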

View File

@@ -11,9 +11,8 @@ from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical, copy_array
-from spacy.cli.pretrain import get_cossim_loss
+from thinc.neural.util import to_categorical
+from thinc.neural.util import get_array_module
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util
-# TODO: remove
-from examples.pipeline.wiki_entity_linking import kb_creator
 def _load_cfg(path):
     if path.exists():
@@ -1094,6 +1090,7 @@ class EntityLinker(Pipe):
         self.mention_encoder = True
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
+        self.doc_cutoff = self.cfg["doc_cutoff"]
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
@@ -1134,6 +1131,7 @@ class EntityLinker(Pipe):
                start, end, gold_kb = entity
                mention = doc[start:end]
                sentence = mention.sent
+               first_par = doc[0:self.doc_cutoff].as_doc()
                candidates = self.kb.get_candidates(mention.text)
                for c in candidates:
@@ -1144,7 +1142,7 @@ class EntityLinker(Pipe):
                    entity_encoding = c.entity_vector
                    entity_encodings.append(entity_encoding)
-                   article_docs.append(doc)
+                   article_docs.append(first_par)
                    sentence_docs.append(sentence.as_doc())
        if len(entity_encodings) > 0:
@@ -1158,6 +1156,10 @@ class EntityLinker(Pipe):
            entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
            loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+           # print("scores", mention_encodings)
+           # print("golds", entity_encodings)
+           # print("loss", loss)
+           # print("d_scores", d_scores)
            mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention)
@@ -1180,9 +1182,26 @@ class EntityLinker(Pipe):
         return 0
     def get_loss(self, docs, golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
+        targets = [[1] for _ in golds]  # assuming we're only using positive examples
+        loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets)
+        #loss = loss / len(golds)
         return loss, gradients
+    def get_cossim_loss_2(self, yh, y, t):
+        # Add a small constant to avoid 0 vectors
+        yh = yh + 1e-8
+        y = y + 1e-8
+        # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+        xp = get_array_module(yh)
+        norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+        norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+        mul_norms = norm_yh * norm_y
+        cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+        d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))
+        loss = xp.abs(cos - t).sum()
+        inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))])
+        return loss, -inverse
     def __call__(self, doc):
         entities, kb_ids = self.predict([doc])
         self.set_annotations([doc], entities, kb_ids)
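The analytic gradient in get_cossim_loss_2 follows the cosine-similarity derivative from the linked Stack Exchange answer. A quick numerical sanity check, not part of the commit, assuming targets t = 1 so the per-row loss reduces to 1 - cos(yh, y); cos_rows and analytic_grad are illustrative helper names:

import numpy as np

def cos_rows(yh, y):
    # Row-wise cosine similarity between two (n, d) matrices.
    return (yh * y).sum(axis=1) / (np.linalg.norm(yh, axis=1) * np.linalg.norm(y, axis=1))

def analytic_grad(yh, y):
    # Same formula as d_yh in get_cossim_loss_2 above, negated for the loss 1 - cos.
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))  # d cos / d yh
    return -d_yh                                          # d (1 - cos) / d yh

rng = np.random.RandomState(0)
yh, y = rng.rand(3, 5), rng.rand(3, 5)
eps = 1e-6
numeric = np.zeros_like(yh)
for i in range(yh.shape[0]):
    for j in range(yh.shape[1]):
        plus, minus = yh.copy(), yh.copy()
        plus[i, j] += eps
        minus[i, j] -= eps
        # Central finite difference of the per-row loss 1 - cos.
        numeric[i, j] = ((1 - cos_rows(plus, y)[i]) - (1 - cos_rows(minus, y)[i])) / (2 * eps)
print(np.allclose(numeric, analytic_grad(yh, y), atol=1e-5))  # expected: True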
@@ -1220,6 +1239,7 @@ class EntityLinker(Pipe):
                    score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                    scores.append(score)
+               # TODO: thresholding
                best_index = scores.index(max(scores))
                best_candidate = candidates[best_index]
                final_entities.append(ent)
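The candidate score above combines the alias prior and the context similarity with a probabilistic OR, p + s - p*s, which stays in [0, 1] and is at least as large as either signal alone, assuming both inputs lie in [0, 1]. A tiny worked example, not part of the commit:

# Probabilistic-OR combination of prior probability and context similarity,
# mirroring the score line above (combine is an illustrative name).
def combine(prior_prob, sim):
    return prior_prob + sim - prior_prob * sim

print(combine(0.2, 0.5))  # 0.6  -- higher than either signal alone
print(combine(0.0, 0.7))  # 0.7  -- falls back to similarity when there is no prior
print(combine(1.0, 0.1))  # 1.0  -- a certain prior dominates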