From 83dc7b46fd1b39023c6eb883471c961d9e5bd51c Mon Sep 17 00:00:00 2001
From: svlandeg
Date: Mon, 10 Jun 2019 21:25:26 +0200
Subject: [PATCH] first tests with EL pipe

---
 .../wiki_entity_linking/kb_creator.py         |  4 +--
 .../wiki_entity_linking/train_descriptions.py |  4 +--
 .../training_set_creator.py                   |  4 +--
 .../wiki_entity_linking/wiki_nel_pipeline.py  | 19 +++++-----
 spacy/pipeline/pipes.pyx                      | 36 ++++++++++++++-----
 5 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py
index ee632bd48..e7e3d077d 100644
--- a/examples/pipeline/wiki_entity_linking/kb_creator.py
+++ b/examples/pipeline/wiki_entity_linking/kb_creator.py
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:34200]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:34200]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
 
diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py
index f2c3fa05d..e1a2f1797 100644
--- a/examples/pipeline/wiki_entity_linking/train_descriptions.py
+++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py
@@ -17,7 +17,7 @@ class EntityEncoder:
 
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.9  # 0.1
+    STOP_THRESHOLD = 0.1
 
     BATCH_SIZE = 1000
 
@@ -32,7 +32,7 @@ class EntityEncoder:
 
         print("Encoding", len(description_list), "entities")
 
-        batch_size = 10000
+        batch_size = 100000
 
         start = 0
         stop = min(batch_size, len(description_list))
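
For context on the `start`/`stop` bookkeeping that the hunk above sets up: the encoder walks the description list in fixed-size slices. A minimal, self-contained sketch of that chunking pattern, assuming the caller supplies the actual encoding function (`encode_in_chunks` and `encode_chunk` are illustrative names, not helpers from this file):

```python
def encode_in_chunks(description_list, encode_chunk, batch_size=100000):
    # Walk the list in fixed-size slices so a very large description set
    # never has to be encoded in one go.
    encodings = []
    start = 0
    stop = min(batch_size, len(description_list))
    while start < len(description_list):
        encodings.extend(encode_chunk(description_list[start:stop]))
        start = stop
        stop = min(stop + batch_size, len(description_list))
    return encodings


# Dummy usage: "encode" each description by its length, two items per chunk.
print(encode_in_chunks(["a", "bb", "ccc"], lambda chunk: [len(d) for d in chunk], batch_size=2))
```
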
diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py
index 156bce05f..38a86058d 100644
--- a/examples/pipeline/wiki_entity_linking/training_set_creator.py
+++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py
@@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr
     return correct_entries_per_article, incorrect_entries_per_article
 
 
-def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print):
+def read_training(nlp, training_dir, dev, limit, to_print):
     correct_entries, incorrect_entries = read_training_entities(training_output=training_dir,
                                                                 collect_correct=True,
                                                                 collect_incorrect=True)
@@ -306,7 +306,6 @@ def read_training(nlp, training_dir, dev, limit, to_pri
     data = []
 
     cnt = 0
-    next_entity_nr = 1
     files = listdir(training_dir)
     for f in files:
         if not limit or cnt < limit:
@@ -320,7 +319,6 @@ def read_training(nlp, training_dir, dev, limit, to_pri
             with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                 text = file.read()
                 article_doc = nlp(text)
-                truncated_text = text[0:min(doc_cutoff, len(text))]
 
                 gold_entities = list()
 
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
index ded4bdc24..4be1ae2fb 100644
--- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
+++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py
@@ -121,15 +121,16 @@ if __name__ == "__main__":
     if train_pipe:
         id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)
 
-        train_data = training_set_creator.read_training(nlp=nlp,
-                                                        training_dir=TRAINING_DIR,
-                                                        id_to_descr=id_to_descr,
-                                                        doc_cutoff=DOC_CHAR_CUTOFF,
-                                                        dev=False,
-                                                        limit=100,
-                                                        to_print=False)
+        train_limit = 10
+        print("Training on", train_limit, "articles")
 
-        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        train_data = training_set_creator.read_training(nlp=nlp,
+                                                        training_dir=TRAINING_DIR,
+                                                        dev=False,
+                                                        limit=train_limit,
+                                                        to_print=False)
+
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
         nlp.add_pipe(el_pipe, last=True)
 
         other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -141,7 +142,7 @@ if __name__ == "__main__":
             print("EPOCH", itn)
             random.shuffle(train_data)
             losses = {}
-            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(
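
The training loop above grows its batch size on a compounding schedule rather than keeping it fixed. A small illustration of how `compounding(4.0, 128.0, 1.001)` drives `minibatch` (both helpers live in `spacy.util` in spaCy 2.x):

```python
from spacy.util import minibatch, compounding

# compounding yields 4.0, 4.004, 4.008, ..., multiplying by 1.001 on each
# draw and capping at 128.0; minibatch truncates each value to an int size.
sizes = compounding(4.0, 128.0, 1.001)
for batch in minibatch(range(20), size=sizes):
    print(len(batch), list(batch))
```

At a factor of 1.001 the size creeps from 4 up to the 128 cap only after roughly 3,500 batches, so early updates effectively use small batches and later ones much larger ones.
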
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 01302b618..e5ed2ec23 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -11,9 +11,8 @@ from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical, copy_array
-
-from spacy.cli.pretrain import get_cossim_loss
+from thinc.neural.util import to_categorical
+from thinc.neural.util import get_array_module
 
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util
 
-# TODO: remove
-from examples.pipeline.wiki_entity_linking import kb_creator
-
 
 def _load_cfg(path):
     if path.exists():
@@ -1094,6 +1090,7 @@ class EntityLinker(Pipe):
         self.mention_encoder = True
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
+        self.doc_cutoff = self.cfg["doc_cutoff"]
 
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
@@ -1134,6 +1131,7 @@ class EntityLinker(Pipe):
                 start, end, gold_kb = entity
                 mention = doc[start:end]
                 sentence = mention.sent
+                first_par = doc[0:self.doc_cutoff].as_doc()
 
                 candidates = self.kb.get_candidates(mention.text)
                 for c in candidates:
@@ -1144,7 +1142,7 @@ class EntityLinker(Pipe):
                         entity_encoding = c.entity_vector
                         entity_encodings.append(entity_encoding)
 
-                        article_docs.append(doc)
+                        article_docs.append(first_par)
                         sentence_docs.append(sentence.as_doc())
 
         if len(entity_encodings) > 0:
@@ -1158,6 +1156,10 @@ class EntityLinker(Pipe):
             entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
 
             loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+            # print("scores", mention_encodings)
+            # print("golds", entity_encodings)
+            # print("loss", loss)
+            # print("d_scores", d_scores)
 
             mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention)
 
@@ -1180,9 +1182,26 @@ class EntityLinker(Pipe):
         return 0
 
     def get_loss(self, docs, golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
+        targets = [[1] for _ in golds]  # assuming we're only using positive examples
+        loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets)
+        # loss = loss / len(golds)
         return loss, gradients
 
+    def get_cossim_loss_2(self, yh, y, t):
+        # Add a small constant to avoid 0 vectors
+        yh = yh + 1e-8
+        y = y + 1e-8
+        # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+        xp = get_array_module(yh)
+        norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+        norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+        mul_norms = norm_yh * norm_y
+        cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+        d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))
+        loss = xp.abs(cos - t).sum()
+        inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))])
+        return loss, -inverse
+
     def __call__(self, doc):
         entities, kb_ids = self.predict([doc])
         self.set_annotations([doc], entities, kb_ids)
@@ -1220,6 +1239,7 @@ class EntityLinker(Pipe):
                     score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                     scores.append(score)
 
+                # TODO: thresholding
                 best_index = scores.index(max(scores))
                 best_candidate = candidates[best_index]
                 final_entities.append(ent)
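
The hand-rolled `get_cossim_loss_2` above implements the cosine-similarity derivative linked in its comment. A standalone NumPy sketch of the same computation for the positive-target case (`t = 1`, as `get_loss` assumes), with a finite-difference check that the analytic gradient matches the loss:

```python
import numpy as np

def cossim_loss(yh, y):
    # loss = sum_i (1 - cos(yh_i, y_i)); returns loss and d(loss)/d(yh).
    yh = yh + 1e-8  # avoid zero vectors
    y = y + 1e-8
    norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
    norm_y = np.linalg.norm(y, axis=1, keepdims=True)
    mul_norms = norm_yh * norm_y
    cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
    d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))  # d(cos)/d(yh)
    return np.abs(cos - 1.0).sum(), -d_yh  # sign flips: loss falls as cos rises

rng = np.random.RandomState(0)
yh = rng.rand(3, 5)
y = rng.rand(3, 5)
loss, grad = cossim_loss(yh, y)

# Finite-difference check on a single coordinate.
eps = 1e-6
yh_eps = yh.copy()
yh_eps[0, 0] += eps
loss_eps, _ = cossim_loss(yh_eps, y)
print(grad[0, 0], (loss_eps - loss) / eps)  # the two numbers should agree closely
```

Separately, the candidate score `prior_prob + sim - (prior_prob*sim)` in `predict` is a probabilistic OR: it equals `1 - (1 - prior_prob) * (1 - sim)`, so when both inputs lie in [0, 1] the combined score stays in [0, 1], and either a strong prior or a strong similarity alone can lift it.
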