first tests with EL pipe

This commit is contained in:
parent 7de1ee69b8
commit 83dc7b46fd
@@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ,
     title_list = list(title_to_id.keys())
 
     # TODO: remove this filter (just for quicker testing of code)
-    title_list = title_list[0:34200]
-    title_to_id = {t: title_to_id[t] for t in title_list}
+    # title_list = title_list[0:34200]
+    # title_to_id = {t: title_to_id[t] for t in title_list}
 
     entity_list = [title_to_id[x] for x in title_list]
 
@@ -17,7 +17,7 @@ class EntityEncoder:
 
     DROP = 0
     EPOCHS = 5
-    STOP_THRESHOLD = 0.9  # 0.1
+    STOP_THRESHOLD = 0.1
 
     BATCH_SIZE = 1000
 
@@ -32,7 +32,7 @@ class EntityEncoder:
 
         print("Encoding", len(description_list), "entities")
 
-        batch_size = 10000
+        batch_size = 100000
 
         start = 0
         stop = min(batch_size, len(description_list))
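The hunk above only shows how the batching window is initialised; the advance step is outside the hunk. As a rough sketch of the windowed batching pattern this sets up (the loop continuation is an assumption, and a toy list stands in for description_list):

description_list = list(range(250))   # stand-in for the real entity descriptions
batch_size = 100

start = 0
stop = min(batch_size, len(description_list))
while start < len(description_list):
    batch = description_list[start:stop]
    print("encoding", len(batch), "descriptions")   # the real code runs the encoder here
    start = stop
    stop = min(stop + batch_size, len(description_list))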
@@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr
     return correct_entries_per_article, incorrect_entries_per_article
 
 
-def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print):
+def read_training(nlp, training_dir, dev, limit, to_print):
     correct_entries, incorrect_entries = read_training_entities(training_output=training_dir,
                                                                 collect_correct=True,
                                                                 collect_incorrect=True)
@@ -306,7 +306,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri
     data = []
 
     cnt = 0
-    next_entity_nr = 1
     files = listdir(training_dir)
     for f in files:
         if not limit or cnt < limit:
@@ -320,7 +319,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri
             with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file:
                 text = file.read()
                 article_doc = nlp(text)
-                truncated_text = text[0:min(doc_cutoff, len(text))]
 
                 gold_entities = list()
 
@@ -121,15 +121,16 @@ if __name__ == "__main__":
     if train_pipe:
         id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR)
 
-        train_data = training_set_creator.read_training(nlp=nlp,
-                                                        training_dir=TRAINING_DIR,
-                                                        id_to_descr=id_to_descr,
-                                                        doc_cutoff=DOC_CHAR_CUTOFF,
-                                                        dev=False,
-                                                        limit=100,
-                                                        to_print=False)
+        train_limit = 10
+        print("Training on", train_limit, "articles")
+
+        train_data = training_set_creator.read_training(nlp=nlp,
+                                                        training_dir=TRAINING_DIR,
+                                                        dev=False,
+                                                        limit=train_limit,
+                                                        to_print=False)
 
-        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb})
+        el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF})
         nlp.add_pipe(el_pipe, last=True)
 
         other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -141,7 +142,7 @@ if __name__ == "__main__":
             print("EPOCH", itn)
             random.shuffle(train_data)
             losses = {}
-            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(
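For context on the size=compounding(4.0, 128.0, 1.001) change above: spacy.util.compounding yields an infinite series that starts at 4.0, is multiplied by 1.001 on each draw, and is capped at 128.0, so batch sizes grow slowly over training. A minimal sketch of the behaviour:

from spacy.util import minibatch, compounding

sizes = compounding(4.0, 128.0, 1.001)
print([int(next(sizes)) for _ in range(5)])   # 4, 4, 4, 4, 4 -> grows very slowly toward 128

# minibatch() consumes the same kind of generator to cut the training data into batches:
train_data = list(range(10))   # stand-in for (doc, gold) pairs
for batch in minibatch(train_data, size=compounding(4.0, 128.0, 1.001)):
    print(len(batch))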
@@ -11,9 +11,8 @@ from collections import OrderedDict
 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.neural.util import to_categorical, copy_array
-
-from spacy.cli.pretrain import get_cossim_loss
+from thinc.neural.util import to_categorical
+from thinc.neural.util import get_array_module
 
 from ..tokens.doc cimport Doc
 from ..syntax.nn_parser cimport Parser
@@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer
 from ..errors import Errors, TempErrors
 from .. import util
 
-# TODO: remove
-from examples.pipeline.wiki_entity_linking import kb_creator
-
 
 def _load_cfg(path):
     if path.exists():
@@ -1094,6 +1090,7 @@ class EntityLinker(Pipe):
         self.mention_encoder = True
         self.cfg = dict(cfg)
         self.kb = self.cfg["kb"]
+        self.doc_cutoff = self.cfg["doc_cutoff"]
 
     def use_avg_params(self):
         """Modify the pipe's encoders/models, to use their average parameter values."""
@@ -1134,6 +1131,7 @@ class EntityLinker(Pipe):
                 start, end, gold_kb = entity
                 mention = doc[start:end]
                 sentence = mention.sent
+                first_par = doc[0:self.doc_cutoff].as_doc()
 
                 candidates = self.kb.get_candidates(mention.text)
                 for c in candidates:
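The added first_par line relies on standard spaCy slicing: doc[0:n] yields a Span, and Span.as_doc() copies it into a standalone Doc that the article encoder can consume. A small illustration (the blank English pipeline and the cutoff value are assumptions for the example):

import spacy

nlp = spacy.blank("en")
doc = nlp("First paragraph used as context. The rest of the article follows.")

doc_cutoff = 6                      # token index, playing the role of self.doc_cutoff above
first_par = doc[0:doc_cutoff].as_doc()
print(type(first_par).__name__)     # Doc
print(first_par.text)               # "First paragraph used as context."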
@@ -1144,7 +1142,7 @@ class EntityLinker(Pipe):
                     entity_encoding = c.entity_vector
 
                     entity_encodings.append(entity_encoding)
-                    article_docs.append(doc)
+                    article_docs.append(first_par)
                     sentence_docs.append(sentence.as_doc())
 
         if len(entity_encodings) > 0:
@@ -1158,6 +1156,10 @@ class EntityLinker(Pipe):
             entity_encodings = np.asarray(entity_encodings, dtype=np.float32)
 
             loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None)
+            # print("scores", mention_encodings)
+            # print("golds", entity_encodings)
+            # print("loss", loss)
+            # print("d_scores", d_scores)
 
             mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention)
 
@@ -1180,9 +1182,26 @@ class EntityLinker(Pipe):
         return 0
 
     def get_loss(self, docs, golds, scores):
-        loss, gradients = get_cossim_loss(scores, golds)
+        targets = [[1] for _ in golds]  # assuming we're only using positive examples
+        loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets)
+        #loss = loss / len(golds)
         return loss, gradients
 
+    def get_cossim_loss_2(self, yh, y, t):
+        # Add a small constant to avoid 0 vectors
+        yh = yh + 1e-8
+        y = y + 1e-8
+        # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
+        xp = get_array_module(yh)
+        norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
+        norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
+        mul_norms = norm_yh * norm_y
+        cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms
+        d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2))
+        loss = xp.abs(cos - t).sum()
+        inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))])
+        return loss, -inverse
+
     def __call__(self, doc):
         entities, kb_ids = self.predict([doc])
         self.set_annotations([doc], entities, kb_ids)
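A quick numpy sanity check (not part of the commit) of the gradient used in get_cossim_loss_2: with t = 1 and cos <= 1, the loss |cos(yh, y) - t| reduces to 1 - cos, so its gradient with respect to yh should be -d_yh, which is exactly what the method returns. A finite difference on one coordinate confirms it:

import numpy as np

yh = np.random.rand(2, 4) + 1e-8
y = np.random.rand(2, 4) + 1e-8
t = np.array([[1], [1]])

norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
norm_y = np.linalg.norm(y, axis=1, keepdims=True)
cos = (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)
d_yh = (y / (norm_yh * norm_y)) - cos * (yh / norm_yh ** 2)

# nudge yh[0, 0] and compare the numerical slope with the analytical gradient
eps = 1e-6
yh2 = yh.copy()
yh2[0, 0] += eps
cos2 = (yh2 * y).sum(axis=1, keepdims=True) / (np.linalg.norm(yh2, axis=1, keepdims=True) * norm_y)
num_grad = (np.abs(cos2 - t).sum() - np.abs(cos - t).sum()) / eps
print(num_grad, -d_yh[0, 0])   # the two values should roughly agree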
@@ -1220,6 +1239,7 @@ class EntityLinker(Pipe):
                     score = prior_prob + sim - (prior_prob*sim)  # put weights on the different factors ?
                     scores.append(score)
 
+                # TODO: thresholding
                 best_index = scores.index(max(scores))
                 best_candidate = candidates[best_index]
                 final_entities.append(ent)
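The score formula in the last hunk, prior_prob + sim - prior_prob*sim, is the probabilistic OR of the two signals, 1 - (1 - p)(1 - q), when both lie in [0, 1]: either a strong prior or a strong context similarity alone is enough for a high score. For example:

def combine(prior_prob, sim):
    # noisy-OR combination, assuming both inputs are in [0, 1]
    return prior_prob + sim - prior_prob * sim

print(combine(0.9, 0.1))   # 0.91 -> driven by the prior
print(combine(0.1, 0.9))   # 0.91 -> driven by the similarity
print(combine(0.5, 0.5))   # 0.75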
Loading…
Reference in New Issue
Block a user