spaCy/examples/keras_parikh_entailment/spacy_hook.py

78 lines
2.4 KiB
Python
Raw Normal View History

import numpy as np
2016-11-01 03:51:54 +03:00
from keras.models import model_from_json
2017-01-31 22:27:13 +03:00
try:
import cPickle as pickle
except ImportError:
import pickle
2016-11-01 03:51:54 +03:00
class KerasSimilarityShim(object):
    """Route spaCy's ``similarity`` hooks through a Keras entailment model.

    Calling an instance on a doc installs :meth:`predict` as the
    ``"similarity"`` entry of both ``doc.user_hooks`` and
    ``doc.user_span_hooks`` and returns the doc.
    """

    # Labels looked up by the argmax of the model's output scores.
    entailment_types = ["entailment", "contradiction", "neutral"]

    @classmethod
    def load(cls, path, nlp, max_length=100, get_features=None):
        """Create a shim from a saved model directory.

        path: directory object supporting ``/`` and ``.open()`` (e.g. a
            ``pathlib.Path``). It must contain ``config.json`` (Keras model
            architecture as JSON) and ``model`` (pickled list of weights).
        nlp: object whose ``vocab`` is used to rebuild the embedding matrix.
        max_length: forwarded to ``get_features`` on every prediction.
        get_features: callable ``(docs, max_length=...)`` producing the model
            input for a list of docs. Defaults to ``get_word_ids``.

        Returns a new instance of ``cls`` wrapping the restored model.
        """
        if get_features is None:
            get_features = get_word_ids

        with (path / "config.json").open() as file_:
            model = model_from_json(file_.read())

        # SECURITY: unpickling can execute arbitrary code. Only load model
        # directories that come from a trusted source.
        with (path / "model").open("rb") as file_:
            weights = pickle.load(file_)

        # The embedding matrix is rebuilt from the vocab and spliced into the
        # loaded weight list at index 1 before the weights are applied.
        embeddings = get_embeddings(nlp.vocab)
        weights.insert(1, embeddings)
        model.set_weights(weights)

        return cls(model, get_features=get_features, max_length=max_length)

    def __init__(self, model, get_features=None, max_length=100):
        """Wrap ``model``.

        model: object with a ``predict([x1, x2])`` method returning scores.
        get_features: callable ``(docs, max_length=...)``. ``None`` selects
            ``get_word_ids``, the same default that ``load`` applies, so a
            directly constructed shim is usable without further setup.
        max_length: forwarded to ``get_features`` on every prediction.
        """
        if get_features is None:
            get_features = get_word_ids
        self.model = model
        self.get_features = get_features
        self.max_length = max_length

    def __call__(self, doc):
        """Install :meth:`predict` as the similarity hook on ``doc``; return ``doc``."""
        doc.user_hooks["similarity"] = self.predict
        doc.user_span_hooks["similarity"] = self.predict
        return doc

    def predict(self, doc1, doc2):
        """Score the pair ``(doc1, doc2)`` with the wrapped model.

        Each doc is featurised on its own as a one-element batch.

        Returns a ``(label, score)`` tuple: the entry of ``entailment_types``
        at the position of the highest score, and that highest score.
        """
        x1 = self.get_features([doc1], max_length=self.max_length)
        x2 = self.get_features([doc2], max_length=self.max_length)
        scores = self.model.predict([x1, x2])
        return self.entailment_types[scores.argmax()], scores.max()
2016-11-01 03:51:54 +03:00
def get_embeddings(vocab, nr_unk=100):
    """Build the embedding matrix for ``vocab`` as a float32 array.

    Row layout (the same indexing that ``get_word_ids`` produces):

    * row ``0``: all zeros, used for padding;
    * rows ``1 .. nr_unk``: random unit-length vectors for tokens that have
      no vector;
    * row ``nr_unk + 1 + rank``: the L2-normalised vector of the lexeme with
      that rank, left as zeros when the lexeme has no vector or a zero norm.

    vocab: iterable of lexemes (each with ``rank``, ``has_vector``,
        ``vector`` and ``vector_norm``) that also exposes ``vectors_length``.
    nr_unk: number of random out-of-vocabulary rows.

    Returns an array of shape
    ``(max_rank + 2 + nr_unk, vocab.vectors_length)``.
    Raises ``ValueError`` (from ``max``) when ``vocab`` is empty.
    """
    # the extra +1 is for a zero vector representing sentence-final padding
    num_vectors = max(lex.rank for lex in vocab) + 2
    width = vocab.vectors_length

    # create random vectors for OOV tokens
    oov = np.random.normal(size=(nr_unk, width))
    # Scale each row to unit L2 length so OOV vectors live on the same scale
    # as the real vectors below. Dividing by the row *sum* does not normalise:
    # the sum of normal samples can be close to zero or negative, which yields
    # arbitrarily large or sign-flipped rows.
    oov = oov / np.linalg.norm(oov, axis=1, keepdims=True)

    vectors = np.zeros((num_vectors + nr_unk, width), dtype="float32")
    vectors[1 : nr_unk + 1] = oov
    for lex in vocab:
        if lex.has_vector and lex.vector_norm > 0:
            vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm

    return vectors
def get_word_ids(docs, max_length=100, nr_unk=100):
    """Convert ``docs`` into a zero-padded int32 matrix of embedding row ids.

    docs: sequence of token iterables; every token exposes ``has_vector``
        and ``rank``.
    max_length: number of columns; tokens beyond it are dropped.
    nr_unk: size of the id range reserved for tokens without a vector.

    Returns an array of shape ``(len(docs), max_length)``. A token with a
    vector gets ``rank + nr_unk + 1``; a token without one gets
    ``rank % nr_unk + 1``; unused trailing positions stay ``0``.
    """
    ids = np.zeros((len(docs), max_length), dtype="int32")
    for row, doc in enumerate(docs):
        # Pairing with range(max_length) truncates each doc to max_length tokens.
        for col, token in zip(range(max_length), doc):
            base = token.rank + nr_unk if token.has_vector else token.rank % nr_unk
            ids[row, col] = base + 1
    return ids