mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			78 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			78 lines
		
	
	
		
			2.4 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import numpy as np
 | 
						|
from keras.models import model_from_json
 | 
						|
 | 
						|
try:
 | 
						|
    import cPickle as pickle
 | 
						|
except ImportError:
 | 
						|
    import pickle
 | 
						|
 | 
						|
 | 
						|
class KerasSimilarityShim(object):
    """Pipeline component that answers ``similarity`` calls with a Keras model.

    Calling the shim on a doc installs :meth:`predict` as the ``"similarity"``
    entry of ``doc.user_hooks`` and ``doc.user_span_hooks``.
    """

    # Label order used to decode the model output: the index of the highest
    # score is looked up in this list.
    entailment_types = ["entailment", "contradiction", "neutral"]

    @classmethod
    def load(cls, path, nlp, max_length=100, get_features=None):
        """Build a shim from a model saved under ``path``.

        path: directory-like object supporting ``path / name`` and ``.open()``;
            must contain ``config.json`` (Keras model JSON) and ``model``
            (pickled list of weight arrays).
        nlp: object whose ``vocab`` is passed to ``get_embeddings``.
        max_length: forwarded to the constructor.
        get_features: feature extractor; defaults to ``get_word_ids``.
        RETURNS: a new instance of ``cls`` wrapping the restored model.
        """
        if get_features is None:
            get_features = get_word_ids

        with (path / "config.json").open() as file_:
            model = model_from_json(file_.read())
        # SECURITY: unpickling can execute arbitrary code. Only load model
        # directories that come from a trusted source.
        with (path / "model").open("rb") as file_:
            weights = pickle.load(file_)

        # The embedding table is rebuilt from the vocab and spliced in at
        # index 1 of the weight list before the weights are applied.
        # NOTE(review): presumably the saved weights omit the embedding layer
        # to keep the file small -- confirm against the training script.
        embeddings = get_embeddings(nlp.vocab)
        weights.insert(1, embeddings)
        model.set_weights(weights)

        return cls(model, get_features=get_features, max_length=max_length)

    def __init__(self, model, get_features=None, max_length=100):
        """Store the model and the feature extraction settings.

        model: object exposing ``predict(inputs)``.
        get_features: callable ``(docs, max_length=...)`` producing the model
            inputs. Defaults to ``get_word_ids``, matching ``load``, so a
            directly constructed shim is usable without further setup.
        max_length: passed to ``get_features`` on every prediction.
        """
        if get_features is None:
            get_features = get_word_ids
        self.model = model
        self.get_features = get_features
        self.max_length = max_length

    def __call__(self, doc):
        """Install ``predict`` as the doc's similarity hooks and return the doc."""
        doc.user_hooks["similarity"] = self.predict
        doc.user_span_hooks["similarity"] = self.predict

        return doc

    def predict(self, doc1, doc2):
        """Score a pair of docs with the wrapped model.

        Each doc is featurized on its own (as a batch of one) and both
        feature arrays are handed to ``model.predict`` as a two-element list.

        RETURNS: tuple ``(label, score)`` where ``label`` is the entry of
            ``entailment_types`` at the arg-max of the scores and ``score`` is
            the maximum score.
        """
        x1 = self.get_features([doc1], max_length=self.max_length)
        x2 = self.get_features([doc2], max_length=self.max_length)
        scores = self.model.predict([x1, x2])

        # argmax() with no axis is an index into the flattened scores, so this
        # relies on the model returning one score per entailment type.
        return self.entailment_types[scores.argmax()], scores.max()
 | 
						|
 | 
						|
 | 
						|
def get_embeddings(vocab, nr_unk=100):
    """Build the embedding table used by the Keras model.

    Row layout of the returned array:
        0                      zero vector (padding)
        1 .. nr_unk            random unit-length vectors for OOV tokens
        nr_unk + 1 + rank      L2-normalized vector of the lexeme with that rank

    Rows of lexemes without a vector (or with a zero norm) stay all-zero.
    The OOV rows are drawn from ``np.random`` on every call, so they are not
    reproducible unless the caller seeds NumPy.

    vocab: iterable of lexemes exposing ``rank``, ``has_vector``, ``vector``
        and ``vector_norm``; must also expose ``vectors_length``.
    nr_unk: number of random OOV rows to reserve.
    RETURNS: float32 array of shape
        ``(max_rank + 2 + nr_unk, vocab.vectors_length)``.
    RAISES: ValueError if ``vocab`` yields no lexemes (``max`` of empty input).
    """
    # Ranks are zero-based, so max rank + 1 rows cover the vocabulary; the
    # extra +1 is for a zero vector representing sentence-final padding.
    num_vectors = max(lex.rank for lex in vocab) + 2

    # Create random vectors for OOV tokens and scale each row to unit L2
    # length, the same normalization applied to the real vectors below.
    # (Dividing by the row *sum* would be wrong: the sum of Gaussian samples
    # is zero-mean, so it can flip the sign or blow the row up arbitrarily.)
    oov = np.random.normal(size=(nr_unk, vocab.vectors_length))
    norms = np.linalg.norm(oov, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave an all-zero row untouched instead of 0/0
    oov = oov / norms

    vectors = np.zeros((num_vectors + nr_unk, vocab.vectors_length), dtype="float32")
    vectors[1 : nr_unk + 1] = oov
    for lex in vocab:
        if lex.has_vector and lex.vector_norm > 0:
            vectors[nr_unk + lex.rank + 1] = lex.vector / lex.vector_norm

    return vectors
 | 
						|
 | 
						|
 | 
						|
def get_word_ids(docs, max_length=100, nr_unk=100):
    """Convert docs into a matrix of embedding-table row indices.

    Each doc fills one row, truncated to ``max_length`` tokens; unused
    trailing positions keep the value 0. A token with a vector is encoded as
    ``rank + nr_unk + 1``; a token without one is bucketed into
    ``rank % nr_unk + 1``, i.e. one of the indices ``1 .. nr_unk``.

    docs: sized sequence of iterables of tokens exposing ``rank`` and
        ``has_vector``.
    max_length: number of columns in the output.
    nr_unk: number of index slots shared by tokens without a vector.
    RETURNS: int32 array of shape ``(len(docs), max_length)``.
    """
    ids = np.zeros((len(docs), max_length), dtype="int32")

    for row, doc in enumerate(docs):
        # Pairing with range(max_length) stops after max_length tokens.
        for col, token in zip(range(max_length), doc):
            if not token.has_vector:
                ids[row, col] = token.rank % nr_unk + 1
                continue
            ids[row, col] = token.rank + nr_unk + 1
    return ids
 |