# spaCy/examples/keras_parikh_entailment/spacy_hook.py

from collections import deque
import json
import pickle

import numpy
from keras.models import model_from_json


class KerasSimilarityShim(object):
    """Install a Keras model as the ``similarity`` hook of a document.

    Calling an instance on a doc registers :meth:`predict` under the
    ``'similarity'`` key of both ``doc.user_hooks`` and
    ``doc.user_span_hooks``.
    """

    @classmethod
    def load(cls, path, nlp, get_features=None):
        """Build a shim from a model directory.

        Args:
            path: Path-like object supporting ``/`` and ``.open()``. It must
                contain ``config.json`` (with a ``'model'`` entry holding the
                Keras JSON architecture) and ``model`` (pickled list of
                weights, excluding the embedding matrix).
            nlp: Object whose ``vocab`` is passed to ``get_embeddings`` to
                build the embedding matrix.
            get_features: Callable mapping one doc to model input. When
                ``None``, the default single-doc adapter around
                ``get_word_ids`` is used.

        Returns:
            A new instance wrapping the restored model.
        """
        with (path / 'config.json').open() as file_:
            config = json.load(file_)
        model = model_from_json(config['model'])
        # NOTE(security): unpickling executes arbitrary code; only load model
        # directories that come from a trusted source.
        with (path / 'model').open('rb') as file_:
            weights = pickle.load(file_)
        embeddings = get_embeddings(nlp.vocab)
        # The embedding matrix is rebuilt from the vocab rather than stored,
        # so it is prepended to the pickled weights.
        model.set_weights([embeddings] + weights)
        return cls(model, get_features=get_features)

    def __init__(self, model, get_features=None):
        """Store the model and the feature extractor.

        Args:
            model: Object exposing ``predict(inputs)``.
            get_features: Callable mapping one doc to model input; defaults
                to a single-doc adapter around ``get_word_ids``.
        """
        self.model = model
        if get_features is None:
            get_features = self._default_features
        self.get_features = get_features

    @staticmethod
    def _default_features(doc):
        """Return ``get_word_ids`` features for a single doc as a batch of one."""
        # NOTE(review): get_word_ids reads ``doc.sents``; confirm that spans
        # passed through the span hook expose it as well.
        return get_word_ids([doc])

    def __call__(self, doc):
        """Register :meth:`predict` as the similarity hook on ``doc``."""
        doc.user_hooks['similarity'] = self.predict
        doc.user_span_hooks['similarity'] = self.predict

    def predict(self, doc1, doc2):
        """Return the first element of the model's prediction for the pair.

        Both docs are converted with ``self.get_features`` and passed to
        ``self.model.predict`` as a two-element list.
        """
        x1 = self.get_features(doc1)
        x2 = self.get_features(doc2)
        scores = self.model.predict([x1, x2])
        return scores[0]


def get_embeddings(vocab):
    """Build the embedding matrix for every lexeme in ``vocab`` that has a vector.

    Row ``lex.rank + 1`` holds ``lex.vector``. Row 0 and the rows of ranks
    without a vector are all zeros, so index 0 can be used as padding (it is
    the fill value ``get_word_ids`` leaves in unused slots).

    Args:
        vocab: Re-iterable collection of lexemes exposing ``rank``,
            ``has_vector`` and ``vector``; must also expose
            ``vectors_length``.

    Returns:
        ``float32`` array of shape ``(max_rank + 2, vocab.vectors_length)``.

    Raises:
        ValueError: If no lexeme in ``vocab`` has a vector.
    """
    ranks = [lex.rank for lex in vocab if lex.has_vector]
    if not ranks:
        raise ValueError("Cannot build embeddings: no lexeme in vocab has a vector")
    # +1 for the rank -> row shift, +1 because rows are zero-indexed.
    nr_row = max(ranks) + 2
    # zeros (not ndarray) so rows that are never assigned are well defined.
    vectors = numpy.zeros((nr_row, vocab.vectors_length), dtype='float32')
    for lex in vocab:
        if lex.has_vector:
            vectors[lex.rank + 1] = lex.vector
    return vectors


def get_word_ids(docs, tree_truncate=False, max_length=100):
    """Convert parsed docs into a zero-padded matrix of word ids.

    For each doc the dependency trees are walked breadth-first, starting from
    the root of every sentence, so that when a doc has more than
    ``max_length`` usable words the ones closest to the roots are kept.
    Tokens that have no vector, are punctuation or are whitespace are
    skipped, but their children are still visited. The kept tokens are then
    sorted (using the tokens' own ordering) and written as ``rank + 1``,
    leaving 0 for unused slots.

    Args:
        docs: Sequence of docs exposing ``sents``; each sentence exposes
            ``root`` and each token ``lefts``, ``rights``, ``rank``,
            ``has_vector``, ``is_punct`` and ``is_space``.
        tree_truncate: Currently unused; kept for interface compatibility.
        max_length: Number of columns, i.e. the maximum words kept per doc.

    Returns:
        ``int32`` array of shape ``(len(docs), max_length)``.
    """
    Xs = numpy.zeros((len(docs), max_length), dtype='int32')
    for i, doc in enumerate(docs):
        queue = deque(sent.root for sent in doc.sents)
        words = []
        # Strict '<' so exactly max_length words are collected at most.
        while queue and len(words) < max_length:
            word = queue.popleft()
            if word.has_vector and not word.is_punct and not word.is_space:
                words.append(word)
            # Enqueue children unconditionally: a skipped head must not hide
            # the usable words in its subtree.
            queue.extend(word.lefts)
            queue.extend(word.rights)
        words.sort()
        for j, token in enumerate(words):
            Xs[i, j] = token.rank + 1
    return Xs


def create_similarity_pipeline(nlp):
    """Return a one-element pipeline holding the loaded similarity shim.

    The model is loaded from ``nlp.path / 'similarity'`` via
    ``KerasSimilarityShim.load``.

    Args:
        nlp: Object exposing ``path`` (supports ``/``) and ``vocab``.

    Returns:
        A list containing a single ``KerasSimilarityShim``.
    """
    def doc_features(doc):
        # get_word_ids works on a batch of docs, while the similarity hook
        # hands over one doc at a time; wrap it as a batch of one.
        return get_word_ids([doc])

    return [KerasSimilarityShim.load(
                nlp.path / 'similarity',
                nlp,
                get_features=doc_features)]
Rename entailment example 2016-11-01 03:51:54 +03:00			`from keras.models import model_from_json`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 20:43:37 +03:00			`import numpy`
Rename entailment example 2016-11-01 03:51:54 +03:00

			`class KerasSimilarityShim(object):`
			`@classmethod`
			`def load(cls, path, nlp, get_features=None):`
			`if get_features is None:`
			`get_features = doc2ids`
			`with (path / 'config.json').open() as file_:`
			`config = json.load(file_)`
			`model = model_from_json(config['model'])`
			`with (path / 'model').open('rb') as file_:`
			`weights = pickle.load(file_)`
			`embeddings = get_embeddings(nlp.vocab)`
			`model.set_weights([embeddings] + weights)`
			`return cls(model, get_features=get_features)`

			`def __init__(self, model, get_features=None):`
			`self.model = model`
			`self.get_features = get_features`

			`def __call__(self, doc):`
			`doc.user_hooks['similarity'] = self.predict`
			`doc.user_span_hooks['similarity'] = self.predict`

			`def predict(self, doc1, doc2):`
			`x1 = self.get_features(doc1)`
			`x2 = self.get_features(doc2)`
			`scores = self.model.predict([x1, x2])`
			`return scores[0]`


Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 20:43:37 +03:00			`def get_embeddings(vocab):`
Rename entailment example 2016-11-01 03:51:54 +03:00			`max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)`
			`vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')`
			`for lex in vocab:`
			`if lex.has_vector:`
			`vectors[lex.rank + 1] = lex.vector`
			`return vectors`


Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 20:43:37 +03:00			`def get_word_ids(docs, tree_truncate=False, max_length=100):`
Rename entailment example 2016-11-01 03:51:54 +03:00			`Xs = numpy.zeros((len(docs), max_length), dtype='int32')`
			`for i, doc in enumerate(docs):`
			`j = 0`
Fix entailment example, and add a flag for BiRNN encoding. 2016-11-12 20:43:37 +03:00			`queue = [sent.root for sent in doc.sents]`
			`words = []`
			`while len(words) <= max_length and queue:`
			`word = queue.pop(0)`
			`if word.has_vector and not word.is_punct and not word.is_space:`
			`words.append(word)`
			`queue.extend(list(word.lefts))`
			`queue.extend(list(word.rights))`
			`words.sort()`
			`for j, token in enumerate(words):`
			`Xs[i, j] = token.rank + 1`
			`j += 1`
			`if j >= max_length:`
			`break`
Rename entailment example 2016-11-01 03:51:54 +03:00			`return Xs`


			`def create_similarity_pipeline(nlp):`
			`return [SimilarityModel.load(`
			`nlp.path / 'similarity',`
			`nlp,`
			`feature_extracter=get_features)]`