import numpy as np
import json
from keras.utils import to_categorical
import plac
import sys

from keras_decomposable_attention import build_model
from spacy_hook import get_embeddings, KerasSimilarityShim

try:
    import cPickle as pickle
except ImportError:
    import pickle

import spacy

# workaround for keras/tensorflow bug
# see https://github.com/tensorflow/tensorflow/issues/3388
import os
import importlib
from keras import backend as K


def set_keras_backend(backend):
    if K.backend() != backend:
        os.environ["KERAS_BACKEND"] = backend
        importlib.reload(K)
        assert K.backend() == backend
    if backend == "tensorflow":
        K.get_session().close()
        cfg = K.tf.ConfigProto()
        cfg.gpu_options.allow_growth = True
        K.set_session(K.tf.Session(config=cfg))
        K.clear_session()


set_keras_backend("tensorflow")


def train(train_loc, dev_loc, shape, settings):
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load("en_vectors_web_lg")
    assert nlp.path is not None

    print("Processing texts...")
    train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
    dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])

    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)

    print(settings)
    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        epochs=settings["nr_epoch"],
        batch_size=settings["batch_size"],
    )

    if not (nlp.path / "similarity").exists():
        (nlp.path / "similarity").mkdir()
    print("Saving to", nlp.path / "similarity")
    weights = model.get_weights()
    # remove the embedding matrix. We can reconstruct it.
    del weights[1]
    with (nlp.path / "similarity" / "model").open("wb") as file_:
        pickle.dump(weights, file_)
    with (nlp.path / "similarity" / "config.json").open("w") as file_:
        file_.write(model.to_json())


def evaluate(dev_loc, shape):
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))

    total = 0.0
    correct = 0.0
    for text1, text2, label in zip(dev_texts1, dev_texts2, dev_labels):
        doc1 = nlp(text1)
        doc2 = nlp(text2)
        sim, _ = doc1.similarity(doc2)
        if sim == KerasSimilarityShim.entailment_types[label.argmax()]:
            correct += 1
        total += 1
    return correct, total


def demo(shape):
    nlp = spacy.load("en_vectors_web_lg")
    nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))

    doc1 = nlp(u"The king of France is bald.")
    doc2 = nlp(u"France has no king.")

    print("Sentence 1:", doc1)
    print("Sentence 2:", doc2)

    entailment_type, confidence = doc1.similarity(doc2)
    print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")


LABELS = {"entailment": 0, "contradiction": 1, "neutral": 2}


def read_snli(path):
    texts1 = []
    texts2 = []
    labels = []
    with open(path, "r") as file_:
        for line in file_:
            eg = json.loads(line)
            label = eg["gold_label"]
            if label == "-":
                # per Parikh, ignore - SNLI entries
                continue
            texts1.append(eg["sentence1"])
            texts2.append(eg["sentence2"])
            labels.append(LABELS[label])
    return texts1, texts2, to_categorical(np.asarray(labels, dtype="int32"))


def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
    sents = texts + hypotheses
    sents_as_ids = []
    for sent in sents:
        doc = nlp(sent)
        word_ids = []
        for i, token in enumerate(doc):
            # skip odd spaces from tokenizer
            if token.has_vector and token.vector_norm == 0:
                continue
            if i > max_length:
                break
            if token.has_vector:
                word_ids.append(token.rank + num_unk + 1)
            else:
                # if we don't have a vector, pick an OOV entry
                word_ids.append(token.rank % num_unk + 1)

        # there must be a simpler way of generating padded arrays from lists...
        word_id_vec = np.zeros((max_length), dtype="int")
        clipped_len = min(max_length, len(word_ids))
        word_id_vec[:clipped_len] = word_ids[:clipped_len]
        sents_as_ids.append(word_id_vec)

    return [np.array(sents_as_ids[: len(texts)]), np.array(sents_as_ids[len(texts) :])]


@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    train_loc=("Path to training data", "option", "t", str),
    dev_loc=("Path to development or test data", "option", "s", str),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "r", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "e", int),
    entail_dir=(
        "Direction of entailment",
        "option",
        "D",
        str,
        ["both", "left", "right"],
    ),
)
def main(
    mode,
    train_loc,
    dev_loc,
    max_length=50,
    nr_hidden=200,
    dropout=0.2,
    learn_rate=0.001,
    batch_size=1024,
    nr_epoch=10,
    entail_dir="both",
):
    shape = (max_length, nr_hidden, 3)
    settings = {
        "lr": learn_rate,
        "dropout": dropout,
        "batch_size": batch_size,
        "nr_epoch": nr_epoch,
        "entail_dir": entail_dir,
    }

    if mode == "train":
        if train_loc is None or dev_loc is None:
            print("Train mode requires paths to training and development data sets.")
            sys.exit(1)
        train(train_loc, dev_loc, shape, settings)
    elif mode == "evaluate":
        if dev_loc is None:
            print("Evaluate mode requires a path to the test data set.")
            sys.exit(1)
        correct, total = evaluate(dev_loc, shape)
        print(correct, "/", total, correct / total)
    else:
        demo(shape)


if __name__ == "__main__":
    plac.call(main)
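

# Example invocations (a sketch: the entry-point name and the SNLI JSONL paths
# below are illustrative assumptions, not bundled with this file; the flags
# correspond to the plac annotations on main() above):
#
#   python __main__.py train -t snli_1.0_train.jsonl -s snli_1.0_dev.jsonl
#   python __main__.py evaluate -s snli_1.0_test.jsonl
#   python __main__.py demo
#
# "train" saves the learned weights and the model config under the loaded spaCy
# model's "similarity" directory; "evaluate" and "demo" reload them through
# KerasSimilarityShim.load.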