mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 10:26:35 +03:00
9faea3ff10
* bug fixes in keras example * created contributor agreement * baseline for Parikh model * initial version of parikh 2016 implemented * tested asymmetric models * fixed grievous error in normalization * use standard SNLI test file * begin to rework parikh example * initial version of running example * start to document the new version * start to document the new version * Update Decompositional Attention.ipynb * fixed calls to similarity * updated the README * import sys package duh * simplified indexing on mapping word to IDs * stupid python indent error * added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
198 lines
6.3 KiB
Python
198 lines
6.3 KiB
Python
import numpy as np
|
|
import ujson as json
|
|
from keras.utils import to_categorical
|
|
import plac
|
|
import sys
|
|
|
|
from keras_decomposable_attention import build_model
|
|
from spacy_hook import get_embeddings, KerasSimilarityShim
|
|
|
|
try:
|
|
import cPickle as pickle
|
|
except ImportError:
|
|
import pickle
|
|
|
|
import spacy
|
|
|
|
# workaround for keras/tensorflow bug
|
|
# see https://github.com/tensorflow/tensorflow/issues/3388
|
|
import os
|
|
import importlib
|
|
from keras import backend as K
|
|
|
|
def set_keras_backend(backend):
    """Make `backend` the active Keras backend and, for TensorFlow, restart its session.

    If Keras is currently running on a different backend, the
    ``KERAS_BACKEND`` environment variable is updated and the backend module
    is reloaded so the change takes effect.

    When `backend` is ``"tensorflow"`` the current session is closed and
    replaced by one whose config has ``gpu_options.allow_growth`` enabled;
    this is the workaround discussed in
    https://github.com/tensorflow/tensorflow/issues/3388.
    """
    needs_switch = K.backend() != backend
    if needs_switch:
        os.environ['KERAS_BACKEND'] = backend
        importlib.reload(K)
        assert K.backend() == backend

    # Everything below is TensorFlow-specific session handling.
    if backend != "tensorflow":
        return

    K.get_session().close()
    session_config = K.tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    K.set_session(K.tf.Session(config=session_config))
    K.clear_session()


set_keras_backend("tensorflow")
|
|
|
|
|
|
def train(train_loc, dev_loc, shape, settings):
    """Train the entailment model on SNLI data and save it next to the spaCy model.

    train_loc, dev_loc: paths handed to `read_snli` for the training and
        validation examples.
    shape: tuple passed through to `build_model`; its first entry is used as
        the maximum sentence length when building the datasets.
    settings: dict passed through to `build_model`; 'nr_epoch' and
        'batch_size' are also read here to drive `model.fit`.

    The trained weights (minus the embedding matrix) are pickled to
    ``<nlp.path>/similarity/model`` and the model architecture is written as
    JSON to ``<nlp.path>/similarity/config.json``.
    """
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load('en_vectors_web_lg')
    assert nlp.path is not None

    print("Processing texts...")
    sentence_length = shape[0]
    train_X = create_dataset(nlp, train_texts1, train_texts2, 100, sentence_length)
    dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, sentence_length)

    print("Compiling network")
    embeddings = get_embeddings(nlp.vocab)
    model = build_model(embeddings, shape, settings)

    print(settings)
    fit_options = {
        'validation_data': (dev_X, dev_labels),
        'epochs': settings['nr_epoch'],
        'batch_size': settings['batch_size'],
    }
    model.fit(train_X, train_labels, **fit_options)

    output_dir = nlp.path / 'similarity'
    if not output_dir.exists():
        output_dir.mkdir()
    print("Saving to", output_dir)

    weights = model.get_weights()
    # remove the embedding matrix. We can reconstruct it.
    del weights[1]

    with (output_dir / 'model').open('wb') as weights_file:
        pickle.dump(weights, weights_file)
    with (output_dir / 'config.json').open('w') as config_file:
        config_file.write(model.to_json())
|
|
|
|
|
|
def evaluate(dev_loc, shape):
    """Score the saved entailment model on the SNLI file at `dev_loc`.

    The saved model is loaded through `KerasSimilarityShim` from
    ``<nlp.path>/similarity`` and added to the pipeline; `shape[0]` is passed
    to the shim loader as its third argument.

    Returns a ``(correct, total)`` pair of floats: the number of sentence
    pairs whose predicted entailment type equals the gold one, and the number
    of pairs evaluated.
    """
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
    nlp = spacy.load('en_vectors_web_lg')
    shim = KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0])
    nlp.add_pipe(shim)

    # Kept as floats so the caller's correct / total is a true division.
    n_correct = 0.
    n_total = 0.
    for premise_text, hypothesis_text, gold in zip(dev_texts1, dev_texts2, dev_labels):
        premise = nlp(premise_text)
        hypothesis = nlp(hypothesis_text)
        # The hooked similarity() yields two values; only the first is compared.
        predicted, _ = premise.similarity(hypothesis)
        # Gold labels are one-hot rows, so argmax recovers the class index.
        expected = KerasSimilarityShim.entailment_types[gold.argmax()]
        if predicted == expected:
            n_correct += 1
        n_total += 1
    return n_correct, n_total
|
|
|
|
|
|
def demo(shape):
    """Run the saved entailment model on one hard-coded sentence pair and print the result.

    `shape[0]` is forwarded to `KerasSimilarityShim.load` together with the
    ``<nlp.path>/similarity`` directory the model is read from.
    """
    nlp = spacy.load('en_vectors_web_lg')
    shim = KerasSimilarityShim.load(nlp.path / 'similarity', nlp, shape[0])
    nlp.add_pipe(shim)

    premise = nlp(u'The king of France is bald.')
    hypothesis = nlp(u'France has no king.')

    print("Sentence 1:", premise)
    print("Sentence 2:", hypothesis)

    # With the shim installed, similarity() is unpacked into a label and a score.
    entailment_type, confidence = premise.similarity(hypothesis)
    print("Entailment type:", entailment_type, "(Confidence:", confidence, ")")
|
|
|
|
|
|
# Maps each SNLI gold label to its class index in the one-hot label matrix.
LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}


def read_snli(path):
    """Read an SNLI JSON-lines file into parallel sentence lists and one-hot labels.

    Each line of `path` is parsed as a JSON object with the keys
    'gold_label', 'sentence1' and 'sentence2'. Examples whose gold label is
    '-' are skipped.

    Returns ``(texts1, texts2, labels)`` where `texts1` and `texts2` are lists
    of strings and `labels` is the one-hot matrix produced by
    `to_categorical`, with one column per entry in `LABELS`.

    Raises KeyError if a line carries a gold label that is not in `LABELS`.
    """
    texts1 = []
    texts2 = []
    labels = []
    with open(path, 'r') as file_:
        for line in file_:
            eg = json.loads(line)
            label = eg['gold_label']
            if label == '-':  # per Parikh, ignore - SNLI entries
                continue
            texts1.append(eg['sentence1'])
            texts2.append(eg['sentence2'])
            labels.append(LABELS[label])
    # Fix the width explicitly: left to itself, to_categorical sizes the matrix
    # from the largest label present, so a file without 'neutral' rows would
    # come back with two columns and an empty file would fail outright.
    one_hot = to_categorical(np.asarray(labels, dtype='int32'),
                             num_classes=len(LABELS))
    return texts1, texts2, one_hot
|
|
|
|
def create_dataset(nlp, texts, hypotheses, num_unk, max_length):
    """Convert two parallel lists of sentences into zero-padded word-id matrices.

    nlp: callable that turns a sentence into an iterable of tokens exposing
        `has_vector`, `vector_norm` and `rank`.
    texts, hypotheses: lists of sentences; they are concatenated, processed
        in one pass, then split again at ``len(texts)``.
    num_unk: number of ids reserved for tokens without a vector.
    max_length: number of ids kept per sentence; shorter sentences are
        right-padded with 0.

    Id layout: 0 is padding, ``1..num_unk`` are out-of-vocabulary buckets
    (``rank % num_unk + 1``), and tokens with a vector get
    ``rank + num_unk + 1``.

    Returns a two-element list ``[text_ids, hypothesis_ids]`` of integer
    arrays, each with `max_length` columns.
    """
    sents = texts + hypotheses
    # One pre-zeroed matrix gives the padding for free and keeps the result
    # two-dimensional even when there are no sentences at all.
    ids = np.zeros((len(sents), max_length), dtype='int')

    for row, sent in enumerate(sents):
        word_ids = []
        for token in nlp(sent):
            # Truncate on the number of ids actually kept, so that skipped
            # tokens do not use up the length budget.
            if len(word_ids) >= max_length:
                break

            # skip odd spaces from tokenizer
            if token.has_vector and token.vector_norm == 0:
                continue

            if token.has_vector:
                word_ids.append(token.rank + num_unk + 1)
            else:
                # if we don't have a vector, pick an OOV entry
                word_ids.append(token.rank % num_unk + 1)

        if word_ids:
            ids[row, :len(word_ids)] = word_ids

    n_texts = len(texts)
    return [ids[:n_texts], ids[n_texts:]]
|
|
|
|
|
|
@plac.annotations(
    mode=("Mode to execute", "positional", None, str, ["train", "evaluate", "demo"]),
    train_loc=("Path to training data", "option", "t", str),
    dev_loc=("Path to development or test data", "option", "s", str),
    max_length=("Length to truncate sentences", "option", "L", int),
    nr_hidden=("Number of hidden units", "option", "H", int),
    dropout=("Dropout level", "option", "d", float),
    learn_rate=("Learning rate", "option", "r", float),
    batch_size=("Batch size for neural network training", "option", "b", int),
    nr_epoch=("Number of training epochs", "option", "e", int),
    entail_dir=("Direction of entailment", "option", "D", str, ["both", "left", "right"])
)
def main(mode, train_loc, dev_loc,
         max_length = 50,
         nr_hidden = 200,
         dropout = 0.2,
         learn_rate = 0.001,
         batch_size = 1024,
         nr_epoch = 10,
         entail_dir="both"):
    """Command-line entry point: dispatch to `train`, `evaluate` or `demo`.

    mode: 'train', 'evaluate' or 'demo'; any value other than the first two
        runs the demo.
    train_loc, dev_loc: data paths; both are required for 'train' and
        `dev_loc` for 'evaluate'. A missing path prints a message and exits
        with status 1.
    max_length, nr_hidden: first two entries of the model shape; the third
        is fixed at 3.
    dropout, learn_rate, batch_size, nr_epoch, entail_dir: collected into
        the settings dict handed to `train`.
    """
    shape = (max_length, nr_hidden, 3)
    settings = {
        'lr': learn_rate,
        'dropout': dropout,
        'batch_size': batch_size,
        'nr_epoch': nr_epoch,
        'entail_dir': entail_dir
    }

    if mode == 'train':
        if train_loc is None or dev_loc is None:
            print("Train mode requires paths to training and development data sets.")
            sys.exit(1)
        train(train_loc, dev_loc, shape, settings)
    elif mode == 'evaluate':
        if dev_loc is None:
            print("Evaluate mode requires paths to test data set.")
            sys.exit(1)
        correct, total = evaluate(dev_loc, shape)
        # Accuracy is undefined when no examples were evaluated; report NaN
        # rather than dying with ZeroDivisionError.
        accuracy = correct / total if total else float('nan')
        print(correct, '/', total, accuracy)
    else:
        demo(shape)


if __name__ == '__main__':
    plac.call(main)
|