mirror of https://github.com/explosion/spaCy.git
Fix entailment example, and add a flag for BiRNN encoding.
parent 2ee66117ba
commit 89df91846c
@@ -3,25 +3,46 @@ import spacy
 import plac
 from pathlib import Path
+import ujson as json
+import numpy
+from keras.utils.np_utils import to_categorical
 
 from spacy_hook import get_embeddings, get_word_ids
 from spacy_hook import create_similarity_pipeline
 
 from keras_decomposable_attention import build_model
 
 
 def train(model_dir, train_loc, dev_loc, shape, settings):
+    train_texts1, train_texts2, train_labels = read_snli(train_loc)
+    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)
+
     print("Loading spaCy")
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, matcher=False)
+    nlp = spacy.load('en')
     print("Compiling network")
     model = build_model(get_embeddings(nlp.vocab), shape, settings)
     print("Processing texts...")
-    train_X = get_features(list(nlp.pipe(train_texts)))
-    dev_X = get_features(list(nlp.pipe(dev_texts)))
+    train_X1 = get_word_ids(list(nlp.pipe(train_texts1, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    train_X2 = get_word_ids(list(nlp.pipe(train_texts2, n_threads=10, batch_size=10000)),
+                            max_length=shape[0],
+                            tree_truncate=settings['tree_truncate'])
+    dev_X1 = get_word_ids(list(nlp.pipe(dev_texts1, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+    dev_X2 = get_word_ids(list(nlp.pipe(dev_texts2, n_threads=10, batch_size=10000)),
+                          max_length=shape[0],
+                          tree_truncate=settings['tree_truncate'])
+
+    print(train_X1.shape, train_X2.shape)
+    print(dev_X1.shape, dev_X2.shape)
+    print(train_labels.shape, dev_labels.shape)
+    print(settings)
     model.fit(
-        train_X,
+        [train_X1, train_X2],
         train_labels,
-        validation_data=(dev_X, dev_labels),
+        validation_data=([dev_X1, dev_X2], dev_labels),
         nb_epoch=settings['nr_epoch'],
         batch_size=settings['batch_size'])
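Note: with this change the network is fed two word-ID matrices rather than a single feature array. As a rough sanity check (my sketch, not part of the commit; names follow the diff above), the shapes going into model.fit should be:

    # shape[0] is max_length; read_snli skips '-' labels, so all arrays line up
    assert train_X1.shape == (len(train_texts1), shape[0])      # int32 word IDs
    assert train_X2.shape == (len(train_texts2), shape[0])
    assert train_labels.shape == (len(train_texts1), 3)         # one-hot entailment/contradiction/neutral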
@@ -56,16 +77,20 @@ def demo(model_dir):
 
 
 LABELS = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
-def read_snli(loc):
-    with open(loc) as file_:
+def read_snli(path):
+    texts1 = []
+    texts2 = []
+    labels = []
+    with path.open() as file_:
         for line in file_:
             eg = json.loads(line)
             label = eg['gold_label']
             if label == '-':
                 continue
-            text1 = eg['sentence1']
-            text2 = eg['sentence2']
-            yield text1, text2, LABELS[label]
+            texts1.append(eg['sentence1'])
+            texts2.append(eg['sentence2'])
+            labels.append(LABELS[label])
+    return texts1, texts2, to_categorical(numpy.asarray(labels, dtype='int32'))
 
 
 @plac.annotations(
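Note: read_snli now materialises the three columns into lists and one-hot encodes the labels in a single pass, which is what the fit call above expects. A small illustration of the to_categorical step (my example, not in the diff):

    # labels [0, 2, 1] (entailment, neutral, contradiction) become a float one-hot matrix:
    # [[1., 0., 0.],
    #  [0., 0., 1.],
    #  [0., 1., 0.]]
    to_categorical(numpy.asarray([0, 2, 1], dtype='int32'))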
@@ -78,9 +103,13 @@ def read_snli(loc):
     dropout=("Dropout level", "option", "d", float),
     learn_rate=("Learning rate", "option", "e", float),
     batch_size=("Batch size for neural network training", "option", "b", float),
-    nr_epoch=("Number of training epochs", "option", "i", float)
+    nr_epoch=("Number of training epochs", "option", "i", float),
+    tree_truncate=("Truncate sentences by tree distance", "flag", "T", bool),
+    gru_encode=("Encode sentences with bidirectional GRU", "flag", "E", bool),
 )
 def main(mode, model_dir, train_loc, dev_loc,
+        tree_truncate=False,
+        gru_encode=False,
         max_length=100,
         nr_hidden=100,
         dropout=0.2,
@@ -92,7 +121,9 @@ def main(mode, model_dir, train_loc, dev_loc,
         'lr': learn_rate,
         'dropout': dropout,
         'batch_size': batch_size,
-        'nr_epoch': nr_epoch
+        'nr_epoch': nr_epoch,
+        'tree_truncate': tree_truncate,
+        'gru_encode': gru_encode
     }
     if mode == 'train':
         train(model_dir, train_loc, dev_loc, shape, settings)
@@ -101,6 +132,5 @@ def main(mode, model_dir, train_loc, dev_loc,
     else:
         demo(model_dir)
 
-
 if __name__ == '__main__':
     plac.call(main)
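Note: the two new plac "flag" annotations should surface as -T (tree_truncate) and -E (gru_encode) on the command line and default to False, so existing invocations keep their old behaviour. For reference, a settings dict as train() now expects it (the values here are illustrative, not from the commit):

    settings = {
        'lr': 0.001,
        'dropout': 0.2,
        'batch_size': 100,
        'nr_epoch': 5,
        'tree_truncate': True,   # -T
        'gru_encode': True       # -E
    }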
@@ -4,6 +4,7 @@ import numpy
 from keras.layers import InputSpec, Layer, Input, Dense, merge
 from keras.layers import Activation, Dropout, Embedding, TimeDistributed
+from keras.layers import Bidirectional, GRU
 import keras.backend as K
 import theano.tensor as T
 from keras.models import Sequential, Model, model_from_json
@@ -12,7 +13,7 @@ from keras.optimizers import Adam
 from keras.layers.normalization import BatchNormalization
 
 
-def build_model(vectors, shape, settings, use_rnn_encoding=False):
+def build_model(vectors, shape, settings):
     '''Compile the model.'''
     max_length, nr_hidden, nr_class = shape
     # Declare inputs.
@@ -21,8 +22,8 @@ def build_model(vectors, shape, settings, use_rnn_encoding=False):
 
     # Construct operations, which we'll chain together.
     embed = _StaticEmbedding(vectors, max_length, nr_hidden)
-    if use_rnn_encoding:
-        encode = _BiLSTMEncode(max_length, nr_hidden)
+    if settings['gru_encode']:
+        encode = _BiRNNEncoding(max_length, nr_hidden)
     attend = _Attention(max_length, nr_hidden)
     align = _SoftAlignment(max_length, nr_hidden)
     compare = _Comparison(max_length, nr_hidden)
@@ -32,7 +33,7 @@ def build_model(vectors, shape, settings, use_rnn_encoding=False):
     sent1 = embed(ids1) # Shape: (i, n)
     sent2 = embed(ids2) # Shape: (j, n)
 
-    if use_rnn_encoding:
+    if settings['gru_encode']:
         sent1 = encode(sent1)
         sent2 = encode(sent2)
 
@@ -78,15 +79,18 @@ class _StaticEmbedding(object):
     def __call__(self, sentence):
         return self.project(self.embed(sentence))
 
 
 class _BiRNNEncoding(object):
     def __init__(self, max_length, nr_out):
         self.model = Sequential()
-        self.model.add(Bidirectional(LSTM(nr_out, input_length=max_length)))
+        self.model.add(Bidirectional(GRU(int(nr_out/2), return_sequences=True),
+                                     input_shape=(max_length, nr_out)))
 
     def __call__(self, sentence):
         return self.model(sentence)
 
 
 class _Attention(object):
     def __init__(self, max_length, nr_hidden, dropout=0.0, L2=1e-4, activation='relu'):
         self.max_length = max_length
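Note: wrapping GRU(int(nr_out/2), return_sequences=True) in Bidirectional keeps the per-token width at nr_out, because the forward and backward outputs are concatenated (Keras' default merge mode). A shape sketch under that assumption:

    # input to encode():                     (batch, max_length, nr_out)   from _StaticEmbedding
    # each GRU direction, return_sequences:  (batch, max_length, nr_out/2)
    # Bidirectional concatenation:           (batch, max_length, nr_out)
    encode = _BiRNNEncoding(max_length, nr_hidden)
    sent1 = encode(sent1)   # same shape as before

Because the shape is unchanged, the optional encoder can be switched on without touching the downstream attention, alignment or comparison layers.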
@@ -1,4 +1,5 @@
 from keras.models import model_from_json
+import numpy
 
 
 class KerasSimilarityShim(object):
@@ -30,7 +31,7 @@ class KerasSimilarityShim(object):
         return scores[0]
 
 
-def get_embeddings(cls, vocab):
+def get_embeddings(vocab):
     max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
     vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
     for lex in vocab:
@@ -39,16 +40,24 @@ def get_embeddings(cls, vocab):
     return vectors
 
 
-def get_word_ids(docs, max_length=100):
+def get_word_ids(docs, tree_truncate=False, max_length=100):
     Xs = numpy.zeros((len(docs), max_length), dtype='int32')
     for i, doc in enumerate(docs):
         j = 0
-        for token in doc:
-            if token.has_vector and not token.is_punct and not token.is_space:
-                Xs[i, j] = token.rank + 1
-                j += 1
-                if j >= max_length:
-                    break
+        queue = [sent.root for sent in doc.sents]
+        words = []
+        while len(words) <= max_length and queue:
+            word = queue.pop(0)
+            if word.has_vector and not word.is_punct and not word.is_space:
+                words.append(word)
+            queue.extend(list(word.lefts))
+            queue.extend(list(word.rights))
+        words.sort()
+        for j, token in enumerate(words):
+            Xs[i, j] = token.rank + 1
+            j += 1
+            if j >= max_length:
+                break
     return Xs
 
 
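Note: the rewritten get_word_ids walks each sentence's dependency tree breadth-first from the root, so when a sentence is longer than max_length the words nearest the root are the ones kept; words.sort() then presumably restores document order before the IDs are written out. A traced toy example (hypothetical sentence, not from the commit):

    # "I like cats": root = like, like.lefts = [I], like.rights = [cats]
    # queue = [like],   words = []
    # pop like -> words = [like],          queue = [I, cats]
    # pop I    -> words = [like, I],       queue = [cats]
    # pop cats -> words = [like, I, cats], queue = []
    # Truncation at max_length therefore drops the deepest tokens first,
    # rather than whatever happens to fall at the end of the sentence.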
@@ -57,6 +66,3 @@ def create_similarity_pipeline(nlp):
         nlp.path / 'similarity',
         nlp,
         feature_extracter=get_features)]
-
-
-