mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
147 lines
4.3 KiB
Cython
147 lines
4.3 KiB
Cython
|
"""Feed-forward neural network, using Theano."""
|
||
|
|
||
|
import os
|
||
|
import sys
|
||
|
import time
|
||
|
|
||
|
import numpy
|
||
|
|
||
|
import theano
|
||
|
import theano.tensor as T
|
||
|
import plac
|
||
|
|
||
|
from spacy.gold import read_json_file
|
||
|
from spacy.gold import GoldParse
|
||
|
from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir
|
||
|
|
||
|
|
||
|
def build_model(n_classes, n_vocab, n_hidden, n_word_embed, n_tag_embed,
                learning_rate=0.01, L1_reg=0.0, L2_reg=0.0001,
                n_context_words=1, n_context_tags=1):
    """Build the Theano training and evaluation functions for the tagger.

    The network looks up one embedding per context word and per context tag,
    concatenates them into a single input vector, passes that through a tanh
    hidden layer and then a softmax output layer.

    Args:
        n_classes: Number of output classes. Also used as the size of the
            tag embedding table.
        n_vocab: Size of the word embedding table.
        n_hidden: Width of the hidden layer.
        n_word_embed: Dimensionality of each word embedding.
        n_tag_embed: Dimensionality of each tag embedding.
        learning_rate: Step size for the gradient-descent updates.
        L1_reg: Coefficient of the L1 penalty.
        L2_reg: Coefficient of the L2 penalty.
        n_context_words: Number of word ids supplied per example. The hidden
            layer's input width is fixed from this, so every call must pass
            exactly this many word ids.
        n_context_tags: Number of tag ids supplied per example (same
            constraint as n_context_words).

    Returns:
        A pair (train_model, evaluate_model). Both are compiled functions
        taking (words, tags, y): two integer id vectors and the integer gold
        class. train_model applies one gradient step and returns the
        predicted class; evaluate_model returns 1 if the prediction differs
        from y and 0 otherwise, without updating any parameter.
    """
    # Symbolic inputs. The ids index rows of the embedding tables, so they
    # have to be integer typed.
    words = T.lvector('words')
    tags = T.lvector('tags')
    y = T.lscalar('y')

    word_e = _init_embedding(n_vocab, n_word_embed)
    tag_e = _init_embedding(n_classes, n_tag_embed)
    maxent_W, maxent_b = _init_maxent_weights(n_hidden, n_classes)
    # The hidden layer consumes the concatenation of all looked-up rows.
    n_in = n_context_words * n_word_embed + n_context_tags * n_tag_embed
    hidden_W, hidden_b = _init_hidden_weights(n_in, n_hidden, T.tanh)
    # Only parameters the cost actually depends on: T.grad raises for a
    # variable that is disconnected from the cost.
    params = [hidden_W, hidden_b, maxent_W, maxent_b, word_e, tag_e]

    x = T.concatenate([
        T.flatten(word_e[words], outdim=1),
        T.flatten(tag_e[tags], outdim=1)])

    hidden = feed_layer(T.tanh, hidden_W, hidden_b, x)
    # The softmax result is indexed at row 0 to recover a probability vector
    # for the single example.
    p_y_given_x = feed_layer(T.nnet.softmax, maxent_W, maxent_b, hidden)[0]

    guess = T.argmax(p_y_given_x)

    # Negative log-likelihood of the gold class plus the penalties. The
    # penalties are applied two weights at a time, which is the arity the
    # L1/L2 helpers are declared with.
    cost = (
        -T.log(p_y_given_x[y])
        + L1(L1_reg, maxent_W, hidden_W)
        + L1(L1_reg, word_e, tag_e)
        + L2(L2_reg, maxent_W, hidden_W)
        + L2(L2_reg, word_e, tag_e)
    )

    train_model = theano.function(
        inputs=[words, tags, y],
        outputs=guess,
        updates=[update(learning_rate, param, cost) for param in params]
    )

    evaluate_model = theano.function(
        inputs=[words, tags, y],
        outputs=T.neq(y, guess),
    )
    return train_model, evaluate_model
|
||
|
|
||
|
|
||
|
def _init_embedding(vocab_size, n_dim):
    """Create a randomly initialised embedding table as a shared variable.

    Args:
        vocab_size: Number of entries; the table gets vocab_size + 1 rows.
            NOTE(review): the extra row is presumably reserved for an
            out-of-vocabulary or padding id -- confirm against callers.
        n_dim: Width of each embedding row.

    Returns:
        A Theano shared variable of shape (vocab_size + 1, n_dim) in
        theano.config.floatX, with values drawn uniformly from [-0.2, 0.2).
    """
    embedding = 0.2 * numpy.random.uniform(-1.0, 1.0, (vocab_size + 1, n_dim))
    # Cast the numpy array *before* wrapping it. Casting the shared variable
    # instead can yield a symbolic cast expression rather than a shared
    # variable, and only shared variables may be targets of `updates`.
    return theano.shared(embedding.astype(theano.config.floatX))
|
||
|
|
||
|
|
||
|
def _init_maxent_weights(n_hidden, n_out):
    """Create zero-initialised weights and bias for the output layer.

    Args:
        n_hidden: Number of input units (rows of the weight matrix).
        n_out: Number of output classes (columns of the weight matrix and
            length of the bias vector).

    Returns:
        A pair of Theano shared variables (W, b) with shapes
        (n_hidden, n_out) and (n_out,), both in theano.config.floatX.
    """
    # Size both arrays from n_out rather than a fixed class count, so the
    # layer matches whatever number of classes the caller asked for.
    weights = numpy.zeros((n_hidden, n_out), dtype=theano.config.floatX)
    bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
    return (
        theano.shared(name='W', borrow=True, value=weights),
        theano.shared(name='b', borrow=True, value=bias)
    )
|
||
|
|
||
|
|
||
|
def _init_hidden_weights(n_in, n_out, activation=T.tanh):
    """Create the hidden layer's weight matrix and bias as shared variables.

    Weights are drawn uniformly from [-b, b) with b = sqrt(6 / (n_in + n_out)),
    using a generator seeded with 1234, so every call with the same sizes
    produces the same values. The bias starts at zero.

    Args:
        n_in: Number of input units.
        n_out: Number of hidden units.
        activation: Accepted for interface compatibility; this function does
            not read it.

    Returns:
        A pair of Theano shared variables (W, b) with shapes (n_in, n_out)
        and (n_out,), both in theano.config.floatX.
    """
    bound = numpy.sqrt(6. / (n_in + n_out))
    generator = numpy.random.RandomState(1234)
    w_values = numpy.asarray(
        generator.uniform(low=-bound, high=bound, size=(n_in, n_out)),
        dtype=theano.config.floatX)
    b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)

    shared_w = theano.shared(value=w_values, name='W', borrow=True)
    shared_b = theano.shared(value=b_values, name='b', borrow=True)
    return (shared_w, shared_b)
|
||
|
|
||
|
|
||
|
def feed_layer(activation, weights, bias, input):
    """Apply one dense layer: activation(input . weights + bias).

    Args:
        activation: Callable applied to the affine result.
        weights: Weight tensor, right-multiplied onto input.
        bias: Bias tensor added after the product.
        input: Input tensor for the layer.

    Returns:
        Whatever activation returns for the affine result.
    """
    pre_activation = T.dot(input, weights) + bias
    return activation(pre_activation)
|
||
|
|
||
|
|
||
|
def L1(L1_reg, w1, w2, *more_weights):
    """Return the L1 penalty: L1_reg times the summed absolute weights.

    Args:
        L1_reg: Regularisation coefficient.
        w1: First weight tensor.
        w2: Second weight tensor.
        *more_weights: Any further weight tensors to include in the penalty.

    Returns:
        L1_reg * (sum over all given tensors of the sum of |w|).
    """
    total = abs(w1).sum() + abs(w2).sum()
    for weights in more_weights:
        total = total + abs(weights).sum()
    return L1_reg * total
|
||
|
|
||
|
|
||
|
def L2(L2_reg, w1, w2, *more_weights):
    """Return the L2 penalty: L2_reg times the summed squared weights.

    Args:
        L2_reg: Regularisation coefficient.
        w1: First weight tensor.
        w2: Second weight tensor.
        *more_weights: Any further weight tensors to include in the penalty.

    Returns:
        L2_reg * (sum over all given tensors of the sum of w ** 2).
    """
    total = (w1 ** 2).sum() + (w2 ** 2).sum()
    for weights in more_weights:
        total = total + (weights ** 2).sum()
    return L2_reg * total
|
||
|
|
||
|
|
||
|
def update(eta, param, cost):
    """Build one gradient-descent update pair for a parameter.

    Args:
        eta: Learning rate.
        param: The variable to update.
        cost: Symbolic cost to differentiate with respect to param.

    Returns:
        A (param, new_value) tuple where new_value is
        param - eta * d(cost)/d(param).
    """
    gradient = T.grad(cost, param)
    new_value = param - (eta * gradient)
    return (param, new_value)
|
||
|
|
||
|
|
||
|
def main(train_loc, eval_loc, model_dir):
    """Train the feed-forward tagger on gold-standard data.

    Reads the training file, recreates the 'pos' directory under model_dir,
    builds the network and runs a fixed number of training epochs, printing
    the number of wrong guesses after each epoch.

    Args:
        train_loc: Path handed to read_json_file for the training data.
        eval_loc: Path of the evaluation data. Currently unused: the
            validation pass has not been implemented yet.
        model_dir: Directory under which the 'pos' model directory is
            (re)created. Any existing 'pos' directory there is deleted.
    """
    # Imported locally because the module-level imports only provide `os`.
    import shutil
    # NOTE(review): the training loop needs a pipeline object exposing
    # `.tokenizer.tokens_from_list`; spacy.en.English is assumed to be it
    # -- confirm the constructor arguments against the installed spaCy.
    from spacy.en import English

    # build_model's declared signature takes only the five size arguments
    # below; the learning rate (0.01) and the L1 (0.00) / L2 (0.0001)
    # coefficients this script trains with are not part of that signature,
    # so they are not passed from here.
    # NOTE(review): the sizes and epoch count below are placeholder values;
    # tune them for the task.
    n_epochs = 10
    n_hidden = 100
    n_word_embed = 50
    n_tag_embed = 10

    print("... reading the data")
    gold_train = list(read_json_file(train_loc))

    print('... building the model')
    pos_model_dir = os.path.join(model_dir, 'pos')
    if os.path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(pos_model_dir)

    tag_names = sorted(POS_TAGS.keys())
    setup_model_dir(tag_names, POS_TAGS, POS_TEMPLATES, pos_model_dir)
    nlp = English(data_dir=model_dir)

    # Class id of a gold tag = its position in the sorted tag list, the same
    # ordering that is handed to setup_model_dir.
    tag_ids = dict((tag, i) for i, tag in enumerate(tag_names))

    # Tokenise once up front: the examples do not change between epochs, and
    # the largest word id is needed to size the word embedding table.
    # NOTE(review): assumes `tags` is parallel to `words` and that every gold
    # tag is a key of POS_TAGS (a KeyError is raised otherwise) -- confirm.
    # NOTE(review): t.tag is fed as the context-tag id exactly as it comes
    # off the token; verify it is a valid row of the tag embedding table.
    examples = []
    for raw_text, sents in gold_train:
        for (ids, words, tags, ner, heads, deps), _ in sents:
            tokens = nlp.tokenizer.tokens_from_list(words)
            for t, gold_tag in zip(tokens, tags):
                examples.append((t.orth, t.tag, tag_ids[gold_tag]))
    n_vocab = max(orth for orth, _, _ in examples) if examples else 0

    train_model, evaluate_model = build_model(
        len(POS_TAGS), n_vocab, n_hidden, n_word_embed, n_tag_embed)

    print('... training')
    for epoch in range(1, n_epochs + 1):
        loss = 0
        for orth, tag, gold in examples:
            guess = train_model([orth], [tag], gold)
            loss += int(guess != gold)
        print(loss)
        # TODO: compute zero-one loss on the validation set (eval_loc) with
        # evaluate_model and report it per epoch, e.g.
        # 'epoch %i, validation error %f %%' % (epoch, error * 100)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
    # plac derives the command-line interface from main's signature.
    plac.call(main)
|