Mirror of https://github.com/explosion/spaCy.git
commit 31b5e58aeb
parent e20106fdff

    Begin reorganizing neuralnet work

@@ -23,7 +23,7 @@ from spacy.gold import GoldParse
 from spacy.scorer import Scorer
 
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, get_templates
 from spacy._theano import TheanoModel
 
 import theano
 
@@ -40,76 +40,37 @@ theano.config.floatX = 'float32'
 floatX = theano.config.floatX
 
 
-def th_share(w, name=''):
-    return theano.shared(value=w, borrow=True, name=name)
-
-class Param(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        return [(self.curr, curr), (self.step, step)]
-
-
-class AdadeltaParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        # accu: accumulate gradient magnitudes
-        self.accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-        # delta_accu: accumulate update magnitudes (recursively!)
-        self.delta_accu = wrapper(numpy.zeros(numpy_data.shape, dtype=numpy_data.dtype))
-
-    def updates(self, cost, timestep, eps, rho):
-        # update accu (as in rmsprop)
-        grad = T.grad(cost, self.curr)
-        accu_new = rho * self.accu + (1 - rho) * grad ** 2
-
-        # compute parameter update, using the 'old' delta_accu
-        update = (grad * T.sqrt(self.delta_accu + eps) /
-                  T.sqrt(accu_new + eps))
-        # update delta_accu (as accu, but accumulating updates)
-        delta_accu_new = rho * self.delta_accu + (1 - rho) * update ** 2
-        return [(self.curr, self.curr - update), (self.accu, accu_new),
-                (self.delta_accu, delta_accu_new)]
-
-
-class AvgParam(object):
-    def __init__(self, numpy_data, name='?', wrapper=th_share):
-        self.curr = wrapper(numpy_data, name=name+'_curr')
-        self.avg = self.curr
-        self.avg = wrapper(numpy_data.copy(), name=name+'_avg')
-        self.step = wrapper(numpy.zeros(numpy_data.shape, numpy_data.dtype),
-                            name=name+'_step')
-
-    def updates(self, cost, timestep, eta, mu):
-        step = (mu * self.step) - T.grad(cost, self.curr)
-        curr = self.curr + (eta * step)
-        alpha = (1 / timestep).clip(0.001, 0.9).astype(floatX)
-        avg = ((1 - alpha) * self.avg) + (alpha * curr)
-        return [(self.curr, curr), (self.step, step), (self.avg, avg)]
-
-
-def feed_layer(activation, weights, bias, input_):
-    return activation(T.dot(input_, weights) + bias)
+def L1(L1_reg, *weights):
+    return L1_reg * sum(abs(w).sum() for w in weights)
 
 
 def L2(L2_reg, *weights):
     return L2_reg * sum((w ** 2).sum() for w in weights)
 
 
-def L1(L1_reg, *weights):
-    return L1_reg * sum(abs(w).sum() for w in weights)
+def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
+    updates = OrderedDict()
+    for param in params:
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape, dtype=value.dtype),
+                             broadcastable=param.broadcastable)
+
+        grad = T.grad(loss, param)
+        accu_new = rho * accu + (1 - rho) * grad ** 2
+        updates[accu] = accu_new
+        updates[param] = param - (eta * grad / T.sqrt(accu_new + eps))
+    return updates
 
 
 def relu(x):
     return x * (x > 0)
 
 
-def _init_weights(n_in, n_out):
+def feed_layer(activation, weights, bias, input_):
+    return activation(T.dot(input_, weights) + bias)
+
+
+def init_weights(n_in, n_out):
     rng = numpy.random.RandomState(1235)
 
     weights = numpy.asarray(
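The new `rms_prop` replaces the per-parameter update classes (`Param`, `AdadeltaParam`, `AvgParam`) with a single optimizer that returns a Theano updates mapping. A minimal NumPy sketch of the same update rule; the function name and toy objective are illustrative, not part of the commit:

```python
# Minimal NumPy sketch of the update rule rms_prop builds symbolically above.
import numpy as np

def rms_prop_step(param, grad, accu, eta=1.0, rho=0.9, eps=1e-6):
    # Leaky average of squared gradients, as in accu_new above.
    accu = rho * accu + (1 - rho) * grad ** 2
    # Scale the step by the root of the accumulated magnitude.
    return param - eta * grad / np.sqrt(accu + eps), accu

target = np.array([1.0, -2.0, 0.5])
w = np.zeros(3)
accu = np.zeros_like(w)
for _ in range(200):
    grad = 2 * (w - target)          # gradient of the toy quadratic |w - target|^2
    w, accu = rms_prop_step(w, grad, accu, eta=0.05)
print(w)                             # close to [1.0, -2.0, 0.5]
```

Because the squared-gradient accumulator normalizes each step, a single `eta` works across parameters with very different gradient scales, which is why the commit can drop the hand-tuned momentum machinery.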
@@ -117,57 +78,35 @@ def _init_weights(n_in, n_out):
         dtype=theano.config.floatX
     )
     bias = numpy.zeros((n_out,), dtype=theano.config.floatX)
-    return [AvgParam(weights, name='W'), AvgParam(bias, name='b')]
+    return [wrapper(weights, name='W'), wrapper(bias, name='b')]
 
 
-def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg):
-    costs = T.ivector('costs')
-    is_gold = T.ivector('is_gold')
+def compile_model(n_classes, n_hidden, n_in, optimizer):
     x = T.vector('x')
-    y = T.scalar('y')
-    y_cost = T.scalar('y_cost')
-    loss = T.scalar('cost')
-    timestep = theano.shared(1)
-    eta = T.scalar('eta').astype(floatX)
-    mu = T.scalar('mu').astype(floatX)
+    costs = T.ivector('costs')
+    loss = T.scalar('loss')
 
-    maxent_W, maxent_b = _init_weights(n_hidden, n_classes)
-    hidden_W, hidden_b = _init_weights(n_in, n_hidden)
+    maxent_W, maxent_b = init_weights(n_hidden, n_classes)
+    hidden_W, hidden_b = init_weights(n_in, n_hidden)
 
     # Feed the inputs forward through the network
     p_y_given_x = feed_layer(
         T.nnet.softmax,
-        maxent_W.curr,
-        maxent_b.curr,
+        maxent_W,
+        maxent_b,
         feed_layer(
             relu,
-            hidden_W.curr,
-            hidden_b.curr,
+            hidden_W,
+            hidden_b,
             x))
-    stabilizer = 1e-8
 
-
-    y_cost = costs[T.argmax(p_y_given_x[0])]
-
-    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + stabilizer)
+    loss = -T.log(T.sum(p_y_given_x[0] * T.eq(costs, 0)) + 1e-8)
 
-    debug = theano.function(
-        name='debug',
-        inputs=[x, costs],
-        outputs=[p_y_given_x, T.eq(costs, 0), p_y_given_x[0] * T.eq(costs, 0)],
-    )
 
     train_model = theano.function(
         name='train_model',
-        inputs=[x, costs, eta, mu],
-        outputs=[p_y_given_x[0], T.grad(loss, x), T.argmax(p_y_given_x, axis=1),
-                 loss],
-        updates=(
-            [(timestep, timestep + 1)] +
-            maxent_W.updates(loss, timestep, eta, mu) +
-            maxent_b.updates(loss, timestep, eta, mu) +
-            hidden_W.updates(loss, timestep, eta, mu) +
-            hidden_b.updates(loss, timestep, eta, mu)
-        ),
+        inputs=[x, costs],
+        outputs=[p_y_given_x[0], T.grad(loss, x), loss],
+        updates=optimizer(loss, [maxent_W, maxent_b, hidden_W, hidden_b]),
        on_unused_input='warn'
     )
 
@@ -177,18 +116,18 @@ def compile_theano_model(n_classes, n_hidden, n_in, L1_reg, L2_reg):
         outputs=[
             feed_layer(
                 T.nnet.softmax,
-                maxent_W.avg,
-                maxent_b.avg,
+                maxent_W,
+                maxent_b,
                 feed_layer(
                     relu,
-                    hidden_W.avg,
-                    hidden_b.avg,
+                    hidden_W,
+                    hidden_b,
                     x
                 )
             )[0]
         ]
     )
-    return debug, train_model, evaluate_model
+    return train_model, evaluate_model
 
 
 def score_model(scorer, nlp, annot_tuples, verbose=False):
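The `.avg` reads removed here are what made evaluation use Polyak-style averaged weights, maintained by the `AvgParam` class deleted above; after this commit, evaluation sees the raw parameters. A toy sketch of the averaging schedule `AvgParam.updates` used, with an invented stand-in for the gradient step:

```python
# Toy sketch of AvgParam's running average; alpha follows the
# clip(1/t, 0.001, 0.9) schedule from the removed class.
import numpy as np

curr = np.zeros(2)
avg = curr.copy()
for t in range(1, 201):
    curr = curr + np.array([0.01, -0.02])    # stand-in for an SGD step
    alpha = min(max(1.0 / t, 0.001), 0.9)
    avg = (1 - alpha) * avg + alpha * curr   # avg trails curr, smoothing noise
print(curr, avg)
```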
@@ -202,21 +141,6 @@ def score_model(scorer, nlp, annot_tuples, verbose=False):
 def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           eta=0.01, mu=0.9, nv_hidden=100, nv_word=10, nv_tag=10, nv_label=10,
           seed=0, n_sents=0, verbose=False):
-    def make_model(n_classes, (words, tags, labels), model_dir):
-        n_in = (nv_word * len(words)) + \
-               (nv_tag * len(tags)) + \
-               (nv_label * len(labels))
-        debug, train_func, predict_func = compile_theano_model(n_classes, nv_hidden,
-                                                               n_in, 0.0, 0.0)
-        return TheanoModel(
-            n_classes,
-            ((nv_word, words), (nv_tag, tags), (nv_label, labels)),
-            train_func,
-            predict_func,
-            model_loc=model_dir,
-            eta=eta, mu=mu,
-            debug=debug)
-
 
     dep_model_dir = path.join(model_dir, 'deps')
     pos_model_dir = path.join(model_dir, 'pos')
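The removed `make_model` computed the network's input width as the embedding width for each feature type times its number of feature slots. A quick worked example of that arithmetic; the 5/4/3 slot counts are invented for illustration:

```python
# Worked example of the removed n_in computation.
nv_word, nv_tag, nv_label = 10, 10, 10
words, tags, labels = range(5), range(4), range(3)
n_in = (nv_word * len(words)) + (nv_tag * len(tags)) + (nv_label * len(labels))
print(n_in)  # 50 + 40 + 30 = 120
```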
@@ -230,7 +154,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
 
     Config.write(dep_model_dir, 'config',
                  seed=seed,
-                 features=feat_set,
+                 templates=tuple(),
                  labels=Language.ParserTransitionSystem.get_labels(gold_tuples),
                  vector_lengths=(nv_word, nv_tag, nv_label),
                  hidden_nodes=nv_hidden,
@@ -238,13 +162,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                  mu=mu
     )
 
+    # Bake-in hyper-parameters
+    optimizer = lambda loss, params: rms_prop(loss, params, eta=eta, rho=rho, eps=eps)
+    nlp = Language(data_dir=model_dir)
+    n_classes = nlp.parser.model.n_classes
+    train, predict = compile_model(n_classes, nv_hidden, n_in, optimizer)
+    nlp.parser.model = TheanoModel(n_classes, input_spec, train,
+                                   predict, model_loc)
     if n_sents > 0:
         gold_tuples = gold_tuples[:n_sents]
 
-    nlp = Language(data_dir=model_dir)
-    nlp._parser = Parser(nlp.vocab.strings, dep_model_dir, nlp.ParserTransitionSystem,
-                         make_model)
-
     print "Itn.\tP.Loss\tUAS\tTag %\tToken %"
     log_loc = path.join(model_dir, 'job.log')
     for itn in range(n_iter):
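The new wiring bakes the hyper-parameters into `optimizer` with a lambda, so `compile_model` only ever sees a `(loss, params) -> updates` callable. Note that in this intermediate commit `rho`, `eps`, `n_in`, `input_spec` and `model_loc` are used in `train()` without being defined there, consistent with the "begin reorganizing" message. A hedged sketch of the bake-in pattern itself, with a stand-in body and assumed `rho`/`eps` defaults:

```python
# Sketch of partially applying hyper-parameters; the rho/eps values are assumptions.
from functools import partial

def rms_prop(loss, params, eta=1.0, rho=0.9, eps=1e-6):
    # Stand-in body: echo the baked-in settings instead of building Theano updates.
    return {'loss': loss, 'n_params': len(params), 'eta': eta, 'rho': rho, 'eps': eps}

optimizer = partial(rms_prop, eta=0.01, rho=0.9, eps=1e-6)
print(optimizer('loss_sym', ['maxent_W', 'maxent_b']))
# -> {'loss': 'loss_sym', 'n_params': 2, 'eta': 0.01, 'rho': 0.9, 'eps': 1e-06}
```

Either a `lambda` or `functools.partial` works here; the point is that the training script, not the model code, owns the hyper-parameters.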