Matthew Honnibal 2017-05-08 08:29:36 -05:00
commit bef89ef23d
5 changed files with 383 additions and 313 deletions

View File

@@ -18,6 +18,8 @@ import spacy.attrs
import io
from thinc.neural.ops import CupyOps
from thinc.neural import Model
from spacy.es import Spanish
from spacy.attrs import POS
try:
import cupy
@@ -156,20 +158,15 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
for tag in tags:
vocab.morphology.tag_map[tag] = {POS: tag.split('__', 1)[0]}
tagger = Tagger(vocab)
encoder = TokenVectorEncoder(vocab)
encoder = TokenVectorEncoder(vocab, width=64)
parser = DependencyParser(vocab, actions=actions, features=features, L1=0.0)
Xs, ys = organize_data(vocab, train_sents)
dev_Xs, dev_ys = organize_data(vocab, dev_sents)
#Xs = Xs[:1000]
#ys = ys[:1000]
#dev_Xs = dev_Xs[:1000]
#dev_ys = dev_ys[:1000]
with encoder.model.begin_training(Xs[:100], ys[:100]) as (trainer, optimizer):
docs = list(Xs)
for doc in docs:
encoder(doc)
parser.begin_training(docs, ys)
nn_loss = [0.]
def track_progress():
with encoder.tagger.use_params(optimizer.averages):
@@ -191,11 +188,23 @@ def main(lang_name, train_loc, dev_loc, model_dir, clusters_loc=None):
upd_tokvecs(d_tokvecs, sgd=optimizer)
encoder.update(docs, golds, sgd=optimizer)
nn_loss[-1] += loss
nlp = LangClass(vocab=vocab, tagger=tagger, parser=parser)
#nlp.end_training(model_dir)
scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
nlp = LangClass(vocab=vocab, parser=parser)
scorer = score_model(vocab, encoder, parser, read_conllx(dev_loc))
print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
#nlp.end_training(model_dir)
#scorer = score_model(vocab, tagger, parser, read_conllx(dev_loc))
#print('%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.las, scorer.tags_acc))
if __name__ == '__main__':
import cProfile
import pstats
if 1:
plac.call(main)
else:
cProfile.runctx("plac.call(main)", globals(), locals(), "Profile.prof")
s = pstats.Stats("Profile.prof")
s.strip_dirs().sort_stats("time").print_stats()
plac.call(main)

View File

@@ -7,8 +7,125 @@ from thinc.neural._classes.static_vectors import StaticVectors
from thinc.neural._classes.batchnorm import BatchNorm
from thinc.neural._classes.resnet import Residual
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP
import numpy
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nO, obj.nF, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
b=Biases("Bias vector",
lambda obj: (obj.nO,)),
d_W=Gradient("W"),
d_b=Gradient("b")
)
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nI = nI
self.nF = nF
def begin_update(self, X, drop=0.):
# X: (b, i)
# Xf: (b, f, i)
# dY: (b, o)
# dYf: (b, f, o)
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
Yf += self.b
def backward(dY_ids, sgd=None):
dY, ids = dY_ids
Xf = X[ids]
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
db = dY.sum(axis=0)
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
self.d_W += dW
self.d_b += db
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return Yf, backward
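As a sanity check on the shapes documented in the comments above, here is a minimal numpy sketch (not part of the diff; sizes are made up) showing that the tensordot-plus-transpose used in begin_update reproduces the einsum it replaces:

import numpy as np

b, f, o, i = 4, 3, 5, 6                     # batch, features, outputs, inputs
X = np.random.rand(b, i)
W = np.random.rand(o, f, i)

Y_einsum = np.einsum('bi,ofi->bfo', X, W)   # the commented-out formulation
Y_tensordot = np.tensordot(X, W, axes=[[1], [2]]).transpose((0, 2, 1))
assert np.allclose(Y_einsum, Y_tensordot)   # both give the (b, f, o) feature table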
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nP=Dimension("Number of pieces"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
b=Biases("Bias vector",
lambda obj: (obj.nO, obj.nP)),
d_W=Gradient("W"),
d_b=Gradient("b")
)
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, pieces=2, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = pieces
self.nI = nI
self.nF = nF
def begin_update(self, X, drop=0.):
# X: (b, i)
# Yfp: (f, b, o, p)
# Yf: (f, b, o)
# Xf: (b, f, i)
# dY: (b, o)
# dYp: (b, o, p)
# W: (f, o, p, i)
# b: (o, p)
# Yfp = numpy.einsum('bi,fopi->fbop', X, self.W)
Yfp = self.ops.xp.tensordot(X, self.W,
axes=[[1], [3]]).transpose((1, 0, 2, 3))
Yfp = self.ops.xp.ascontiguousarray(Yfp)
Yfp += self.b
Yf = self.ops.allocate((self.nF, X.shape[0], self.nO))
which = self.ops.allocate((self.nF, X.shape[0], self.nO), dtype='i')
for i in range(self.nF):
Yf[i], which[i] = self.ops.maxout(Yfp[i])
def backward(dY_ids, sgd=None):
dY, ids = dY_ids
Xf = X[ids]
dYp = self.ops.allocate((dY.shape[0], self.nO, self.nP))
for i in range(self.nF):
dYp += self.ops.backprop_maxout(dY, which[i], self.nP)
#dXf = numpy.einsum('bop,fopi->bfi', dYp, self.W)
dXf = self.ops.xp.tensordot(dYp, self.W, axes=[[1,2], [1,2]])
#dW = numpy.einsum('bfi,bop->fopi', Xf, dYp)
dW = self.ops.xp.tensordot(Xf, dYp, axes=[[0], [0]])
dW = dW.transpose((0, 2, 3, 1))
db = dYp.sum(axis=0)
self.d_W += dW
self.d_b += db
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return Yf, backward
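Likewise for the maxout variant, a small numpy sketch (illustrative only, not from the commit) of the shape bookkeeping: the tensordot builds the (f, b, o, p) table, and the maxout step keeps the best of the p pieces per output unit, which is roughly what ops.maxout does for each feature slot above:

import numpy as np

b, f, o, p, i = 4, 3, 5, 2, 6
X = np.random.rand(b, i)
W = np.random.rand(f, o, p, i)

Yfp_einsum = np.einsum('bi,fopi->fbop', X, W)
Yfp = np.tensordot(X, W, axes=[[1], [3]]).transpose((1, 0, 2, 3))
assert np.allclose(Yfp_einsum, Yfp)

Yf = Yfp.max(axis=-1)        # (f, b, o): best piece per output unit
which = Yfp.argmax(axis=-1)  # saved so the backward pass can route gradients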
def get_col(idx):
def forward(X, drop=0.):
@@ -22,55 +139,36 @@ def get_col(idx):
return layerize(forward)
def build_tok2vec(lang, width, depth=2, embed_size=1000):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}):
#static = get_col(cols.index(ID)) >> StaticVectors(lang, width)
lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width//4, embed_size)
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width//4, embed_size)
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width//4, embed_size)
tok2vec = (
doc2feats(cols)
>> with_flatten(
#(static | prefix | suffix | shape)
(lower | prefix | suffix | shape)
>> Maxout(width)
>> (ExtractWindow(nW=1) >> Maxout(width, width*3))
>> (ExtractWindow(nW=1) >> Maxout(width, width*3))
)
)
return tok2vec
def doc2feats(cols):
def forward(docs, drop=0.):
feats = [doc.to_array(cols) for doc in docs]
feats = [model.ops.asarray(f, dtype='uint64') for f in feats]
return feats, None
def build_model(state2vec, width, depth, nr_class):
with Model.define_operators({'>>': chain, '**': clone}):
model = (
state2vec
>> Maxout(width, 1344)
>> Maxout(width, width)
>> Affine(nr_class, width)
)
return model
def build_debug_model(state2vec, width, depth, nr_class):
with Model.define_operators({'>>': chain, '**': clone}):
model = (
state2vec
#>> Maxout(width)
>> Maxout(nr_class)
)
return model
def build_debug_state2vec(width, nr_vector=1000, nF=1, nB=0, nS=1, nL=2, nR=2):
ops = Model.ops
def forward(tokens_attrs_vectors, drop=0.):
tokens, attr_vals, tokvecs = tokens_attrs_vectors
orig_tokvecs_shape = tokvecs.shape
tokvecs = tokvecs.reshape((tokvecs.shape[0], tokvecs.shape[1] *
tokvecs.shape[2]))
vector = tokvecs
def backward(d_vector, sgd=None):
d_tokvecs = vector.reshape(orig_tokvecs_shape)
return (tokens, d_tokvecs)
return vector, backward
model = layerize(forward)
return model
def build_state2vec(nr_context_tokens, width, nr_vector=1000):
ops = Model.ops
with Model.define_operators({'|': concatenate, '+': add, '>>': chain}):
#hiddens = [get_col(i) >> Maxout(width) for i in range(nr_context_tokens)]
features = [get_col(i) for i in range(nr_context_tokens)]
model = get_token_vectors >> concatenate(*features) >> ReLu(width)
return model
def print_shape(prefix):
def forward(X, drop=0.):
return X, lambda dX, **kwargs: dX
@@ -86,87 +184,6 @@ def get_token_vectors(tokens_attrs_vectors, drop=0.):
return vectors, backward
def build_parser_state2vec(width, nr_vector=1000, nF=1, nB=0, nS=1, nL=2, nR=2):
embed_tags = _reshape(chain(get_col(0), HashEmbed(16, nr_vector)))
embed_deps = _reshape(chain(get_col(1), HashEmbed(16, nr_vector)))
ops = embed_tags.ops
def forward(tokens_attrs_vectors, drop=0.):
tokens, attr_vals, tokvecs = tokens_attrs_vectors
tagvecs, bp_tagvecs = embed_deps.begin_update(attr_vals, drop=drop)
depvecs, bp_depvecs = embed_tags.begin_update(attr_vals, drop=drop)
orig_tokvecs_shape = tokvecs.shape
tokvecs = tokvecs.reshape((tokvecs.shape[0], tokvecs.shape[1] *
tokvecs.shape[2]))
shapes = (tagvecs.shape, depvecs.shape, tokvecs.shape)
assert tagvecs.shape[0] == depvecs.shape[0] == tokvecs.shape[0], shapes
vector = ops.xp.hstack((tagvecs, depvecs, tokvecs))
def backward(d_vector, sgd=None):
d_tagvecs, d_depvecs, d_tokvecs = backprop_concatenate(d_vector, shapes)
assert d_tagvecs.shape == shapes[0], (d_tagvecs.shape, shapes)
assert d_depvecs.shape == shapes[1], (d_depvecs.shape, shapes)
assert d_tokvecs.shape == shapes[2], (d_tokvecs.shape, shapes)
bp_tagvecs(d_tagvecs)
bp_depvecs(d_depvecs)
d_tokvecs = d_tokvecs.reshape(orig_tokvecs_shape)
return (tokens, d_tokvecs)
return vector, backward
model = layerize(forward)
model._layers = [embed_tags, embed_deps]
return model
def backprop_concatenate(gradient, shapes):
grads = []
start = 0
for shape in shapes:
end = start + shape[1]
grads.append(gradient[:, start : end])
start = end
return grads
def _reshape(layer):
'''Transforms input with shape
(states, tokens, features)
into input with shape:
(states * tokens, features)
So that it can be used with a token-wise feature extraction layer, e.g.
an embedding layer. The embedding layer outputs:
(states * tokens, ndim)
But we want to concatenate the vectors for the tokens, so we produce:
(states, tokens * ndim)
We then need to reverse the transforms to do the backward pass. Recall
the simple rule here: each layer is a map:
inputs -> (outputs, (d_outputs->d_inputs))
So the shapes must match like this:
shape of forward input == shape of backward output
shape of backward input == shape of forward output
'''
def forward(X__bfm, drop=0.):
b, f, m = X__bfm.shape
B = b*f
M = f*m
X__Bm = X__bfm.reshape((B, m))
y__Bn, bp_yBn = layer.begin_update(X__Bm, drop=drop)
n = y__Bn.shape[1]
N = f * n
y__bN = y__Bn.reshape((b, N))
def backward(dy__bN, sgd=None):
dy__Bn = dy__bN.reshape((B, n))
dX__Bm = bp_yBn(dy__Bn, sgd)
if dX__Bm is None:
return None
else:
return dX__Bm.reshape((b, f, m))
return y__bN, backward
model = layerize(forward)
model._layers.append(layer)
return model
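A short numpy sketch (not in the diff, shapes are illustrative) of the contract the _reshape docstring describes, i.e. backward input shape == forward output shape and backward output shape == forward input shape:

import numpy as np

b, f, m, n = 2, 3, 4, 5                  # states, tokens, input dim, wrapped layer's output dim
X = np.random.rand(b, f, m)              # forward input: (states, tokens, features)
X_flat = X.reshape((b * f, m))           # what the wrapped token-wise layer sees
Y_flat = np.random.rand(b * f, n)        # stand-in for that layer's output
Y = Y_flat.reshape((b, f * n))           # forward output: (states, tokens * ndim)

d_Y = np.ones_like(Y)                    # backward input: same shape as forward output
d_X_flat = np.random.rand(b * f, m)      # stand-in for the wrapped layer's gradient
d_X = d_X_flat.reshape((b, f, m))        # backward output: same shape as forward input
assert d_Y.shape == Y.shape and d_X.shape == X.shape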
@layerize
def flatten(seqs, drop=0.):
ops = Model.ops
@@ -177,32 +194,44 @@ def flatten(seqs, drop=0.):
return X, finish_update
def build_tok2vec(lang, width, depth=2, embed_size=1000):
cols = [ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG]
with Model.define_operators({'>>': chain, '|': concatenate, '**': clone}):
#static = get_col(cols.index(ID)) >> StaticVectors(lang, width)
lower = get_col(cols.index(LOWER)) >> HashEmbed(width, embed_size)
prefix = get_col(cols.index(PREFIX)) >> HashEmbed(width, embed_size)
suffix = get_col(cols.index(SUFFIX)) >> HashEmbed(width, embed_size)
shape = get_col(cols.index(SHAPE)) >> HashEmbed(width, embed_size)
tok2vec = (
doc2feats(cols)
>> with_flatten(
#(static | prefix | suffix | shape)
(lower | prefix | suffix | shape)
>> Maxout(width, width*4)
>> Residual((ExtractWindow(nW=1) >> Maxout(width, width*3)))
>> Residual((ExtractWindow(nW=1) >> Maxout(width, width*3)))
>> Residual((ExtractWindow(nW=1) >> Maxout(width, width*3)))
)
)
return tok2vec
#def build_feature_precomputer(model, feat_maps):
# '''Allow a model to be "primed" by pre-computing input features in bulk.
#
# This is used for the parser, where we want to take a batch of documents,
# and compute vectors for each (token, position) pair. These vectors can then
# be reused, especially for beam-search.
#
# Let's say we're using 12 features for each state, e.g. word at start of
# buffer, three words on stack, their children, etc. In the normal arc-eager
# system, a document of length N is processed in 2*N states. This means we'll
# create 2*N*12 feature vectors --- but if we pre-compute, we only need
# N*12 vector computations. The saving for beam-search is much better:
# if we have a beam of k, we'll normally make 2*N*12*K computations --
# so we can save the factor k. This also gives a nice CPU/GPU division:
# we can do all our hard maths up front, packed into large multiplications,
# and do the hard-to-program parsing on the CPU.
# '''
# def precompute(input_vectors):
# cached, backprops = zip(*[lyr.begin_update(input_vectors)
# for lyr in feat_maps)
# def forward(batch_token_ids, drop=0.):
# output = ops.allocate((batch_size, output_width))
# # i: batch index
# # j: position index (i.e. N0, S0, etc.)
# # tok_i: Index of the token within its document
# for i, token_ids in enumerate(batch_token_ids):
# for j, tok_i in enumerate(token_ids):
# output[i] += cached[j][tok_i]
# def backward(d_vector, sgd=None):
# d_inputs = ops.allocate((batch_size, n_feat, vec_width))
# for i, token_ids in enumerate(batch_token_ids):
# for j in range(len(token_ids)):
# d_inputs[i][j] = backprops[j](d_vector, sgd)
# # Return the IDs, so caller can associate to correct token
# return (batch_token_ids, d_inputs)
# return vector, backward
# return chain(layerize(forward), model)
# return precompute
#
#
def doc2feats(cols):
def forward(docs, drop=0.):
feats = [doc.to_array(cols) for doc in docs]
feats = [model.ops.asarray(f, dtype='uint64') for f in feats]
return feats, None
model = layerize(forward)
return model
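To make the commented-out precomputation idea above concrete: once per-token contributions are cached for every feature slot, building a state's input is only a gather and a sum. A rough numpy sketch (made-up sizes, not the diff's API):

import numpy as np

n_tokens, n_feat, width = 10, 12, 8
cached = np.random.rand(n_feat, n_tokens, width)   # precomputed once per document batch

# One parser state: which token fills each of the 12 feature slots (-1 = empty slot).
token_ids = np.array([0, 3, 5, -1, 2, 7, -1, 1, 4, 6, 8, 9])

state_vector = np.zeros(width)
for j, tok_i in enumerate(token_ids):
    if tok_i >= 0:
        state_vector += cached[j, tok_i]           # lookup + sum, no per-state matrix multiply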

View File

@@ -23,7 +23,7 @@ class TokenVectorEncoder(object):
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
def __init__(self, vocab, **cfg):
self.vocab = vocab
self.model = build_tok2vec(vocab.lang, 64, **cfg)
self.model = build_tok2vec(vocab.lang, **cfg)
self.tagger = chain(
self.model,
flatten,

View File

@@ -13,5 +13,6 @@ cdef class Parser:
cdef readonly object model
cdef readonly TransitionSystem moves
cdef readonly object cfg
cdef public object feature_maps
#cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil

View File

@@ -28,8 +28,11 @@ from murmurhash.mrmr cimport hash64
from preshed.maps cimport MapStruct
from preshed.maps cimport map_get
from numpy import exp
from thinc.api import layerize, chain
from thinc.neural import Model, Maxout
from .._ml import PrecomputableAffine, PrecomputableMaxouts
from . import _parse_features
from ._parse_features cimport CONTEXT_SIZE
from ._parse_features cimport fill_context
@@ -44,10 +47,9 @@ from ..strings cimport StringStore
from ..gold cimport GoldParse
from ..attrs cimport TAG, DEP
from .._ml import build_parser_state2vec, build_model
from .._ml import build_state2vec, build_model
from .._ml import build_debug_state2vec, build_debug_model
def get_templates(*args, **kwargs):
return []
USE_FTRL = True
DEBUG = False
@@ -56,8 +58,109 @@ def set_debug(val):
DEBUG = val
def get_templates(*args, **kwargs):
return []
def get_greedy_model_for_batch(tokvecs, TransitionSystem moves, upper_model, lower_model):
cdef int[:, :] is_valid_
cdef float[:, :] costs_
lengths = [len(t) for t in tokvecs]
tokvecs = upper_model.ops.flatten(tokvecs)
is_valid = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='i')
costs = upper_model.ops.allocate((len(tokvecs), moves.n_moves), dtype='f')
token_ids = upper_model.ops.allocate((len(tokvecs), lower_model.nF), dtype='i')
cached, bp_features = lower_model.begin_update(tokvecs, drop=0.)
is_valid_ = is_valid
costs_ = costs
def forward(states_offsets, drop=0.):
nonlocal is_valid, costs, token_ids, moves
states, offsets = states_offsets
assert len(states) != 0
is_valid = is_valid[:len(states)]
costs = costs[:len(states)]
token_ids = token_ids[:len(states)]
is_valid = is_valid[:len(states)]
cdef StateClass state
cdef int i
for i, (offset, state) in enumerate(zip(offsets, states)):
state.set_context_tokens(token_ids[i])
moves.set_valid(&is_valid_[i, 0], state.c)
adjusted_ids = token_ids.copy()
for i, offset in enumerate(offsets):
adjusted_ids[i] *= token_ids[i] >= 0
adjusted_ids[i] += offset
features = upper_model.ops.allocate((len(states), lower_model.nO), dtype='f')
for i in range(len(states)):
for j, tok_i in enumerate(adjusted_ids[i]):
if tok_i >= 0:
features[i] += cached[j, tok_i]
scores, bp_scores = upper_model.begin_update(features, drop=drop)
scores = upper_model.ops.relu(scores)
softmaxed = upper_model.ops.softmax(scores)
# Renormalize for invalid actions
softmaxed *= is_valid
totals = softmaxed.sum(axis=1)
for total in totals:
assert total > 0, (totals, scores, softmaxed)
assert total <= 1.1, totals
softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))
def backward(golds, sgd=None):
nonlocal costs_, is_valid_, moves
cdef int i
for i, (state, gold) in enumerate(zip(states, golds)):
moves.set_costs(&is_valid_[i, 0], &costs_[i, 0],
state, gold)
d_scores = scores.copy()
d_scores.fill(0)
set_log_loss(upper_model.ops, d_scores,
scores, is_valid, costs)
upper_model.ops.backprop_relu(d_scores, scores, inplace=True)
d_features = bp_scores(d_scores, sgd)
d_tokens = bp_features((d_features, adjusted_ids), sgd)
return (token_ids, d_tokens)
return softmaxed, backward
return layerize(forward)
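The forward pass above masks and renormalises the softmax so that invalid transitions get zero probability. A small numpy illustration of that step (sizes invented, outside the diff):

import numpy as np

scores = np.random.rand(3, 4)                   # 3 states, 4 possible actions
is_valid = np.array([[1, 1, 0, 1],
                     [1, 0, 0, 1],
                     [1, 1, 1, 1]], dtype='f')

probs = np.exp(scores - scores.max(axis=1, keepdims=True))
probs /= probs.sum(axis=1, keepdims=True)       # ordinary softmax
probs *= is_valid                               # zero out invalid actions
probs /= probs.sum(axis=1, keepdims=True)       # renormalise over the valid ones
assert np.allclose(probs.sum(axis=1), 1.0)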
def set_log_loss(ops, gradients, scores, is_valid, costs):
"""Do multi-label log loss"""
n = gradients.shape[0]
scores = scores * is_valid
g_scores = scores * is_valid * (costs <= 0.)
exps = ops.xp.exp(scores - scores.max(axis=1).reshape((n, 1)))
exps *= is_valid
g_exps = ops.xp.exp(g_scores - g_scores.max(axis=1).reshape((n, 1)))
g_exps *= costs <= 0.
g_exps *= is_valid
gradients[:] = exps / exps.sum(axis=1).reshape((n, 1))
gradients -= g_exps / g_exps.sum(axis=1).reshape((n, 1))
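A hand-worked example (outside the diff) of the gradient set_log_loss produces: a softmax over the valid actions minus a softmax over the valid, zero-cost actions, so probability mass is pushed from costly moves onto gold-consistent ones:

import numpy as np

scores   = np.array([[2.0, 1.0, 0.5]])   # one state, three actions
is_valid = np.array([[1., 1., 0.]])      # the third action cannot be taken here
costs    = np.array([[1., 0., 0.]])      # only the second action is gold-consistent

p_valid = np.exp(scores) * is_valid
p_valid /= p_valid.sum()
p_gold = np.exp(scores) * is_valid * (costs <= 0.)
p_gold /= p_gold.sum()
grad = p_valid - p_gold                  # ~[0.73, -0.73, 0.]: push mass toward the gold action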
def transition_batch(TransitionSystem moves, states, scores):
cdef StateClass state
cdef int guess
for state, guess in zip(states, scores.argmax(axis=1)):
action = moves.c[guess]
action.do(state.c, action.label)
def init_states(TransitionSystem moves, docs):
cdef Doc doc
cdef StateClass state
offsets = []
states = []
offset = 0
for i, doc in enumerate(docs):
state = StateClass.init(doc.c, doc.length)
moves.initialize_state(state.c)
states.append(state)
offsets.append(offset)
offset += len(doc)
return states, offsets
cdef class Parser:
@@ -107,8 +210,9 @@ cdef class Parser:
cfg['actions'] = TransitionSystem.get_actions(**cfg)
self.moves = TransitionSystem(vocab.strings, cfg['actions'])
if model is None:
model = self.build_model(**cfg)
self.model = model
self.model, self.feature_maps = self.build_model(**cfg)
else:
self.model, self.feature_maps = model
self.cfg = cfg
def __reduce__(self):
@@ -116,10 +220,10 @@
def build_model(self, width=128, nr_vector=1000, nF=1, nB=1, nS=1, nL=1, nR=1, **_):
nr_context_tokens = StateClass.nr_context_tokens(nF, nB, nS, nL, nR)
state2vec = build_state2vec(nr_context_tokens, width, nr_vector)
#state2vec = build_debug_state2vec(width, nr_vector)
model = build_debug_model(state2vec, width*2, 2, self.moves.n_moves)
return model
upper = chain(Maxout(width, width), Maxout(self.moves.n_moves, width))
lower = PrecomputableMaxouts(width, nF=nr_context_tokens, nI=width*2)
return upper, lower
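A rough numpy sketch (invented sizes, not the diff's API) of why the model is split this way: the expensive multiply in the lower model runs once per token, while the upper model only has to score a gathered, summed vector per state:

import numpy as np

n_tokens, width, n_feat, n_moves = 50, 8, 13, 20
tokvecs = np.random.rand(n_tokens, width * 2)

W_lower = np.random.rand(width * 2, n_feat * width)
cached = (tokvecs @ W_lower).reshape((n_tokens, n_feat, width))    # once per token

slot_token_ids = np.arange(n_feat)                                 # pretend slot j looks at token j
state_vec = cached[slot_token_ids, np.arange(n_feat)].sum(axis=0)  # gather + sum per state
W_upper = np.random.rand(width, n_moves)
scores = state_vec @ W_upper                                       # cheap per-state scoring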
def __call__(self, Doc tokens):
"""
@@ -131,7 +235,6 @@
None
"""
self.parse_batch([tokens])
self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
"""
@@ -167,169 +270,53 @@
yield doc
def parse_batch(self, docs):
cdef Doc doc
cdef StateClass state
model = get_greedy_model_for_batch([d.tensor for d in docs],
self.moves, self.model, self.feature_maps)
states, offsets = init_states(self.moves, docs)
all_states = list(states)
todo = list(zip(states, offsets))
while todo:
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
if not todo:
break
states, offsets = zip(*todo)
scores = model((states, offsets))
transition_batch(self.moves, states, scores)
todo = [st for st in todo if not st[0].py_is_final()]
for state, doc in zip(all_states, docs):
self.moves.finalize_state(state.c)
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
for doc in docs:
self.moves.finalize_doc(doc)
def parse_batch(self, docs):
states = self._init_states(docs)
nr_class = self.moves.n_moves
cdef Doc doc
cdef StateClass state
cdef int guess
tokvecs = [d.tensor for d in docs]
all_states = list(states)
todo = zip(states, tokvecs)
while todo:
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
if not todo:
break
states, tokvecs = zip(*todo)
scores, _ = self._begin_update(states, tokvecs)
self._transition_batch(states, docs, scores)
for state, doc in zip(all_states, docs):
self.moves.finalize_state(state.c)
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
def begin_training(self, docs, golds):
for gold in golds:
self.moves.preprocess_gold(gold)
states = self._init_states(docs)
tokvecs = [d.tensor for d in docs]
d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
nr_class = self.moves.n_moves
costs = self.model.ops.allocate((len(docs), nr_class), dtype='f')
gradients = self.model.ops.allocate((len(docs), nr_class), dtype='f')
is_valid = self.model.ops.allocate((len(docs), nr_class), dtype='i')
attr_names = numpy.zeros((2,), dtype='i')
attr_names[0] = TAG
attr_names[1] = DEP
features = self._get_features(states, tokvecs, attr_names)
self.model.begin_training(features)
def update(self, docs, golds, drop=0., sgd=None):
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
return self.update([docs], [golds], drop=drop)
for gold in golds:
self.moves.preprocess_gold(gold)
model = get_greedy_model_for_batch([d.tensor for d in docs],
self.moves, self.model, self.feature_maps)
states, offsets = init_states(self.moves, docs)
d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
output = list(d_tokens)
todo = zip(states, offsets, golds, d_tokens)
while todo:
# Get unfinished states (and their matching gold and token gradients)
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
if not todo:
break
states, offsets, golds, d_tokens = zip(*todo)
scores, finish_update = model.begin_update((states, offsets))
(token_ids, d_state_features) = finish_update(golds, sgd=sgd)
for i, token_ids in enumerate(token_ids):
d_tokens[i][token_ids] += d_state_features[i]
transition_batch(self.moves, states, scores)
return output
def update(self, docs, golds, drop=0., sgd=None):
if isinstance(docs, Doc) and isinstance(golds, GoldParse):
return self.update([docs], [golds], drop=drop)
for gold in golds:
self.moves.preprocess_gold(gold)
states = self._init_states(docs)
tokvecs = [d.tensor for d in docs]
d_tokens = [self.model.ops.allocate(d.tensor.shape) for d in docs]
nr_class = self.moves.n_moves
output = list(d_tokens)
todo = zip(states, tokvecs, golds, d_tokens)
assert len(states) == len(todo)
losses = []
while todo:
# Get unfinished states (and their matching gold and token gradients)
todo = filter(lambda sp: not sp[0].py_is_final(), todo)
if not todo:
break
states, tokvecs, golds, d_tokens = zip(*todo)
scores, finish_update = self._begin_update(states, tokvecs)
token_ids, batch_token_grads = finish_update(golds, sgd=sgd, losses=losses,
force_gold=False)
batch_token_grads *= (token_ids >= 0).reshape((token_ids.shape[0], token_ids.shape[1], 1))
token_ids *= token_ids >= 0
if hasattr(self.model.ops.xp, 'scatter_add'):
for i, tok_ids in enumerate(token_ids):
self.model.ops.xp.scatter_add(d_tokens[i],
tok_ids, batch_token_grads[i])
else:
for i, tok_ids in enumerate(token_ids):
self.model.ops.xp.add.at(d_tokens[i],
tok_ids, batch_token_grads[i])
self._transition_batch(states, docs, scores)
return output, sum(losses)
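The scatter_add / add.at branch above exists because the same token can fill several feature slots: unbuffered accumulation adds every contribution, where plain fancy-index assignment would keep only one. A small numpy check (not from the commit):

import numpy as np

d_tokens = np.zeros((5, 3))               # one gradient row per token in the document
tok_ids = np.array([0, 2, 2, 4])          # token 2 appears in two feature slots
grads = np.ones((4, 3))

np.add.at(d_tokens, tok_ids, grads)       # cupy's scatter_add plays the same role on GPU
assert d_tokens[2, 0] == 2.0              # both contributions were accumulated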
def _begin_update(self, states, tokvecs, drop=0.):
nr_class = self.moves.n_moves
attr_names = numpy.zeros((2,), dtype='i')
attr_names[0] = TAG
attr_names[1] = DEP
features = self._get_features(states, tokvecs, attr_names)
scores, finish_update = self.model.begin_update(features, drop=drop)
assert scores.shape[0] == len(states), (len(states), scores.shape)
assert len(scores.shape) == 2
is_valid = self.model.ops.allocate((len(states), nr_class), dtype='i')
self._validate_batch(is_valid, states)
softmaxed = self.model.ops.softmax(scores)
softmaxed *= is_valid
softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))
def backward(golds, sgd=None, losses=[], force_gold=False):
nonlocal softmaxed
costs = self.model.ops.allocate((len(states), nr_class), dtype='f')
d_scores = self.model.ops.allocate((len(states), nr_class), dtype='f')
self._cost_batch(costs, is_valid, states, golds)
self._set_gradient(d_scores, scores, is_valid, costs)
losses.append(self.model.ops.xp.abs(d_scores).sum())
if force_gold:
softmaxed *= costs <= 0
return finish_update(d_scores, sgd=sgd)
return softmaxed, backward
def _init_states(self, docs):
states = []
cdef Doc doc
cdef StateClass state
for i, doc in enumerate(docs):
state = StateClass.init(doc.c, doc.length)
self.moves.initialize_state(state.c)
states.append(state)
return states
def _get_features(self, states, all_tokvecs, attr_names,
nF=1, nB=0, nS=2, nL=2, nR=2):
n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
vector_length = all_tokvecs[0].shape[1]
cpu_tokens = numpy.zeros((len(states), n_tokens), dtype='int32')
features = numpy.zeros((len(states), n_tokens, attr_names.shape[0]), dtype='uint64')
tokvecs = self.model.ops.allocate((len(states), n_tokens, vector_length), dtype='f')
for i, state in enumerate(states):
state.set_context_tokens(cpu_tokens[i], nF, nB, nS, nL, nR)
for i in range(len(states)):
for j, tok_i in enumerate(cpu_tokens[i]):
if tok_i >= 0:
tokvecs[i, j] = all_tokvecs[i][tok_i]
return (cpu_tokens, self.model.ops.asarray(features), tokvecs)
def _validate_batch(self, int[:, ::1] is_valid, states):
cdef StateClass state
cdef int i
for i, state in enumerate(states):
self.moves.set_valid(&is_valid[i, 0], state.c)
def _cost_batch(self, float[:, ::1] costs, int[:, ::1] is_valid,
states, golds):
cdef int i
cdef StateClass state
cdef GoldParse gold
for i, (state, gold) in enumerate(zip(states, golds)):
self.moves.set_costs(&is_valid[i, 0], &costs[i, 0], state, gold)
def _transition_batch(self, states, docs, scores):
cdef StateClass state
cdef int guess
for state, doc, guess in zip(states, docs, scores.argmax(axis=1)):
action = self.moves.c[guess]
orths = [t.lex.orth for t in state.c._sent[:state.c.length]]
words = [doc.vocab.strings[w] for w in orths]
if not action.is_valid(state.c, action.label):
ValueError("Invalid action", scores)
action.do(state.c, action.label)
def _set_gradient(self, gradients, scores, is_valid, costs):
"""Do multi-label log loss"""
cdef double Z, gZ, max_, g_max
n = gradients.shape[0]
scores = scores * is_valid
g_scores = scores * is_valid * (costs <= 0.)
exps = self.model.ops.xp.exp(scores - scores.max(axis=1).reshape((n, 1)))
exps *= is_valid
g_exps = self.model.ops.xp.exp(g_scores - g_scores.max(axis=1).reshape((n, 1)))
g_exps *= costs <= 0.
g_exps *= is_valid
gradients[:] = exps / exps.sum(axis=1).reshape((n, 1))
gradients -= g_exps / g_exps.sum(axis=1).reshape((n, 1))
def step_through(self, Doc doc, GoldParse gold=None):
"""
@@ -366,6 +353,50 @@
self.cfg.setdefault('extra_labels', []).append(label)
def _begin_update(self, model, states, tokvecs, drop=0.):
nr_class = self.moves.n_moves
attr_names = self.model.ops.allocate((2,), dtype='i')
attr_names[0] = TAG
attr_names[1] = DEP
features = self._get_features(states, tokvecs, attr_names)
scores, finish_update = self.model.begin_update(features, drop=drop)
assert scores.shape[0] == len(states), (len(states), scores.shape)
assert len(scores.shape) == 2
is_valid = self.model.ops.allocate((len(states), nr_class), dtype='i')
self._validate_batch(is_valid, states)
softmaxed = self.model.ops.softmax(scores)
softmaxed *= is_valid
softmaxed /= softmaxed.sum(axis=1).reshape((softmaxed.shape[0], 1))
def backward(golds, sgd=None, losses=[], force_gold=False):
nonlocal softmaxed
costs = self.model.ops.allocate((len(states), nr_class), dtype='f')
d_scores = self.model.ops.allocate((len(states), nr_class), dtype='f')
self._cost_batch(costs, is_valid, states, golds)
self._set_gradient(d_scores, scores, is_valid, costs)
losses.append(numpy.abs(d_scores).sum())
if force_gold:
softmaxed *= costs <= 0
return finish_update(d_scores, sgd=sgd)
return softmaxed, backward
def _get_features(self, states, all_tokvecs, attr_names,
nF=1, nB=0, nS=2, nL=2, nR=2):
n_tokens = states[0].nr_context_tokens(nF, nB, nS, nL, nR)
vector_length = all_tokvecs[0].shape[1]
tokens = self.model.ops.allocate((len(states), n_tokens), dtype='int32')
features = self.model.ops.allocate((len(states), n_tokens, attr_names.shape[0]), dtype='uint64')
tokvecs = self.model.ops.allocate((len(states), n_tokens, vector_length), dtype='f')
for i, state in enumerate(states):
state.set_context_tokens(tokens[i], nF, nB, nS, nL, nR)
state.set_attributes(features[i], tokens[i], attr_names)
state.set_token_vectors(tokvecs[i], all_tokvecs[i], tokens[i])
return (tokens, features, tokvecs)
cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
if prob <= 0 or prob >= 1.:
return 0