WIP refactor parser

Matthew Honnibal 2021-01-25 23:22:10 +11:00
parent b456929bfd
commit 267ffb5605


@@ -18,8 +18,9 @@ from ..pipeline._parser_internals.stateclass cimport StateClass
cdef WeightsC get_c_weights(model) except *:
cdef WeightsC output
cdef precompute_hiddens state2vec = model.state2vec
cdef np.ndarray bias = state2vec.bias
output.feat_weights = state2vec.get_feat_weights()
output.feat_bias = <const float*>state2vec.bias.data
output.feat_bias = <const float*>bias.data
cdef np.ndarray vec2scores_W
cdef np.ndarray vec2scores_b
if model.vec2scores is None:
@@ -220,27 +221,23 @@ class ParserStepModel(Model):
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, train=train)
self.state2vec = precompute_hiddens(
len(docs),
self.tokvecs,
layers[1],
activation=activation,
train=train
)
if has_upper:
self.vec2scores = layers[-1]
else:
self.vec2scores = None
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
self.backprops = []
self._class_mask = numpy.zeros((self.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
self._class_mask[class_] = 0.
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
@property
def nO(self):
if self.attrs["has_upper"]:
@@ -248,6 +245,13 @@ class ParserStepModel(Model):
else:
return self.state2vec.get_dim("nO")
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
def class_is_unseen(self, class_):
return self._class_mask[class_]
@@ -269,54 +273,22 @@ class ParserStepModel(Model):
c_ids += ids.shape[1]
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
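(The padding comment above refers to a common trick: missing or padded token ids point at one extra row appended to the token-vector gradient, so their contributions land in a throwaway row that is sliced off before backprop. A minimal numpy sketch with illustrative names, not spaCy's own calls:

import numpy

n_tokens, width = 4, 3
d_tokvecs = numpy.zeros((n_tokens + 1, width), dtype="f")   # extra padding row at the end
ids = numpy.array([0, 2, n_tokens, 1], dtype="i")           # n_tokens is the padding id
d_feats = numpy.ones((ids.size, width), dtype="f")
# scatter_add: accumulate each feature gradient into its token's row
numpy.add.at(d_tokvecs, ids, d_feats)
d_real = d_tokvecs[:-1]   # drop the padding row before backproping into tok2vec
)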
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
def step_forward(model: ParserStepModel, token_ids, is_train):
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
vec2scores = ensure_same_device(model.ops, model.vec2scores)
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
mask = model.ops.get_dropout_mask(vector.shape, dropout_rate)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
scores, get_d_vector = vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
scores = vector
get_d_vector = lambda d_scores: d_scores
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
scores[:, model._class_mask == 0] = model.ops.xp.nanmin(scores)
def backprop_parser_step(d_scores):
# Zero vectors for unseen classes
@@ -324,11 +296,18 @@ def step_forward(model: ParserStepModel, states, is_train):
d_vector = get_d_vector(d_scores)
if mask is not None:
d_vector *= mask
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
return None
return get_d_tokvecs(d_vector)
return scores, backprop_parser_step
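(Two maskings happen in step_forward above: a dropout mask, now drawn from model.ops rather than NumpyOps, is multiplied into the hidden vector during training, and the class mask pushes unseen classes down to the batch minimum so they can never win the argmax. A rough numpy illustration with made-up shapes, not the thinc ops calls themselves:

import numpy

rng = numpy.random.default_rng(0)
vector = rng.normal(size=(2, 8)).astype("f")             # hidden vectors for two states
dropout_rate = 0.1
mask = (rng.random(vector.shape) >= dropout_rate) / (1.0 - dropout_rate)
vector *= mask.astype("f")                               # the same mask is reused in backprop_parser_step

scores = rng.normal(size=(2, 5)).astype("f")             # stand-in for vec2scores(vector)
class_mask = numpy.ones((5,), dtype="f")
class_mask[[1, 3]] = 0.0                                 # classes flagged as unseen
scores[:, class_mask == 0] = numpy.nanmin(scores)        # unseen classes can never be predicted
)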
def ensure_same_device(ops, model):
"""Ensure a model is on the same device as a given ops"""
if not isinstance(model.ops, ops.__class__):
model._to_ops(ops)
return model
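(For illustration, a tiny usage sketch of the ensure_same_device helper defined above, assuming thinc's Linear and NumpyOps; the call is a no-op when the layer already uses a compatible ops:

from thinc.api import Linear, NumpyOps

ops = NumpyOps()
layer = Linear(nO=4, nI=8)
layer.initialize(X=ops.alloc2f(2, 8))
layer = ensure_same_device(ops, layer)   # already on CPU, so nothing is moved
assert isinstance(layer.ops, type(ops))
)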
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
@@ -347,31 +326,23 @@ cdef class precompute_hiddens:
and do the hard-to-program parsing on the CPU.
"""
cdef readonly int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef public object numpy_ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
cdef object _cuda_stream
cdef object _bp_hiddens
cdef object activation
cdef readonly object bias
cdef readonly object activation
cdef readonly object _features
cdef readonly object _cached
cdef readonly object _bp_hiddens
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
activation="maxout", train=False):
gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
# cupy make the copy asynchronously.
# We then have to block before first use.
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
else:
self.bias = lower_model.get_param("b")
def __init__(
self,
batch_size,
tokvecs,
lower_model,
activation="maxout",
train=False
):
cached, bp_features = lower_model(tokvecs, train)
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
if lower_model.has_dim("nP"):
self.nP = lower_model.get_dim("nP")
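(The deleted comment a few lines up describes the pattern this refactor drops: start the device-to-host copy on a side CUDA stream and synchronize before first use. Roughly, with CuPy — a hedged sketch that needs a GPU, and whose exact asynchrony depends on the CuPy version:

import cupy
import numpy

stream = cupy.cuda.Stream(non_blocking=True)
gpu_cached = cupy.zeros((10, 13, 64), dtype="f")
cached = gpu_cached.get(stream=stream)   # copy is enqueued on the side stream
# ... other work can overlap with the transfer ...
stream.synchronize()                     # block before the first CPU read
assert isinstance(cached, numpy.ndarray)
)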
@@ -379,19 +350,18 @@ cdef class precompute_hiddens:
self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.numpy_ops = NumpyOps()
assert activation in (None, "relu", "maxout")
self.activation = activation
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
cdef np.ndarray cached
if isinstance(self._cached, numpy.ndarray):
cached = self._cached
else:
cached = self._cached.get()
return <float*>cached.data
def has_dim(self, name):
if name == "nF":
@@ -433,57 +403,25 @@ cdef class precompute_hiddens:
return self.begin_update(X)[0]
def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
nO = self.nO
nP = self.nP
hidden = self.model.ops.alloc2f(
token_ids.shape[0],
nO * nP
)
bp_hiddens = self._bp_hiddens
feat_weights = self.cached
self.ops.scatter_add(
hidden,
feat_weights,
token_ids
)
hidden += self.bias
statevec, mask = self.ops.maxout(hidden.reshape((-1, nO, nP)))
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
d_state_vector = bp_nonlinearity(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
def _nonlinearity(self, state_vector):
if self.activation == "maxout":
return self._maxout_nonlinearity(state_vector)
else:
return self._relu_nonlinearity(state_vector)
def _maxout_nonlinearity(self, state_vector):
state_vector, mask = self.numpy_ops.maxout(state_vector)
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_maxout(d_best):
return self.ops.backprop_maxout(d_best, mask, self.nP)
def backward(d_statevec):
return bp_hiddens(
self.ops.backprop_maxout(d_statevec, mask, nP)
)
return state_vector, backprop_maxout
def _relu_nonlinearity(self, state_vector):
state_vector = state_vector.reshape((state_vector.shape[0], -1))
mask = state_vector >= 0.
state_vector *= mask
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_relu(d_best):
d_best *= mask
return d_best.reshape((d_best.shape + (1,)))
return state_vector, backprop_relu
return statevec, backward
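
(To make the trick described in the precompute_hiddens docstring concrete: the lower model runs once over all token vectors, caching each token's contribution to every feature slot; per parser state, the hidden layer is then just a gather of nF cached rows, a bias add, and a maxout over nP pieces. A minimal numpy sketch with made-up sizes, illustrative only and not the spaCy/thinc API:

import numpy

n_tokens, nF, nO, nP = 10, 13, 64, 3
# cached[i, f] holds the lower layer's contribution of token i in feature slot f
cached = numpy.random.rand(n_tokens, nF, nO * nP).astype("f")
bias = numpy.random.rand(nO * nP).astype("f")

token_ids = numpy.random.randint(0, n_tokens, size=(2, nF))    # one row of feature ids per state
hidden = cached[token_ids, numpy.arange(nF)].sum(axis=1) + bias
pieces = hidden.reshape((-1, nO, nP))
state_vector = pieces.max(axis=-1)      # maxout: keep the best of the nP pieces
which = pieces.argmax(axis=-1)          # index of the winning piece, kept for the backward pass
)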