mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-14 10:12:22 +03:00
WIP refactor parser
This commit is contained in:
parent
b456929bfd
commit
267ffb5605
|
@ -18,8 +18,9 @@ from ..pipeline._parser_internals.stateclass cimport StateClass
|
||||||
cdef WeightsC get_c_weights(model) except *:
|
cdef WeightsC get_c_weights(model) except *:
|
||||||
cdef WeightsC output
|
cdef WeightsC output
|
||||||
cdef precompute_hiddens state2vec = model.state2vec
|
cdef precompute_hiddens state2vec = model.state2vec
|
||||||
|
cdef np.ndarray bias = state2vec.bias
|
||||||
output.feat_weights = state2vec.get_feat_weights()
|
output.feat_weights = state2vec.get_feat_weights()
|
||||||
output.feat_bias = <const float*>state2vec.bias.data
|
output.feat_bias = <const float*>bias.data
|
||||||
cdef np.ndarray vec2scores_W
|
cdef np.ndarray vec2scores_W
|
||||||
cdef np.ndarray vec2scores_b
|
cdef np.ndarray vec2scores_b
|
||||||
if model.vec2scores is None:
|
if model.vec2scores is None:
|
||||||
|
@ -220,27 +221,23 @@ class ParserStepModel(Model):
|
||||||
activation = None
|
activation = None
|
||||||
else:
|
else:
|
||||||
activation = "relu"
|
activation = "relu"
|
||||||
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
|
self.state2vec = precompute_hiddens(
|
||||||
activation=activation, train=train)
|
len(docs),
|
||||||
|
self.tokvecs,
|
||||||
|
layers[1],
|
||||||
|
activation=activation,
|
||||||
|
train=train
|
||||||
|
)
|
||||||
if has_upper:
|
if has_upper:
|
||||||
self.vec2scores = layers[-1]
|
self.vec2scores = layers[-1]
|
||||||
else:
|
else:
|
||||||
self.vec2scores = None
|
self.vec2scores = None
|
||||||
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
|
|
||||||
self.backprops = []
|
|
||||||
self._class_mask = numpy.zeros((self.nO,), dtype='f')
|
self._class_mask = numpy.zeros((self.nO,), dtype='f')
|
||||||
self._class_mask.fill(1)
|
self._class_mask.fill(1)
|
||||||
if unseen_classes is not None:
|
if unseen_classes is not None:
|
||||||
for class_ in unseen_classes:
|
for class_ in unseen_classes:
|
||||||
self._class_mask[class_] = 0.
|
self._class_mask[class_] = 0.
|
||||||
|
|
||||||
def clear_memory(self):
|
|
||||||
del self.tokvecs
|
|
||||||
del self.bp_tokvecs
|
|
||||||
del self.state2vec
|
|
||||||
del self.backprops
|
|
||||||
del self._class_mask
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def nO(self):
|
def nO(self):
|
||||||
if self.attrs["has_upper"]:
|
if self.attrs["has_upper"]:
|
||||||
|
@ -248,6 +245,13 @@ class ParserStepModel(Model):
|
||||||
else:
|
else:
|
||||||
return self.state2vec.get_dim("nO")
|
return self.state2vec.get_dim("nO")
|
||||||
|
|
||||||
|
def clear_memory(self):
|
||||||
|
del self.tokvecs
|
||||||
|
del self.bp_tokvecs
|
||||||
|
del self.state2vec
|
||||||
|
del self.backprops
|
||||||
|
del self._class_mask
|
||||||
|
|
||||||
def class_is_unseen(self, class_):
|
def class_is_unseen(self, class_):
|
||||||
return self._class_mask[class_]
|
return self._class_mask[class_]
|
||||||
|
|
||||||
|
@ -269,54 +273,22 @@ class ParserStepModel(Model):
|
||||||
c_ids += ids.shape[1]
|
c_ids += ids.shape[1]
|
||||||
return ids
|
return ids
|
||||||
|
|
||||||
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
|
||||||
if isinstance(self.state2vec.ops, CupyOps) \
|
|
||||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
|
||||||
# Move token_ids and d_vector to GPU, asynchronously
|
|
||||||
self.backprops.append((
|
|
||||||
util.get_async(self.cuda_stream, token_ids),
|
|
||||||
util.get_async(self.cuda_stream, d_vector),
|
|
||||||
get_d_tokvecs
|
|
||||||
))
|
|
||||||
else:
|
|
||||||
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
|
||||||
|
|
||||||
|
def step_forward(model: ParserStepModel, token_ids, is_train):
|
||||||
def finish_steps(self, golds):
|
|
||||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
|
||||||
# values don't affect the real gradient.
|
|
||||||
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
|
|
||||||
# Tells CUDA to block, so our async copies complete.
|
|
||||||
if self.cuda_stream is not None:
|
|
||||||
self.cuda_stream.synchronize()
|
|
||||||
for ids, d_vector, bp_vector in self.backprops:
|
|
||||||
d_state_features = bp_vector((d_vector, ids))
|
|
||||||
ids = ids.flatten()
|
|
||||||
d_state_features = d_state_features.reshape(
|
|
||||||
(ids.size, d_state_features.shape[2]))
|
|
||||||
self.ops.scatter_add(d_tokvecs, ids,
|
|
||||||
d_state_features)
|
|
||||||
# Padded -- see update()
|
|
||||||
self.bp_tokvecs(d_tokvecs[:-1])
|
|
||||||
return d_tokvecs
|
|
||||||
|
|
||||||
NUMPY_OPS = NumpyOps()
|
|
||||||
|
|
||||||
def step_forward(model: ParserStepModel, states, is_train):
|
|
||||||
token_ids = model.get_token_ids(states)
|
|
||||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||||
mask = None
|
mask = None
|
||||||
if model.attrs["has_upper"]:
|
if model.attrs["has_upper"]:
|
||||||
|
vec2scores = ensure_same_device(model.ops, model.vec2scores)
|
||||||
dropout_rate = model.attrs["dropout_rate"]
|
dropout_rate = model.attrs["dropout_rate"]
|
||||||
if is_train and dropout_rate > 0:
|
if is_train and dropout_rate > 0:
|
||||||
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
|
mask = model.ops.get_dropout_mask(vector.shape, dropout_rate)
|
||||||
vector *= mask
|
vector *= mask
|
||||||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
scores, get_d_vector = vec2scores(vector, is_train)
|
||||||
else:
|
else:
|
||||||
scores = NumpyOps().asarray(vector)
|
scores = vector
|
||||||
get_d_vector = lambda d_scores: d_scores
|
get_d_vector = lambda d_scores: d_scores
|
||||||
# If the class is unseen, make sure its score is minimum
|
# If the class is unseen, make sure its score is minimum
|
||||||
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
|
scores[:, model._class_mask == 0] = model.ops.xp.nanmin(scores)
|
||||||
|
|
||||||
def backprop_parser_step(d_scores):
|
def backprop_parser_step(d_scores):
|
||||||
# Zero vectors for unseen classes
|
# Zero vectors for unseen classes
|
||||||
|
@ -324,11 +296,18 @@ def step_forward(model: ParserStepModel, states, is_train):
|
||||||
d_vector = get_d_vector(d_scores)
|
d_vector = get_d_vector(d_scores)
|
||||||
if mask is not None:
|
if mask is not None:
|
||||||
d_vector *= mask
|
d_vector *= mask
|
||||||
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
|
return get_d_tokvecs(d_vector)
|
||||||
return None
|
|
||||||
return scores, backprop_parser_step
|
return scores, backprop_parser_step
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_same_device(ops, model):
|
||||||
|
"""Ensure a model is on the same device as a given ops"""
|
||||||
|
if not isinstance(model.ops, ops.__class__):
|
||||||
|
model._to_ops(ops)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
cdef class precompute_hiddens:
|
cdef class precompute_hiddens:
|
||||||
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||||
|
|
||||||
|
@ -347,31 +326,23 @@ cdef class precompute_hiddens:
|
||||||
and do the hard-to-program parsing on the CPU.
|
and do the hard-to-program parsing on the CPU.
|
||||||
"""
|
"""
|
||||||
cdef readonly int nF, nO, nP
|
cdef readonly int nF, nO, nP
|
||||||
cdef bint _is_synchronized
|
|
||||||
cdef public object ops
|
cdef public object ops
|
||||||
cdef public object numpy_ops
|
cdef readonly object bias
|
||||||
cdef np.ndarray _features
|
cdef readonly object activation
|
||||||
cdef np.ndarray _cached
|
cdef readonly object _features
|
||||||
cdef np.ndarray bias
|
cdef readonly object _cached
|
||||||
cdef object _cuda_stream
|
cdef readonly object _bp_hiddens
|
||||||
cdef object _bp_hiddens
|
|
||||||
cdef object activation
|
|
||||||
|
|
||||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
def __init__(
|
||||||
activation="maxout", train=False):
|
self,
|
||||||
gpu_cached, bp_features = lower_model(tokvecs, train)
|
batch_size,
|
||||||
cdef np.ndarray cached
|
tokvecs,
|
||||||
if not isinstance(gpu_cached, numpy.ndarray):
|
lower_model,
|
||||||
# Note the passing of cuda_stream here: it lets
|
activation="maxout",
|
||||||
# cupy make the copy asynchronously.
|
train=False
|
||||||
# We then have to block before first use.
|
):
|
||||||
cached = gpu_cached.get(stream=cuda_stream)
|
cached, bp_features = lower_model(tokvecs, train)
|
||||||
else:
|
self.bias = lower_model.get_param("b")
|
||||||
cached = gpu_cached
|
|
||||||
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
|
|
||||||
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
|
|
||||||
else:
|
|
||||||
self.bias = lower_model.get_param("b")
|
|
||||||
self.nF = cached.shape[1]
|
self.nF = cached.shape[1]
|
||||||
if lower_model.has_dim("nP"):
|
if lower_model.has_dim("nP"):
|
||||||
self.nP = lower_model.get_dim("nP")
|
self.nP = lower_model.get_dim("nP")
|
||||||
|
@ -379,19 +350,18 @@ cdef class precompute_hiddens:
|
||||||
self.nP = 1
|
self.nP = 1
|
||||||
self.nO = cached.shape[2]
|
self.nO = cached.shape[2]
|
||||||
self.ops = lower_model.ops
|
self.ops = lower_model.ops
|
||||||
self.numpy_ops = NumpyOps()
|
|
||||||
assert activation in (None, "relu", "maxout")
|
assert activation in (None, "relu", "maxout")
|
||||||
self.activation = activation
|
self.activation = activation
|
||||||
self._is_synchronized = False
|
|
||||||
self._cuda_stream = cuda_stream
|
|
||||||
self._cached = cached
|
self._cached = cached
|
||||||
self._bp_hiddens = bp_features
|
self._bp_hiddens = bp_features
|
||||||
|
|
||||||
cdef const float* get_feat_weights(self) except NULL:
|
cdef const float* get_feat_weights(self) except NULL:
|
||||||
if not self._is_synchronized and self._cuda_stream is not None:
|
cdef np.ndarray cached
|
||||||
self._cuda_stream.synchronize()
|
if isinstance(self._cached, numpy.ndarray):
|
||||||
self._is_synchronized = True
|
cached = self._cached
|
||||||
return <float*>self._cached.data
|
else:
|
||||||
|
cached = self._cached.get()
|
||||||
|
return <float*>cached.data
|
||||||
|
|
||||||
def has_dim(self, name):
|
def has_dim(self, name):
|
||||||
if name == "nF":
|
if name == "nF":
|
||||||
|
@ -433,57 +403,25 @@ cdef class precompute_hiddens:
|
||||||
return self.begin_update(X)[0]
|
return self.begin_update(X)[0]
|
||||||
|
|
||||||
def begin_update(self, token_ids):
|
def begin_update(self, token_ids):
|
||||||
cdef np.ndarray state_vector = numpy.zeros(
|
nO = self.nO
|
||||||
(token_ids.shape[0], self.nO, self.nP), dtype='f')
|
nP = self.nP
|
||||||
# This is tricky, but (assuming GPU available);
|
hidden = self.model.ops.alloc2f(
|
||||||
# - Input to forward on CPU
|
token_ids.shape[0],
|
||||||
# - Output from forward on CPU
|
nO * nP
|
||||||
# - Input to backward on GPU!
|
)
|
||||||
# - Output from backward on GPU
|
|
||||||
bp_hiddens = self._bp_hiddens
|
bp_hiddens = self._bp_hiddens
|
||||||
|
feat_weights = self.cached
|
||||||
|
self.ops.scatter_add(
|
||||||
|
hidden,
|
||||||
|
feat_weights,
|
||||||
|
token_ids
|
||||||
|
)
|
||||||
|
hidden += self.bias
|
||||||
|
statevec, mask = self.ops.maxout(hidden.reshape((-1, nO, nP)))
|
||||||
|
|
||||||
feat_weights = self.get_feat_weights()
|
def backward(d_statevec):
|
||||||
cdef int[:, ::1] ids = token_ids
|
return bp_hiddens(
|
||||||
sum_state_features(<float*>state_vector.data,
|
self.ops.backprop_maxout(d_statevec, mask, nP)
|
||||||
feat_weights, &ids[0,0],
|
)
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
|
||||||
state_vector += self.bias
|
|
||||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
|
||||||
|
|
||||||
def backward(d_state_vector_ids):
|
|
||||||
d_state_vector, token_ids = d_state_vector_ids
|
|
||||||
d_state_vector = bp_nonlinearity(d_state_vector)
|
|
||||||
d_tokens = bp_hiddens((d_state_vector, token_ids))
|
|
||||||
return d_tokens
|
|
||||||
return state_vector, backward
|
|
||||||
|
|
||||||
def _nonlinearity(self, state_vector):
|
|
||||||
if self.activation == "maxout":
|
|
||||||
return self._maxout_nonlinearity(state_vector)
|
|
||||||
else:
|
|
||||||
return self._relu_nonlinearity(state_vector)
|
|
||||||
|
|
||||||
def _maxout_nonlinearity(self, state_vector):
|
|
||||||
state_vector, mask = self.numpy_ops.maxout(state_vector)
|
|
||||||
# We're outputting to CPU, but we need this variable on GPU for the
|
|
||||||
# backward pass.
|
|
||||||
mask = self.ops.asarray(mask)
|
|
||||||
|
|
||||||
def backprop_maxout(d_best):
|
|
||||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
|
||||||
|
|
||||||
return state_vector, backprop_maxout
|
return statevec, backward
|
||||||
|
|
||||||
def _relu_nonlinearity(self, state_vector):
|
|
||||||
state_vector = state_vector.reshape((state_vector.shape[0], -1))
|
|
||||||
mask = state_vector >= 0.
|
|
||||||
state_vector *= mask
|
|
||||||
# We're outputting to CPU, but we need this variable on GPU for the
|
|
||||||
# backward pass.
|
|
||||||
mask = self.ops.asarray(mask)
|
|
||||||
|
|
||||||
def backprop_relu(d_best):
|
|
||||||
d_best *= mask
|
|
||||||
return d_best.reshape((d_best.shape + (1,)))
|
|
||||||
|
|
||||||
return state_vector, backprop_relu
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user