mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
Try to move parser to simpler PrecomputedAffine class. Currently broken -- maybe the previous change
This commit is contained in:
parent
3ff8c35a79
commit
b272890a8c
96
spacy/_ml.py
96
spacy/_ml.py
|
@ -17,14 +17,19 @@ from .tokens.doc import Doc
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
|
||||||
|
def _init_for_precomputed(W, ops):
|
||||||
|
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
|
||||||
|
ops.xavier_uniform_init(reshaped)
|
||||||
|
W[:] = reshaped.reshape(W.shape)
|
||||||
|
|
||||||
@describe.on_data(_set_dimensions_if_needed)
|
@describe.on_data(_set_dimensions_if_needed)
|
||||||
@describe.attributes(
|
@describe.attributes(
|
||||||
nI=Dimension("Input size"),
|
nI=Dimension("Input size"),
|
||||||
nF=Dimension("Number of features"),
|
nF=Dimension("Number of features"),
|
||||||
nO=Dimension("Output size"),
|
nO=Dimension("Output size"),
|
||||||
W=Synapses("Weights matrix",
|
W=Synapses("Weights matrix",
|
||||||
lambda obj: (obj.nO, obj.nF, obj.nI),
|
lambda obj: (obj.nF, obj.nO, obj.nI),
|
||||||
lambda W, ops: ops.xavier_uniform_init(W)),
|
lambda W, ops: _init_for_precomputed(W, ops)),
|
||||||
b=Biases("Bias vector",
|
b=Biases("Bias vector",
|
||||||
lambda obj: (obj.nO,)),
|
lambda obj: (obj.nO,)),
|
||||||
d_W=Gradient("W"),
|
d_W=Gradient("W"),
|
||||||
|
@ -39,25 +44,25 @@ class PrecomputableAffine(Model):
|
||||||
|
|
||||||
def begin_update(self, X, drop=0.):
|
def begin_update(self, X, drop=0.):
|
||||||
# X: (b, i)
|
# X: (b, i)
|
||||||
# Xf: (b, f, i)
|
# Yf: (b, f, i)
|
||||||
# dY: (b, o)
|
# dY: (b, o)
|
||||||
# dYf: (b, f, o)
|
# dYf: (b, f, o)
|
||||||
#Yf = numpy.einsum('bi,ofi->bfo', X, self.W)
|
#Yf = numpy.einsum('bi,foi->bfo', X, self.W)
|
||||||
Yf = self.ops.xp.tensordot(
|
Yf = self.ops.xp.tensordot(
|
||||||
X, self.W, axes=[[1], [2]]).transpose((0, 2, 1))
|
X, self.W, axes=[[1], [2]])
|
||||||
Yf += self.b
|
Yf += self.b
|
||||||
def backward(dY_ids, sgd=None):
|
def backward(dY_ids, sgd=None):
|
||||||
|
tensordot = self.ops.xp.tensordot
|
||||||
dY, ids = dY_ids
|
dY, ids = dY_ids
|
||||||
Xf = X[ids]
|
Xf = X[ids]
|
||||||
|
|
||||||
|
#dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
|
||||||
|
dXf = tensordot(dY, self.W, axes=[[1], [1]])
|
||||||
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
|
#dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
|
||||||
dW = self.ops.xp.tensordot(dY, Xf, axes=[[0], [0]])
|
dW = tensordot(dY, Xf, axes=[[0], [0]])
|
||||||
db = dY.sum(axis=0)
|
# ofi -> foi
|
||||||
#dXf = numpy.einsum('bo,ofi->bfi', dY, self.W)
|
self.d_W += dW.transpose((1, 0, 2))
|
||||||
dXf = self.ops.xp.tensordot(dY, self.W, axes=[[1], [0]])
|
self.d_b += dY.sum(axis=0)
|
||||||
|
|
||||||
self.d_W += dW
|
|
||||||
self.d_b += db
|
|
||||||
|
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||||
|
@ -144,14 +149,70 @@ def Tok2Vec(width, embed_size, preprocess=None):
|
||||||
return tok2vec
|
return tok2vec
|
||||||
|
|
||||||
|
|
||||||
def get_col(idx):
|
def foreach(layer):
|
||||||
|
def forward(Xs, drop=0.):
|
||||||
|
results = []
|
||||||
|
backprops = []
|
||||||
|
for X in Xs:
|
||||||
|
result, bp = layer.begin_update(X, drop=drop)
|
||||||
|
results.append(result)
|
||||||
|
backprops.append(bp)
|
||||||
|
def backward(d_results, sgd=None):
|
||||||
|
dXs = []
|
||||||
|
for d_result, backprop in zip(d_results, backprops):
|
||||||
|
dXs.append(backprop(d_result, sgd))
|
||||||
|
return dXs
|
||||||
|
return results, backward
|
||||||
|
model = layerize(forward)
|
||||||
|
model._layers.append(layer)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def rebatch(size, layer):
|
||||||
|
ops = layer.ops
|
||||||
def forward(X, drop=0.):
|
def forward(X, drop=0.):
|
||||||
|
if X.shape[0] < size:
|
||||||
|
return layer.begin_update(X)
|
||||||
|
parts = _divide_array(X, size)
|
||||||
|
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
|
||||||
|
for p in parts])
|
||||||
|
y = ops.flatten(results)
|
||||||
|
def backward(dy, sgd=None):
|
||||||
|
d_parts = [bp(y, sgd=sgd) for bp, y in
|
||||||
|
zip(bp_results, _divide_array(dy, size))]
|
||||||
|
try:
|
||||||
|
dX = ops.flatten(d_parts)
|
||||||
|
except TypeError:
|
||||||
|
dX = None
|
||||||
|
except ValueError:
|
||||||
|
dX = None
|
||||||
|
return dX
|
||||||
|
return y, backward
|
||||||
|
model = layerize(forward)
|
||||||
|
model._layers.append(layer)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def _divide_array(X, size):
|
||||||
|
parts = []
|
||||||
|
index = 0
|
||||||
|
while index < len(X):
|
||||||
|
parts.append(X[index : index + size])
|
||||||
|
index += size
|
||||||
|
return parts
|
||||||
|
|
||||||
|
|
||||||
|
def get_col(idx):
|
||||||
|
assert idx >= 0, idx
|
||||||
|
def forward(X, drop=0.):
|
||||||
|
assert idx >= 0, idx
|
||||||
if isinstance(X, numpy.ndarray):
|
if isinstance(X, numpy.ndarray):
|
||||||
ops = NumpyOps()
|
ops = NumpyOps()
|
||||||
else:
|
else:
|
||||||
ops = CupyOps()
|
ops = CupyOps()
|
||||||
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
||||||
def backward(y, sgd=None):
|
def backward(y, sgd=None):
|
||||||
|
assert idx >= 0, idx
|
||||||
dX = ops.allocate(X.shape)
|
dX = ops.allocate(X.shape)
|
||||||
dX[:, idx] += y
|
dX[:, idx] += y
|
||||||
return dX
|
return dX
|
||||||
|
@ -171,12 +232,9 @@ def doc2feats(cols=None):
|
||||||
def forward(docs, drop=0.):
|
def forward(docs, drop=0.):
|
||||||
feats = []
|
feats = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
if 'cached_feats' not in doc.user_data:
|
feats.append(
|
||||||
doc.user_data['cached_feats'] = model.ops.asarray(
|
model.ops.asarray(doc.to_array(cols),
|
||||||
doc.to_array(cols),
|
dtype='uint64'))
|
||||||
dtype='uint64')
|
|
||||||
feats.append(doc.user_data['cached_feats'])
|
|
||||||
assert feats[-1].dtype == 'uint64'
|
|
||||||
return feats, None
|
return feats, None
|
||||||
model = layerize(forward)
|
model = layerize(forward)
|
||||||
model.cols = cols
|
model.cols = cols
|
||||||
|
|
|
@ -84,7 +84,7 @@ cdef class precompute_hiddens:
|
||||||
we can do all our hard maths up front, packed into large multiplications,
|
we can do all our hard maths up front, packed into large multiplications,
|
||||||
and do the hard-to-program parsing on the CPU.
|
and do the hard-to-program parsing on the CPU.
|
||||||
'''
|
'''
|
||||||
cdef int nF, nO, nP
|
cdef int nF, nO
|
||||||
cdef bint _is_synchronized
|
cdef bint _is_synchronized
|
||||||
cdef public object ops
|
cdef public object ops
|
||||||
cdef np.ndarray _features
|
cdef np.ndarray _features
|
||||||
|
@ -104,9 +104,8 @@ cdef class precompute_hiddens:
|
||||||
cached = gpu_cached
|
cached = gpu_cached
|
||||||
self.nF = cached.shape[1]
|
self.nF = cached.shape[1]
|
||||||
self.nO = cached.shape[2]
|
self.nO = cached.shape[2]
|
||||||
self.nP = cached.shape[3]
|
|
||||||
self.ops = lower_model.ops
|
self.ops = lower_model.ops
|
||||||
self._features = numpy.zeros((batch_size, self.nO, self.nP), dtype='f')
|
self._features = numpy.zeros((batch_size, self.nO), dtype='f')
|
||||||
self._is_synchronized = False
|
self._is_synchronized = False
|
||||||
self._cuda_stream = cuda_stream
|
self._cuda_stream = cuda_stream
|
||||||
self._cached = cached
|
self._cached = cached
|
||||||
|
@ -133,24 +132,15 @@ cdef class precompute_hiddens:
|
||||||
cdef int[:, ::1] ids = token_ids
|
cdef int[:, ::1] ids = token_ids
|
||||||
self._sum_features(<float*>state_vector.data,
|
self._sum_features(<float*>state_vector.data,
|
||||||
<float*>hiddens.data, &ids[0,0],
|
<float*>hiddens.data, &ids[0,0],
|
||||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
token_ids.shape[0], self.nF, self.nO)
|
||||||
|
|
||||||
output, bp_output = self._apply_nonlinearity(state_vector)
|
def backward(d_state_vector, sgd=None):
|
||||||
|
|
||||||
def backward(d_output, sgd=None):
|
|
||||||
# This will usually be on GPU
|
# This will usually be on GPU
|
||||||
if isinstance(d_output, numpy.ndarray):
|
if isinstance(d_state_vector, numpy.ndarray):
|
||||||
d_output = self.ops.xp.array(d_output)
|
d_state_vector = self.ops.xp.array(d_state_vector)
|
||||||
d_state_vector = bp_output(d_output, sgd)
|
|
||||||
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
|
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
|
||||||
return d_tokens
|
return d_tokens
|
||||||
return output, backward
|
return state_vector, backward
|
||||||
|
|
||||||
def _apply_nonlinearity(self, X):
|
|
||||||
if self.nP < 2:
|
|
||||||
return X.reshape(X.shape[:2]), lambda dX, sgd=None: dX.reshape(X.shape)
|
|
||||||
best, which = self.ops.maxout(X)
|
|
||||||
return best, lambda dX, sgd=None: self.ops.backprop_maxout(dX, which, self.nP)
|
|
||||||
|
|
||||||
cdef void _sum_features(self, float* output,
|
cdef void _sum_features(self, float* output,
|
||||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||||
|
@ -223,11 +213,9 @@ cdef class Parser:
|
||||||
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
|
def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg):
|
||||||
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
token_vector_width = util.env_opt('token_vector_width', token_vector_width)
|
||||||
hidden_width = util.env_opt('hidden_width', hidden_width)
|
hidden_width = util.env_opt('hidden_width', hidden_width)
|
||||||
maxout_pieces = util.env_opt('parser_maxout_pieces', 1)
|
lower = PrecomputableAffine(hidden_width,
|
||||||
lower = PrecomputableMaxouts(hidden_width,
|
|
||||||
nF=cls.nr_feature,
|
nF=cls.nr_feature,
|
||||||
nI=token_vector_width,
|
nI=token_vector_width)
|
||||||
pieces=maxout_pieces)
|
|
||||||
|
|
||||||
with Model.use_device('cpu'):
|
with Model.use_device('cpu'):
|
||||||
upper = chain(
|
upper = chain(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user