Merge branch 'develop' of https://github.com/explosion/spaCy into develop

ines committed 2017-10-29 03:58:21 +01:00
commit 256c7dac5a
9 changed files with 173 additions and 161 deletions

View File

@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.9.0,<6.10.0
+thinc>=6.10.0,<6.11.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six

View File

@@ -61,7 +61,7 @@ LINK_OPTIONS = {
 # I don't understand this very well yet. See Issue #267
 # Fingers crossed!
-USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None
+USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None
 if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1':
     if sys.platform == 'darwin':
         COMPILE_OPTIONS['other'].append('-fopenmp')
@@ -190,7 +190,7 @@ def setup_package():
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.9.0,<6.10.0',
+            'thinc>=6.10.0,<6.11.0',
             'plac<1.0.0,>=0.9.6',
             'six',
             'pathlib',
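Note on the setup.py change above: OpenMP is now opt-in on every platform, since the default is '0' rather than '1' outside macOS. A minimal sketch of the resulting decision logic, using only the two variables shown in the diff (illustrative, not the full build script):

    import os
    import sys

    # Same default as the new setup.py: never auto-enable on macOS,
    # and elsewhere require an explicit USE_OPENMP=1 in the environment.
    USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None
    if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1':
        print('would add the -fopenmp compile and link options')
    else:
        print('building without OpenMP')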

View File

@@ -13,12 +13,14 @@ from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
+from thinc.neural.util import get_array_module, copy_array
+from thinc.neural._lsuv import svd_orthonormal
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp
+from thinc.neural._lsuv import svd_orthonormal
 
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
 from . import util
@@ -75,78 +77,25 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None
 
 
-def _init_for_precomputed(W, ops):
-    if (W**2).sum() != 0.:
-        return
-    reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
-    ops.xavier_uniform_init(reshaped)
-    W[:] = reshaped.reshape(W.shape)
-
-
-@describe.on_data(_set_dimensions_if_needed)
+@describe.on_data(_set_dimensions_if_needed,
+    lambda model, X, y: model.init_weights(model))
 @describe.attributes(
     nI=Dimension("Input size"),
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
+    nP=Dimension("Maxout pieces"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nF, obj.nO, obj.nI),
-        lambda W, ops: _init_for_precomputed(W, ops)),
+        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
     b=Biases("Bias vector",
-        lambda obj: (obj.nO,)),
+        lambda obj: (obj.nO, obj.nP)),
+    pad=Synapses("Pad",
+        lambda obj: (1, obj.nF, obj.nO, obj.nP),
+        lambda M, ops: ops.normal_init(M, 1.)),
     d_W=Gradient("W"),
+    d_pad=Gradient("pad"),
     d_b=Gradient("b"))
 class PrecomputableAffine(Model):
-    def __init__(self, nO=None, nI=None, nF=None, **kwargs):
-        Model.__init__(self, **kwargs)
-        self.nO = nO
-        self.nI = nI
-        self.nF = nF
-
-    def begin_update(self, X, drop=0.):
-        # X: (b, i)
-        # Yf: (b, f, i)
-        # dY: (b, o)
-        # dYf: (b, f, o)
-        # Yf = numpy.einsum('bi,foi->bfo', X, self.W)
-        Yf = self.ops.xp.tensordot(
-            X, self.W, axes=[[1], [2]])
-        Yf += self.b
-
-        def backward(dY_ids, sgd=None):
-            tensordot = self.ops.xp.tensordot
-            dY, ids = dY_ids
-            Xf = X[ids]
-            # dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
-            dXf = tensordot(dY, self.W, axes=[[1], [1]])
-            # dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
-            dW = tensordot(dY, Xf, axes=[[0], [0]])
-            # ofi -> foi
-            self.d_W += dW.transpose((1, 0, 2))
-            self.d_b += dY.sum(axis=0)
-            if sgd is not None:
-                sgd(self._mem.weights, self._mem.gradient, key=self.id)
-            return dXf
-        return Yf, backward
-
-
-@describe.on_data(_set_dimensions_if_needed)
-@describe.attributes(
-    nI=Dimension("Input size"),
-    nF=Dimension("Number of features"),
-    nP=Dimension("Number of pieces"),
-    nO=Dimension("Output size"),
-    W=Synapses("Weights matrix",
-        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
-        lambda W, ops: ops.xavier_uniform_init(W)),
-    b=Biases("Bias vector",
-        lambda obj: (obj.nO, obj.nP)),
-    d_W=Gradient("W"),
-    d_b=Gradient("b"))
-class PrecomputableMaxouts(Model):
-    def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
+    def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
         Model.__init__(self, **kwargs)
         self.nO = nO
         self.nP = nP
@@ -154,31 +103,96 @@ class PrecomputableMaxouts(Model):
         self.nF = nF
 
     def begin_update(self, X, drop=0.):
-        # X: (b, i)
-        # Yfp: (b, f, o, p)
-        # Xf: (f, b, i)
-        # dYp: (b, o, p)
-        # W: (f, o, p, i)
-        # b: (o, p)
-        # bi,opfi->bfop
-        # bop,fopi->bfi
-        # bop,fbi->opfi : fopi
-        tensordot = self.ops.xp.tensordot
-        Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b
+        Yf = self.ops.xp.dot(X,
+            self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
+        Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
+        Yf = self._add_padding(Yf)
 
-        def backward(dYp_ids, sgd=None):
-            dYp, ids = dYp_ids
+        def backward(dY_ids, sgd=None):
+            dY, ids = dY_ids
+            dY, ids = self._backprop_padding(dY, ids)
             Xf = X[ids]
-            dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
-            dW = tensordot(dYp, Xf, axes=[[0], [0]])
-            self.d_W += dW.transpose((2, 0, 1, 3))
-            self.d_b += dYp.sum(axis=0)
+            Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
+
+            self.d_b += dY.sum(axis=0)
+            dY = dY.reshape((dY.shape[0], self.nO*self.nP))
+
+            Wopfi = self.W.transpose((1, 2, 0, 3))
+            Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
+            Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
+            dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
+
+            # Reuse the buffer
+            dWopfi = Wopfi; dWopfi.fill(0.)
+            self.ops.xp.dot(dY.T, Xf, out=dWopfi)
+            dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
+            # (o, p, f, i) --> (f, o, p, i)
+            self.d_W += dWopfi.transpose((2, 0, 1, 3))
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
-            return dXf
-        return Yfp, backward
+            return dXf.reshape((dXf.shape[0], self.nF, self.nI))
+        return Yf, backward
+
+    def _add_padding(self, Yf):
+        Yf_padded = self.ops.xp.vstack((self.pad, Yf))
+        return Yf_padded[1:]
+
+    def _backprop_padding(self, dY, ids):
+        for i in range(ids.shape[0]):
+            for j in range(ids.shape[1]):
+                if ids[i, j] < 0:
+                    self.d_pad[0, j] += dY[i, j]
+        return dY, ids
+
+    @staticmethod
+    def init_weights(model):
+        '''This is like the 'layer sequential unit variance', but instead
+        of taking the actual inputs, we randomly generate whitened data.
+        Why's this all so complicated? We have a huge number of inputs,
+        and the maxout unit makes guessing the dynamics tricky. Instead
+        we set the maxout weights to values that empirically result in
+        whitened outputs given whitened inputs.
+        '''
+        if (model.W**2).sum() != 0.:
+            return
+        model.ops.normal_init(model.W, model.nF * model.nI, inplace=True)
+
+        ids = numpy.zeros((5000, model.nF), dtype='i')
+        ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i')
+        tokvecs = numpy.zeros((5000, model.nI), dtype='f')
+        tokvecs += numpy.random.normal(loc=0., scale=1.,
+                                       size=tokvecs.size).reshape(tokvecs.shape)
+
+        def predict(ids, tokvecs):
+            # nS ids. nW tokvecs
+            hiddens = model(tokvecs)  # (nW, f, o, p)
+            # need nS vectors
+            vectors = model.ops.allocate((ids.shape[0], model.nO, model.nP))
+            for i, feats in enumerate(ids):
+                for j, id_ in enumerate(feats):
+                    vectors[i] += hiddens[id_, j]
+            vectors += model.b
+            if model.nP >= 2:
+                return model.ops.maxout(vectors)[0]
+            else:
+                return vectors * (vectors >= 0)
+
+        tol_var = 0.01
+        tol_mean = 0.01
+        t_max = 10
+        t_i = 0
+        for t_i in range(t_max):
+            acts1 = predict(ids, tokvecs)
+            var = numpy.var(acts1)
+            mean = numpy.mean(acts1)
+            if abs(var - 1.0) >= tol_var:
+                model.W /= numpy.sqrt(var)
+            elif abs(mean) >= tol_mean:
+                model.b -= mean
+            else:
+                break
 
 
 def link_vectors_to_models(vocab):
@@ -228,9 +242,10 @@ def Tok2Vec(width, embed_size, **kwargs):
     tok2vec = (
         FeatureExtracter(cols)
         >> with_flatten(
-            embed >> (convolution ** 4), pad=4)
+            embed
+            >> convolution ** 4, pad=4
+        )
     )
     # Work around thinc API limitations :(. TODO: Revise in Thinc 7
     tok2vec.nO = width
     tok2vec.embed = embed
@@ -265,34 +280,6 @@ def asarray(ops, dtype):
     return layerize(forward)
 
 
-def rebatch(size, layer):
-    ops = layer.ops
-    def forward(X, drop=0.):
-        if X.shape[0] < size:
-            return layer.begin_update(X)
-        parts = _divide_array(X, size)
-        results, bp_results = zip(*[layer.begin_update(p, drop=drop)
-                                    for p in parts])
-        y = ops.flatten(results)
-        def backward(dy, sgd=None):
-            d_parts = [bp(y, sgd=sgd) for bp, y in
-                       zip(bp_results, _divide_array(dy, size))]
-            try:
-                dX = ops.flatten(d_parts)
-            except TypeError:
-                dX = None
-            except ValueError:
-                dX = None
-            return dX
-        return y, backward
-    model = layerize(forward)
-    model._layers.append(layer)
-    return model
-
-
 def _divide_array(X, size):
     parts = []
     index = 0
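For readers following the _ml.py rewrite above: the new PrecomputableAffine folds the old PrecomputableMaxouts behaviour into one class. The affine transform is applied once per token, and each parser state then only gathers and sums nF cached rows, adds the bias and takes a maxout over the nP pieces; the padding row covers missing (negative) feature ids. A hedged NumPy sketch of that idea, with purely illustrative shapes and random data (not the spaCy implementation itself):

    import numpy

    nW, nF, nO, nP, nI = 10, 8, 6, 2, 12   # tokens, features, outputs, pieces, input size
    X = numpy.random.randn(nW, nI).astype('f')           # token vectors
    W = numpy.random.randn(nF, nO, nP, nI).astype('f')   # weights, shaped as in the diff
    b = numpy.random.randn(nO, nP).astype('f')

    # Precompute every token's contribution to each feature slot: (nW, nF, nO, nP).
    cached = numpy.dot(X, W.reshape((nF * nO * nP, nI)).T)
    cached = cached.reshape((nW, nF, nO, nP))

    # For one parser state: gather the cached rows for its nF context tokens,
    # sum them, add the bias, then take the maximum over the nP pieces.
    ids = numpy.random.randint(0, nW, size=nF)            # context token indices
    state_vector = cached[ids, numpy.arange(nF)].sum(axis=0) + b   # (nO, nP)
    hidden = state_vector.max(axis=-1)                    # maxout -> (nO,)
    print(hidden.shape)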

View File

@@ -1,8 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import bz2
-import gzip
+try:
+    import bz2
+    import gzip
+except ImportError:
+    pass
 import math
 from ast import literal_eval
 from pathlib import Path

View File

@ -30,6 +30,10 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
try:
from thinc.neural.optimizers import Optimizer
except ImportError:
from thinc.neural.optimizers import Adam as Optimizer
pickle = pickle pickle = pickle
copy_reg = copy_reg copy_reg = copy_reg

View File

@@ -110,7 +110,7 @@ cdef cppclass StateC:
             ids[3] = this.S(1)
             ids[4] = this.H(this.S(0))
             ids[5] = this.L(this.B(0), 1)
-            ids[6] = this.L(this.S(0), 2)
+            ids[6] = this.L(this.S(0), 1)
            ids[7] = this.R(this.S(0), 1)
         elif n == 13:
             ids[0] = this.B(0)

View File

@ -16,5 +16,6 @@ cdef class Parser:
cdef public object _multitasks cdef public object _multitasks
cdef void _parseC(self, StateC* state, cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* hW, const float* hb, const float* feat_weights, const float* bias,
const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil

View File

@@ -1,5 +1,4 @@
 # cython: infer_types=True
-# cython: profile=True
 # cython: cdivision=True
 # cython: boundscheck=False
 # coding: utf-8
@@ -27,8 +26,9 @@ from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
+from thinc.linalg cimport Vec, VecVec
 
-from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
+from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps, copy_array
 from ..tokens.doc cimport Doc
@@ -74,6 +74,7 @@ cdef class precompute_hiddens:
     cdef public object ops
     cdef np.ndarray _features
     cdef np.ndarray _cached
+    cdef np.ndarray bias
     cdef object _cuda_stream
     cdef object _bp_hiddens
@@ -89,9 +90,10 @@
         else:
             cached = gpu_cached
         self.nF = cached.shape[1]
-        self.nO = cached.shape[2]
         self.nP = getattr(lower_model, 'nP', 1)
+        self.nO = cached.shape[2]
         self.ops = lower_model.ops
+        self.bias = lower_model.b
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -108,7 +110,7 @@
     def begin_update(self, token_ids, drop=0.):
         cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO*self.nP), dtype='f')
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
         # This is tricky, but (assuming GPU available);
         # - Input to forward on CPU
         # - Output from forward on CPU
@@ -119,15 +121,15 @@
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(<float*>state_vector.data,
-            feat_weights, &ids[0, 0],
+            feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
         def backward(d_state_vector, sgd=None):
-            if bp_nonlinearity is not None:
-                d_state_vector = bp_nonlinearity(d_state_vector, sgd)
+            d_state_vector = bp_nonlinearity(d_state_vector, sgd)
             # This will usually be on GPU
-            if isinstance(d_state_vector, numpy.ndarray):
+            if not isinstance(d_state_vector, self.ops.xp.ndarray):
                 d_state_vector = self.ops.xp.array(d_state_vector)
             d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
             return d_tokens
@@ -135,27 +137,34 @@
     def _nonlinearity(self, state_vector):
         if self.nP == 1:
-            return state_vector, None
-        state_vector = state_vector.reshape(
-            (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
-        best, which = self.ops.maxout(state_vector)
+            state_vector = state_vector.reshape(state_vector.shape[:-1])
+            mask = state_vector >= 0.
+            state_vector *= mask
+        else:
+            state_vector, mask = self.ops.maxout(state_vector)
 
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
-
-        return best, backprop
+        def backprop_nonlinearity(d_best, sgd=None):
+            if self.nP == 1:
+                d_best *= mask
+                d_best = d_best.reshape((d_best.shape + (1,)))
+                return d_best
+            else:
+                return self.ops.backprop_maxout(d_best, mask, self.nP)
+        return state_vector, backprop_nonlinearity
 
 
 cdef void sum_state_features(float* output,
         const float* cached, const int* token_ids, int B, int F, int O) nogil:
     cdef int idx, b, f, i
     cdef const float* feature
+    padding = cached - (F * O)
     for b in range(B):
         for f in range(F):
             if token_ids[f] < 0:
-                continue
-            idx = token_ids[f] * F * O + f*O
-            feature = &cached[idx]
+                feature = &padding[f*O]
+            else:
+                idx = token_ids[f] * F * O + f*O
+                feature = &cached[idx]
             for i in range(O):
                 output[i] += feature[i]
             output += O
@@ -220,13 +229,9 @@
             raise ValueError("Currently parser depth is hard-coded to 1.")
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
-        if parser_maxout_pieces != 2:
-            raise ValueError("Currently parser_maxout_pieces is hard-coded "
-                             "to 2")
         token_vector_width = util.env_opt('token_vector_width',
                                           cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width',
-                                    cfg.get('hidden_width', 200))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -237,9 +242,10 @@
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_dims=cfg.get('pretrained_dims', 0))
         tok2vec = chain(tok2vec, flatten)
-        lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
-                                     nF=cls.nr_feature, nP=parser_maxout_pieces,
-                                     nI=token_vector_width)
+        lower = PrecomputableAffine(hidden_width,
+                                    nF=cls.nr_feature, nI=token_vector_width,
+                                    nP=parser_maxout_pieces)
+        lower.nP = parser_maxout_pieces
 
         with Model.use_device('cpu'):
             upper = chain(
@@ -391,19 +397,20 @@
         hW = <float*>hidden_weights.data
         hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
         cdef int nr_hidden = hidden_weights.shape[0]
         cdef int nr_task = states.size()
         with nogil:
-            for i in cython.parallel.prange(nr_task, num_threads=2,
-                                            schedule='guided'):
+            for i in range(nr_task):
                 self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         PyErr_CheckSignals()
         return state_objs
 
     cdef void _parseC(self, StateC* state,
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
         token_ids = <int*>calloc(nr_feat, sizeof(int))
         is_valid = <int*>calloc(nr_class, sizeof(int))
@ -413,17 +420,24 @@ cdef class Parser:
with gil: with gil:
PyErr_SetFromErrno(MemoryError) PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals() PyErr_CheckSignals()
cdef float feature
while not state.is_final(): while not state.is_final():
state.set_context_tokens(token_ids, nr_feat) state.set_context_tokens(token_ids, nr_feat)
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
memset(scores, 0, nr_class * sizeof(float)) memset(scores, 0, nr_class * sizeof(float))
sum_state_features(vectors, sum_state_features(vectors,
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece) feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
for i in range(nr_hidden * nr_piece):
vectors[i] += bias[i]
V = vectors V = vectors
W = hW W = hW
for i in range(nr_hidden): for i in range(nr_hidden):
feature = V[0] if V[0] >= V[1] else V[1] if nr_piece == 1:
feature = V[0] if V[0] >= 0. else 0.
elif nr_piece == 2:
feature = V[0] if V[0] >= V[1] else V[1]
else:
feature = Vec.max(V, nr_piece)
for j in range(nr_class): for j in range(nr_class):
scores[j] += feature * W[j] scores[j] += feature * W[j]
W += nr_class W += nr_class
@ -644,9 +658,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs) xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops: for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd) d_state_features = bp_vector(d_vector, sgd=sgd)
mask = ids >= 0 ids = ids.flatten()
d_state_features *= mask.reshape(ids.shape + (1,)) d_state_features = d_state_features.reshape(
self.model[0].ops.scatter_add(d_tokvecs, ids * mask, (ids.size, d_state_features.shape[2]))
self.model[0].ops.scatter_add(d_tokvecs, ids,
d_state_features) d_state_features)
bp_tokvecs(d_tokvecs, sgd=sgd) bp_tokvecs(d_tokvecs, sgd=sgd)
@@ -665,7 +680,7 @@
             lower, stream, drop=0.0)
         return (tokvecs, bp_tokvecs), state2vec, upper
 
-    nr_feature = 8
+    nr_feature = 13
 
     def get_token_ids(self, states):
         cdef StateClass state
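For readers following the _parseC changes above: the precomputed feature sums now get the lower layer's bias added explicitly, and the hidden activation handles one piece (a ReLU), two pieces, or the general Vec.max case before the class scores are computed. A hedged NumPy sketch of the equivalent per-state computation, with illustrative shapes and random data (not the Cython implementation):

    import numpy

    nr_hidden, nr_piece, nr_class = 64, 2, 30
    vectors = numpy.random.randn(nr_hidden, nr_piece).astype('f')  # summed feature rows
    bias = numpy.random.randn(nr_hidden, nr_piece).astype('f')     # lower-layer bias
    hW = numpy.random.randn(nr_hidden, nr_class).astype('f')       # hidden-to-class weights
    hb = numpy.random.randn(nr_class).astype('f')                  # hidden-layer bias

    vectors = vectors + bias
    if nr_piece == 1:
        hidden = numpy.maximum(vectors[:, 0], 0.)   # single piece: ReLU
    else:
        hidden = vectors.max(axis=1)                # two or more pieces: maxout
    scores = hidden.dot(hW) + hb                    # same role as scores[j] += feature * W[j]
    print(int(scores.argmax()))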

View File

@@ -40,6 +40,8 @@ def parser(vocab):
 def test_init_parser(parser):
     pass
 
+# TODO: This is flakey, because it depends on what the parser first learns.
+@pytest.mark.xfail
 def test_add_label(parser):
     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
     doc = parser(doc)