Mirror of https://github.com/explosion/spaCy.git (synced 2025-07-11 16:52:21 +03:00)

Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Commit 256c7dac5a
@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.9.0,<6.10.0
+thinc>=6.10.0,<6.11.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
setup.py (4 changed lines)
@@ -61,7 +61,7 @@ LINK_OPTIONS = {
 
 # I don't understand this very well yet. See Issue #267
 # Fingers crossed!
-USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None
+USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None
 if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1':
     if sys.platform == 'darwin':
         COMPILE_OPTIONS['other'].append('-fopenmp')
@@ -190,7 +190,7 @@ def setup_package():
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.9.0,<6.10.0',
+            'thinc>=6.10.0,<6.11.0',
             'plac<1.0.0,>=0.9.6',
             'six',
             'pathlib',
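Note on the first setup.py hunk above: OpenMP is now opt-in on every platform (the default stays None on macOS, and changes from '1' to '0' elsewhere). A minimal sketch of the resulting gate, assuming nothing beyond what the hunk shows; builders who still want OpenMP set USE_OPENMP=1 in the environment before building:

import os
import sys

# Default is now '0' on non-macOS platforms, None on macOS.
USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None
# Only an explicit USE_OPENMP=1 in the build environment adds the -fopenmp flags.
use_openmp = os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1'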
spacy/_ml.py (215 changed lines)
@@ -13,12 +13,14 @@ from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module
+from thinc.neural.util import get_array_module, copy_array
+from thinc.neural._lsuv import svd_orthonormal
 
 from thinc import describe
 from thinc.describe import Dimension, Synapses, Biases, Gradient
 from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp
+from thinc.neural._lsuv import svd_orthonormal
 
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
 from . import util
@@ -75,78 +77,25 @@ def _preprocess_doc(docs, drop=0.):
     return (keys, vals, lengths), None
 
 
-def _init_for_precomputed(W, ops):
-    if (W**2).sum() != 0.:
-        return
-    reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
-    ops.xavier_uniform_init(reshaped)
-    W[:] = reshaped.reshape(W.shape)
-
-
-@describe.on_data(_set_dimensions_if_needed)
+@describe.on_data(_set_dimensions_if_needed,
+    lambda model, X, y: model.init_weights(model))
 @describe.attributes(
     nI=Dimension("Input size"),
     nF=Dimension("Number of features"),
     nO=Dimension("Output size"),
+    nP=Dimension("Maxout pieces"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nF, obj.nO, obj.nI),
-        lambda W, ops: _init_for_precomputed(W, ops)),
-    b=Biases("Bias vector",
-        lambda obj: (obj.nO,)),
-    d_W=Gradient("W"),
-    d_b=Gradient("b"))
-class PrecomputableAffine(Model):
-    def __init__(self, nO=None, nI=None, nF=None, **kwargs):
-        Model.__init__(self, **kwargs)
-        self.nO = nO
-        self.nI = nI
-        self.nF = nF
-
-    def begin_update(self, X, drop=0.):
-        # X: (b, i)
-        # Yf: (b, f, i)
-        # dY: (b, o)
-        # dYf: (b, f, o)
-        # Yf = numpy.einsum('bi,foi->bfo', X, self.W)
-        Yf = self.ops.xp.tensordot(
-            X, self.W, axes=[[1], [2]])
-        Yf += self.b
-
-        def backward(dY_ids, sgd=None):
-            tensordot = self.ops.xp.tensordot
-            dY, ids = dY_ids
-            Xf = X[ids]
-
-            # dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
-            dXf = tensordot(dY, self.W, axes=[[1], [1]])
-            # dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
-            dW = tensordot(dY, Xf, axes=[[0], [0]])
-            # ofi -> foi
-            self.d_W += dW.transpose((1, 0, 2))
-            self.d_b += dY.sum(axis=0)
-
-            if sgd is not None:
-                sgd(self._mem.weights, self._mem.gradient, key=self.id)
-            return dXf
-
-        return Yf, backward
-
-
-@describe.on_data(_set_dimensions_if_needed)
-@describe.attributes(
-    nI=Dimension("Input size"),
-    nF=Dimension("Number of features"),
-    nP=Dimension("Number of pieces"),
-    nO=Dimension("Output size"),
-    W=Synapses("Weights matrix",
-        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
-        lambda W, ops: ops.xavier_uniform_init(W)),
+        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
     b=Biases("Bias vector",
         lambda obj: (obj.nO, obj.nP)),
+    pad=Synapses("Pad",
+        lambda obj: (1, obj.nF, obj.nO, obj.nP),
+        lambda M, ops: ops.normal_init(M, 1.)),
     d_W=Gradient("W"),
+    d_pad=Gradient("pad"),
     d_b=Gradient("b"))
-class PrecomputableMaxouts(Model):
-    def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
+class PrecomputableAffine(Model):
+    def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
         Model.__init__(self, **kwargs)
         self.nO = nO
         self.nP = nP
@@ -154,31 +103,96 @@ class PrecomputableMaxouts(Model):
         self.nF = nF
 
     def begin_update(self, X, drop=0.):
-        # X: (b, i)
-        # Yfp: (b, f, o, p)
-        # Xf: (f, b, i)
-        # dYp: (b, o, p)
-        # W: (f, o, p, i)
-        # b: (o, p)
-        # bi,opfi->bfop
-        # bop,fopi->bfi
-        # bop,fbi->opfi : fopi
-        tensordot = self.ops.xp.tensordot
-        Yfp = tensordot(X, self.W, axes=[[1], [3]])
-        Yfp += self.b
+        Yf = self.ops.xp.dot(X,
+            self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
+        Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
+        Yf = self._add_padding(Yf)
 
-        def backward(dYp_ids, sgd=None):
-            dYp, ids = dYp_ids
+        def backward(dY_ids, sgd=None):
+            dY, ids = dY_ids
+            dY, ids = self._backprop_padding(dY, ids)
             Xf = X[ids]
-            dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
-            dW = tensordot(dYp, Xf, axes=[[0], [0]])
-            self.d_W += dW.transpose((2, 0, 1, 3))
-            self.d_b += dYp.sum(axis=0)
+            Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
+            self.d_b += dY.sum(axis=0)
+            dY = dY.reshape((dY.shape[0], self.nO*self.nP))
 
+            Wopfi = self.W.transpose((1, 2, 0, 3))
+            Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
+            Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
+            dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
+
+            # Reuse the buffer
+            dWopfi = Wopfi; dWopfi.fill(0.)
+            self.ops.xp.dot(dY.T, Xf, out=dWopfi)
+            dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
+            # (o, p, f, i) --> (f, o, p, i)
+            self.d_W += dWopfi.transpose((2, 0, 1, 3))
 
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
-            return dXf
+            return dXf.reshape((dXf.shape[0], self.nF, self.nI))
+        return Yf, backward
 
-        return Yfp, backward
+    def _add_padding(self, Yf):
+        Yf_padded = self.ops.xp.vstack((self.pad, Yf))
+        return Yf_padded[1:]
+
+    def _backprop_padding(self, dY, ids):
+        for i in range(ids.shape[0]):
+            for j in range(ids.shape[1]):
+                if ids[i, j] < 0:
+                    self.d_pad[0, j] += dY[i, j]
+        return dY, ids
+
+    @staticmethod
+    def init_weights(model):
+        '''This is like the 'layer sequential unit variance', but instead
+        of taking the actual inputs, we randomly generate whitened data.
+
+        Why's this all so complicated? We have a huge number of inputs,
+        and the maxout unit makes guessing the dynamics tricky. Instead
+        we set the maxout weights to values that empirically result in
+        whitened outputs given whitened inputs.
+        '''
+        if (model.W**2).sum() != 0.:
+            return
+        model.ops.normal_init(model.W, model.nF * model.nI, inplace=True)
+
+        ids = numpy.zeros((5000, model.nF), dtype='i')
+        ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i')
+        tokvecs = numpy.zeros((5000, model.nI), dtype='f')
+        tokvecs += numpy.random.normal(loc=0., scale=1.,
+                                       size=tokvecs.size).reshape(tokvecs.shape)
+
+        def predict(ids, tokvecs):
+            # nS ids. nW tokvecs
+            hiddens = model(tokvecs)  # (nW, f, o, p)
+            # need nS vectors
+            vectors = model.ops.allocate((ids.shape[0], model.nO, model.nP))
+            for i, feats in enumerate(ids):
+                for j, id_ in enumerate(feats):
+                    vectors[i] += hiddens[id_, j]
+            vectors += model.b
+            if model.nP >= 2:
+                return model.ops.maxout(vectors)[0]
+            else:
+                return vectors * (vectors >= 0)
+
+        tol_var = 0.01
+        tol_mean = 0.01
+        t_max = 10
+        t_i = 0
+        for t_i in range(t_max):
+            acts1 = predict(ids, tokvecs)
+            var = numpy.var(acts1)
+            mean = numpy.mean(acts1)
+            if abs(var - 1.0) >= tol_var:
+                model.W /= numpy.sqrt(var)
+            elif abs(mean) >= tol_mean:
+                model.b -= mean
+            else:
+                break
 
 
 def link_vectors_to_models(vocab):
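A note on the rewritten layer above, since the diff is dense: PrecomputableAffine now folds the maxout pieces into a single weight tensor, precomputes every token's per-feature activations with one matrix multiplication, and initialises itself with an LSUV-style loop (init_weights) that rescales W and shifts b until randomly generated whitened inputs produce roughly unit-variance, zero-mean outputs. A minimal NumPy sketch of the forward idea, using illustrative sizes and sketch-only variable names:

import numpy as np

nF, nO, nP, nI = 13, 64, 2, 128                  # illustrative sizes only
W = np.random.randn(nF, nO, nP, nI).astype('f') * 0.01
b = np.zeros((nO, nP), dtype='f')
X = np.random.randn(1000, nI).astype('f')        # one row per token

# One big matmul per batch of tokens: each token gets an (nF, nO, nP) block,
# i.e. its contribution to the hidden layer for every feature position.
Yf = (X @ W.reshape((nF * nO * nP, nI)).T).reshape((X.shape[0], nF, nO, nP))

# At parse time a state only gathers its nF feature rows from this cache,
# sums them, adds the bias and takes the max over the pieces (maxout).
token_ids = np.arange(nF)                        # pretend state: tokens 0..12
state_vector = Yf[token_ids, np.arange(nF)].sum(axis=0) + b   # (nO, nP)
hidden = state_vector.max(axis=-1)                            # (nO,)

The pad synapse acts as a learned "empty slot" row: _add_padding stacks it in front of the cache and returns a view starting one row later, which is presumably why the Cython consumer below can reach it at cached - (F * O).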
@@ -228,9 +242,10 @@ def Tok2Vec(width, embed_size, **kwargs):
         tok2vec = (
             FeatureExtracter(cols)
             >> with_flatten(
-                embed >> (convolution ** 4), pad=4)
+                embed
+                >> convolution ** 4, pad=4
+            )
         )
 
     # Work around thinc API limitations :(. TODO: Revise in Thinc 7
     tok2vec.nO = width
     tok2vec.embed = embed
@@ -265,34 +280,6 @@ def asarray(ops, dtype):
     return layerize(forward)
 
 
-def rebatch(size, layer):
-    ops = layer.ops
-
-    def forward(X, drop=0.):
-        if X.shape[0] < size:
-            return layer.begin_update(X)
-        parts = _divide_array(X, size)
-        results, bp_results = zip(*[layer.begin_update(p, drop=drop)
-                                    for p in parts])
-        y = ops.flatten(results)
-
-        def backward(dy, sgd=None):
-            d_parts = [bp(y, sgd=sgd) for bp, y in
-                       zip(bp_results, _divide_array(dy, size))]
-            try:
-                dX = ops.flatten(d_parts)
-            except TypeError:
-                dX = None
-            except ValueError:
-                dX = None
-            return dX
-
-        return y, backward
-    model = layerize(forward)
-    model._layers.append(layer)
-    return model
-
-
 def _divide_array(X, size):
     parts = []
     index = 0
@@ -1,8 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import bz2
-import gzip
+try:
+    import bz2
+    import gzip
+except ImportError:
+    pass
 import math
 from ast import literal_eval
 from pathlib import Path
@@ -30,6 +30,10 @@ try:
 except ImportError:
     cupy = None
 
+try:
+    from thinc.neural.optimizers import Optimizer
+except ImportError:
+    from thinc.neural.optimizers import Adam as Optimizer
+
 pickle = pickle
 copy_reg = copy_reg
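The two hunks above appear to come from spacy/compat.py, the module that already aliases pickle and copy_reg; the file header was lost in this view, so that attribution is an assumption. The guarded imports let spaCy load on Pythons built without bz2/gzip and against either thinc release. A minimal sketch of the consuming side, with a hypothetical helper and illustrative argument names:

from spacy.compat import Optimizer  # resolves to thinc's Adam on older releases

def make_optimizer(ops, learn_rate=0.001):
    # Hypothetical wrapper: callers never need to know which class backs the alias.
    return Optimizer(ops, learn_rate)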
@@ -110,7 +110,7 @@ cdef cppclass StateC:
             ids[3] = this.S(1)
             ids[4] = this.H(this.S(0))
             ids[5] = this.L(this.B(0), 1)
-            ids[6] = this.L(this.S(0), 2)
+            ids[6] = this.L(this.S(0), 1)
             ids[7] = this.R(this.S(0), 1)
         elif n == 13:
             ids[0] = this.B(0)
@@ -16,5 +16,6 @@ cdef class Parser:
     cdef public object _multitasks
 
     cdef void _parseC(self, StateC* state,
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil
@@ -1,5 +1,4 @@
 # cython: infer_types=True
-# cython: profile=True
 # cython: cdivision=True
 # cython: boundscheck=False
 # coding: utf-8
@@ -27,8 +26,9 @@ from thinc.v2v import Model, Maxout, Affine
 from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
+from thinc.linalg cimport Vec, VecVec
 
-from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
+from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models
 from ..compat import json_dumps, copy_array
 from ..tokens.doc cimport Doc
@@ -74,6 +74,7 @@ cdef class precompute_hiddens:
     cdef public object ops
     cdef np.ndarray _features
    cdef np.ndarray _cached
+    cdef np.ndarray bias
     cdef object _cuda_stream
     cdef object _bp_hiddens
 
@@ -89,9 +90,10 @@ cdef class precompute_hiddens:
         else:
             cached = gpu_cached
         self.nF = cached.shape[1]
-        self.nO = cached.shape[2]
         self.nP = getattr(lower_model, 'nP', 1)
+        self.nO = cached.shape[2]
         self.ops = lower_model.ops
+        self.bias = lower_model.b
         self._is_synchronized = False
         self._cuda_stream = cuda_stream
         self._cached = cached
@@ -108,7 +110,7 @@ cdef class precompute_hiddens:
 
     def begin_update(self, token_ids, drop=0.):
         cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO*self.nP), dtype='f')
+            (token_ids.shape[0], self.nO, self.nP), dtype='f')
         # This is tricky, but (assuming GPU available);
         # - Input to forward on CPU
         # - Output from forward on CPU
@@ -119,15 +121,15 @@ cdef class precompute_hiddens:
         feat_weights = self.get_feat_weights()
         cdef int[:, ::1] ids = token_ids
         sum_state_features(<float*>state_vector.data,
-            feat_weights, &ids[0, 0],
+            feat_weights, &ids[0,0],
             token_ids.shape[0], self.nF, self.nO*self.nP)
+        state_vector += self.bias
         state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
 
         def backward(d_state_vector, sgd=None):
-            if bp_nonlinearity is not None:
-                d_state_vector = bp_nonlinearity(d_state_vector, sgd)
+            d_state_vector = bp_nonlinearity(d_state_vector, sgd)
             # This will usually be on GPU
-            if isinstance(d_state_vector, numpy.ndarray):
+            if not isinstance(d_state_vector, self.ops.xp.ndarray):
                 d_state_vector = self.ops.xp.array(d_state_vector)
             d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
             return d_tokens
@@ -135,25 +137,32 @@ cdef class precompute_hiddens:
 
     def _nonlinearity(self, state_vector):
         if self.nP == 1:
-            return state_vector, None
-        state_vector = state_vector.reshape(
-            (state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
-        best, which = self.ops.maxout(state_vector)
+            state_vector = state_vector.reshape(state_vector.shape[:-1])
+            mask = state_vector >= 0.
+            state_vector *= mask
+        else:
+            state_vector, mask = self.ops.maxout(state_vector)
 
-        def backprop(d_best, sgd=None):
-            return self.ops.backprop_maxout(d_best, which, self.nP)
+        def backprop_nonlinearity(d_best, sgd=None):
+            if self.nP == 1:
+                d_best *= mask
+                d_best = d_best.reshape((d_best.shape + (1,)))
+                return d_best
+            else:
+                return self.ops.backprop_maxout(d_best, mask, self.nP)
+        return state_vector, backprop_nonlinearity
 
-        return best, backprop
 
 cdef void sum_state_features(float* output,
         const float* cached, const int* token_ids, int B, int F, int O) nogil:
     cdef int idx, b, f, i
     cdef const float* feature
+    padding = cached - (F * O)
     for b in range(B):
         for f in range(F):
             if token_ids[f] < 0:
-                continue
+                feature = &padding[f*O]
+            else:
                 idx = token_ids[f] * F * O + f*O
                 feature = &cached[idx]
             for i in range(O):
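To make the padding handling above concrete: the lower model now keeps a learned pad row immediately in front of the cached activations, and sum_state_features reads that row whenever a feature slot has no token (id < 0) instead of skipping it. A rough NumPy equivalent, with assumed shapes and names:

import numpy as np

def sum_state_features(cached, pad, token_ids):
    # cached: (n_tokens, nF, nO * nP) precomputed per-token activations
    # pad:    (nF, nO * nP) learned padding row, used for empty feature slots
    # token_ids: (n_states, nF) token indices per state, -1 where a slot is empty
    n_states, nF = token_ids.shape
    output = np.zeros((n_states, cached.shape[2]), dtype=cached.dtype)
    for b in range(n_states):
        for f in range(nF):
            idx = token_ids[b, f]
            output[b] += pad[f] if idx < 0 else cached[idx, f]
    return output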
@@ -220,13 +229,9 @@ cdef class Parser:
             raise ValueError("Currently parser depth is hard-coded to 1.")
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
-        if parser_maxout_pieces != 2:
-            raise ValueError("Currently parser_maxout_pieces is hard-coded "
-                             "to 2")
         token_vector_width = util.env_opt('token_vector_width',
                                           cfg.get('token_vector_width', 128))
-        hidden_width = util.env_opt('hidden_width',
-                                    cfg.get('hidden_width', 200))
+        hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@@ -237,9 +242,10 @@ cdef class Parser:
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_dims=cfg.get('pretrained_dims', 0))
         tok2vec = chain(tok2vec, flatten)
-        lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
-                                     nF=cls.nr_feature, nP=parser_maxout_pieces,
-                                     nI=token_vector_width)
+        lower = PrecomputableAffine(hidden_width,
+                                    nF=cls.nr_feature, nI=token_vector_width,
+                                    nP=parser_maxout_pieces)
+        lower.nP = parser_maxout_pieces
 
         with Model.use_device('cpu'):
             upper = chain(
@@ -391,19 +397,20 @@ cdef class Parser:
 
         hW = <float*>hidden_weights.data
         hb = <float*>hidden_bias.data
+        bias = <float*>state2vec.bias.data
         cdef int nr_hidden = hidden_weights.shape[0]
         cdef int nr_task = states.size()
         with nogil:
-            for i in cython.parallel.prange(nr_task, num_threads=2,
-                                            schedule='guided'):
+            for i in range(nr_task):
                 self._parseC(states[i],
-                    feat_weights, hW, hb,
+                    feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
         PyErr_CheckSignals()
         return state_objs
 
     cdef void _parseC(self, StateC* state,
-            const float* feat_weights, const float* hW, const float* hb,
+            const float* feat_weights, const float* bias,
+            const float* hW, const float* hb,
             int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
         token_ids = <int*>calloc(nr_feat, sizeof(int))
         is_valid = <int*>calloc(nr_class, sizeof(int))
@@ -413,17 +420,24 @@ cdef class Parser:
             with gil:
                 PyErr_SetFromErrno(MemoryError)
                 PyErr_CheckSignals()
+        cdef float feature
         while not state.is_final():
             state.set_context_tokens(token_ids, nr_feat)
             memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
             memset(scores, 0, nr_class * sizeof(float))
             sum_state_features(vectors,
                 feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
+            for i in range(nr_hidden * nr_piece):
+                vectors[i] += bias[i]
             V = vectors
             W = hW
             for i in range(nr_hidden):
+                if nr_piece == 1:
+                    feature = V[0] if V[0] >= 0. else 0.
+                elif nr_piece == 2:
                     feature = V[0] if V[0] >= V[1] else V[1]
+                else:
+                    feature = Vec.max(V, nr_piece)
                 for j in range(nr_class):
                     scores[j] += feature * W[j]
                 W += nr_class
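For readers following the C loop above: per state, the parser now adds the lower-layer bias explicitly, applies a ReLU when there is a single maxout piece and a max over pieces otherwise, and only then multiplies by the upper-layer weights. A NumPy sketch of that arithmetic, with assumed shapes (the addition of the upper bias hb to the scores is assumed to happen elsewhere in the loop):

import numpy as np

def state_scores(summed, bias, hW, hb, nr_hidden, nr_piece):
    # summed: (nr_hidden * nr_piece,) output of sum_state_features for one state
    # bias:   (nr_hidden * nr_piece,) lower-layer bias, now added explicitly
    # hW:     (nr_hidden, nr_class) upper-layer weights; hb: (nr_class,) upper bias
    V = (summed + bias).reshape((nr_hidden, nr_piece))
    if nr_piece == 1:
        hidden = np.maximum(V[:, 0], 0.)   # single piece: plain ReLU
    else:
        hidden = V.max(axis=1)             # otherwise: maxout over the pieces
    return hidden @ hW + hb                # hb assumed added to the scores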
@@ -644,9 +658,10 @@ cdef class Parser:
         xp = get_array_module(d_tokvecs)
         for ids, d_vector, bp_vector in backprops:
             d_state_features = bp_vector(d_vector, sgd=sgd)
-            mask = ids >= 0
-            d_state_features *= mask.reshape(ids.shape + (1,))
-            self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
+            ids = ids.flatten()
+            d_state_features = d_state_features.reshape(
+                (ids.size, d_state_features.shape[2]))
+            self.model[0].ops.scatter_add(d_tokvecs, ids,
                                           d_state_features)
         bp_tokvecs(d_tokvecs, sgd=sgd)
 
@@ -665,7 +680,7 @@ cdef class Parser:
             lower, stream, drop=0.0)
         return (tokvecs, bp_tokvecs), state2vec, upper
 
-    nr_feature = 8
+    nr_feature = 13
 
     def get_token_ids(self, states):
         cdef StateClass state
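Two details from the hunks above: nr_feature rises from 8 to 13 context tokens per state (matching the n == 13 branch touched in _state.pxd), and the backward pass now flattens the (state, feature) gradient block before scattering it back onto the token vectors. A NumPy stand-in for the ops.scatter_add call, with illustrative shapes and names:

import numpy as np

n_tokens, nr_feature, width = 50, 13, 128              # illustrative sizes
d_tokvecs = np.zeros((n_tokens, width), dtype='f')
ids = np.random.randint(0, n_tokens, size=(8, nr_feature))   # 8 states
d_state_features = np.random.randn(8, nr_feature, width).astype('f')

# Flatten to one row per (state, feature slot) pair, then accumulate:
# rows that share a token id are summed into that token's gradient.
flat_ids = ids.flatten()
flat_grads = d_state_features.reshape((flat_ids.size, width))
np.add.at(d_tokvecs, flat_ids, flat_grads)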
@@ -40,6 +40,8 @@ def parser(vocab):
 def test_init_parser(parser):
     pass
 
+# TODO: This is flakey, because it depends on what the parser first learns.
+@pytest.mark.xfail
 def test_add_label(parser):
     doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
     doc = parser(doc)