Merge pull request #1467 from explosion/feature/better-parser

💫 Bug fixes to parser model (requires retraining)
This commit is contained in:
Matthew Honnibal 2017-10-29 02:05:22 +02:00 committed by GitHub
commit c43cc5361d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 173 additions and 161 deletions

View File

@ -3,7 +3,7 @@ pathlib
numpy>=1.7
cymem>=1.30,<1.32
preshed>=1.0.0,<2.0.0
thinc>=6.9.0,<6.10.0
thinc>=6.10.0,<6.11.0
murmurhash>=0.28,<0.29
plac<1.0.0,>=0.9.6
six

View File

@ -61,7 +61,7 @@ LINK_OPTIONS = {
# I don't understand this very well yet. See Issue #267
# Fingers crossed!
USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None
USE_OPENMP_DEFAULT = '0' if sys.platform != 'darwin' else None
if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1':
if sys.platform == 'darwin':
COMPILE_OPTIONS['other'].append('-fopenmp')
@ -190,7 +190,7 @@ def setup_package():
'murmurhash>=0.28,<0.29',
'cymem>=1.30,<1.32',
'preshed>=1.0.0,<2.0.0',
'thinc>=6.9.0,<6.10.0',
'thinc>=6.10.0,<6.11.0',
'plac<1.0.0,>=0.9.6',
'six',
'pathlib',

View File

@ -13,12 +13,14 @@ from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module
from thinc.neural.util import get_array_module, copy_array
from thinc.neural._lsuv import svd_orthonormal
from thinc import describe
from thinc.describe import Dimension, Synapses, Biases, Gradient
from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp
from thinc.neural._lsuv import svd_orthonormal
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from . import util
@ -75,78 +77,25 @@ def _preprocess_doc(docs, drop=0.):
return (keys, vals, lengths), None
def _init_for_precomputed(W, ops):
if (W**2).sum() != 0.:
return
reshaped = W.reshape((W.shape[1], W.shape[0] * W.shape[2]))
ops.xavier_uniform_init(reshaped)
W[:] = reshaped.reshape(W.shape)
@describe.on_data(_set_dimensions_if_needed)
@describe.on_data(_set_dimensions_if_needed,
lambda model, X, y: model.init_weights(model))
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nO=Dimension("Output size"),
nP=Dimension("Maxout pieces"),
W=Synapses("Weights matrix",
lambda obj: (obj.nF, obj.nO, obj.nI),
lambda W, ops: _init_for_precomputed(W, ops)),
b=Biases("Bias vector",
lambda obj: (obj.nO,)),
d_W=Gradient("W"),
d_b=Gradient("b"))
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nI = nI
self.nF = nF
def begin_update(self, X, drop=0.):
# X: (b, i)
# Yf: (b, f, i)
# dY: (b, o)
# dYf: (b, f, o)
# Yf = numpy.einsum('bi,foi->bfo', X, self.W)
Yf = self.ops.xp.tensordot(
X, self.W, axes=[[1], [2]])
Yf += self.b
def backward(dY_ids, sgd=None):
tensordot = self.ops.xp.tensordot
dY, ids = dY_ids
Xf = X[ids]
# dXf = numpy.einsum('bo,foi->bfi', dY, self.W)
dXf = tensordot(dY, self.W, axes=[[1], [1]])
# dW = numpy.einsum('bo,bfi->ofi', dY, Xf)
dW = tensordot(dY, Xf, axes=[[0], [0]])
# ofi -> foi
self.d_W += dW.transpose((1, 0, 2))
self.d_b += dY.sum(axis=0)
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return Yf, backward
@describe.on_data(_set_dimensions_if_needed)
@describe.attributes(
nI=Dimension("Input size"),
nF=Dimension("Number of features"),
nP=Dimension("Number of pieces"),
nO=Dimension("Output size"),
W=Synapses("Weights matrix",
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI),
lambda W, ops: ops.xavier_uniform_init(W)),
lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
b=Biases("Bias vector",
lambda obj: (obj.nO, obj.nP)),
pad=Synapses("Pad",
lambda obj: (1, obj.nF, obj.nO, obj.nP),
lambda M, ops: ops.normal_init(M, 1.)),
d_W=Gradient("W"),
d_pad=Gradient("pad"),
d_b=Gradient("b"))
class PrecomputableMaxouts(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=3, **kwargs):
class PrecomputableAffine(Model):
def __init__(self, nO=None, nI=None, nF=None, nP=None, **kwargs):
Model.__init__(self, **kwargs)
self.nO = nO
self.nP = nP
@ -154,31 +103,96 @@ class PrecomputableMaxouts(Model):
self.nF = nF
def begin_update(self, X, drop=0.):
# X: (b, i)
# Yfp: (b, f, o, p)
# Xf: (f, b, i)
# dYp: (b, o, p)
# W: (f, o, p, i)
# b: (o, p)
# bi,opfi->bfop
# bop,fopi->bfi
# bop,fbi->opfi : fopi
tensordot = self.ops.xp.tensordot
Yfp = tensordot(X, self.W, axes=[[1], [3]])
Yfp += self.b
Yf = self.ops.xp.dot(X,
self.W.reshape((self.nF*self.nO*self.nP, self.nI)).T)
Yf = Yf.reshape((Yf.shape[0], self.nF, self.nO, self.nP))
Yf = self._add_padding(Yf)
def backward(dYp_ids, sgd=None):
dYp, ids = dYp_ids
def backward(dY_ids, sgd=None):
dY, ids = dY_ids
dY, ids = self._backprop_padding(dY, ids)
Xf = X[ids]
dXf = tensordot(dYp, self.W, axes=[[1, 2], [1, 2]])
dW = tensordot(dYp, Xf, axes=[[0], [0]])
self.d_W += dW.transpose((2, 0, 1, 3))
self.d_b += dYp.sum(axis=0)
Xf = Xf.reshape((Xf.shape[0], self.nF * self.nI))
self.d_b += dY.sum(axis=0)
dY = dY.reshape((dY.shape[0], self.nO*self.nP))
Wopfi = self.W.transpose((1, 2, 0, 3))
Wopfi = self.ops.xp.ascontiguousarray(Wopfi)
Wopfi = Wopfi.reshape((self.nO*self.nP, self.nF * self.nI))
dXf = self.ops.dot(dY.reshape((dY.shape[0], self.nO*self.nP)), Wopfi)
# Reuse the buffer
dWopfi = Wopfi; dWopfi.fill(0.)
self.ops.xp.dot(dY.T, Xf, out=dWopfi)
dWopfi = dWopfi.reshape((self.nO, self.nP, self.nF, self.nI))
# (o, p, f, i) --> (f, o, p, i)
self.d_W += dWopfi.transpose((2, 0, 1, 3))
if sgd is not None:
sgd(self._mem.weights, self._mem.gradient, key=self.id)
return dXf
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
return Yf, backward
return Yfp, backward
def _add_padding(self, Yf):
Yf_padded = self.ops.xp.vstack((self.pad, Yf))
return Yf_padded[1:]
def _backprop_padding(self, dY, ids):
for i in range(ids.shape[0]):
for j in range(ids.shape[1]):
if ids[i, j] < 0:
self.d_pad[0, j] += dY[i, j]
return dY, ids
@staticmethod
def init_weights(model):
'''This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
'''
if (model.W**2).sum() != 0.:
return
model.ops.normal_init(model.W, model.nF * model.nI, inplace=True)
ids = numpy.zeros((5000, model.nF), dtype='i')
ids += numpy.asarray(numpy.random.uniform(0, 1000, ids.shape), dtype='i')
tokvecs = numpy.zeros((5000, model.nI), dtype='f')
tokvecs += numpy.random.normal(loc=0., scale=1.,
size=tokvecs.size).reshape(tokvecs.shape)
def predict(ids, tokvecs):
# nS ids. nW tokvecs
hiddens = model(tokvecs) # (nW, f, o, p)
# need nS vectors
vectors = model.ops.allocate((ids.shape[0], model.nO, model.nP))
for i, feats in enumerate(ids):
for j, id_ in enumerate(feats):
vectors[i] += hiddens[id_, j]
vectors += model.b
if model.nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
t_i = 0
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = numpy.var(acts1)
mean = numpy.mean(acts1)
if abs(var - 1.0) >= tol_var:
model.W /= numpy.sqrt(var)
elif abs(mean) >= tol_mean:
model.b -= mean
else:
break
def link_vectors_to_models(vocab):
@ -228,9 +242,10 @@ def Tok2Vec(width, embed_size, **kwargs):
tok2vec = (
FeatureExtracter(cols)
>> with_flatten(
embed >> (convolution ** 4), pad=4)
embed
>> convolution ** 4, pad=4
)
)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
@ -265,34 +280,6 @@ def asarray(ops, dtype):
return layerize(forward)
def rebatch(size, layer):
ops = layer.ops
def forward(X, drop=0.):
if X.shape[0] < size:
return layer.begin_update(X)
parts = _divide_array(X, size)
results, bp_results = zip(*[layer.begin_update(p, drop=drop)
for p in parts])
y = ops.flatten(results)
def backward(dy, sgd=None):
d_parts = [bp(y, sgd=sgd) for bp, y in
zip(bp_results, _divide_array(dy, size))]
try:
dX = ops.flatten(d_parts)
except TypeError:
dX = None
except ValueError:
dX = None
return dX
return y, backward
model = layerize(forward)
model._layers.append(layer)
return model
def _divide_array(X, size):
parts = []
index = 0

View File

@ -1,8 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
import bz2
import gzip
try:
import bz2
import gzip
except ImportError:
pass
import math
from ast import literal_eval
from pathlib import Path

View File

@ -30,6 +30,10 @@ try:
except ImportError:
cupy = None
try:
from thinc.neural.optimizers import Optimizer
except ImportError:
from thinc.neural.optimizers import Adam as Optimizer
pickle = pickle
copy_reg = copy_reg

View File

@ -110,7 +110,7 @@ cdef cppclass StateC:
ids[3] = this.S(1)
ids[4] = this.H(this.S(0))
ids[5] = this.L(this.B(0), 1)
ids[6] = this.L(this.S(0), 2)
ids[6] = this.L(this.S(0), 1)
ids[7] = this.R(this.S(0), 1)
elif n == 13:
ids[0] = this.B(0)

View File

@ -16,5 +16,6 @@ cdef class Parser:
cdef public object _multitasks
cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* hW, const float* hb,
const float* feat_weights, const float* bias,
const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil

View File

@ -1,5 +1,4 @@
# cython: infer_types=True
# cython: profile=True
# cython: cdivision=True
# cython: boundscheck=False
# coding: utf-8
@ -27,8 +26,9 @@ from thinc.v2v import Model, Maxout, Affine
from thinc.misc import LayerNorm
from thinc.neural.ops import CupyOps
from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
from .._ml import zero_init, PrecomputableMaxouts, Tok2Vec, flatten
from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
from .._ml import link_vectors_to_models
from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
@ -74,6 +74,7 @@ cdef class precompute_hiddens:
cdef public object ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
cdef object _cuda_stream
cdef object _bp_hiddens
@ -89,9 +90,10 @@ cdef class precompute_hiddens:
else:
cached = gpu_cached
self.nF = cached.shape[1]
self.nO = cached.shape[2]
self.nP = getattr(lower_model, 'nP', 1)
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.bias = lower_model.b
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
@ -108,7 +110,7 @@ cdef class precompute_hiddens:
def begin_update(self, token_ids, drop=0.):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO*self.nP), dtype='f')
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
@ -119,15 +121,15 @@ cdef class precompute_hiddens:
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0, 0],
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector, sgd=None):
if bp_nonlinearity is not None:
d_state_vector = bp_nonlinearity(d_state_vector, sgd)
# This will usually be on GPU
if isinstance(d_state_vector, numpy.ndarray):
if not isinstance(d_state_vector, self.ops.xp.ndarray):
d_state_vector = self.ops.xp.array(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids), sgd)
return d_tokens
@ -135,25 +137,32 @@ cdef class precompute_hiddens:
def _nonlinearity(self, state_vector):
if self.nP == 1:
return state_vector, None
state_vector = state_vector.reshape(
(state_vector.shape[0], state_vector.shape[1]//self.nP, self.nP))
best, which = self.ops.maxout(state_vector)
state_vector = state_vector.reshape(state_vector.shape[:-1])
mask = state_vector >= 0.
state_vector *= mask
else:
state_vector, mask = self.ops.maxout(state_vector)
def backprop(d_best, sgd=None):
return self.ops.backprop_maxout(d_best, which, self.nP)
return best, backprop
def backprop_nonlinearity(d_best, sgd=None):
if self.nP == 1:
d_best *= mask
d_best = d_best.reshape((d_best.shape + (1,)))
return d_best
else:
return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_nonlinearity
cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
padding = cached - (F * O)
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
continue
feature = &padding[f*O]
else:
idx = token_ids[f] * F * O + f*O
feature = &cached[idx]
for i in range(O):
@ -220,13 +229,9 @@ cdef class Parser:
raise ValueError("Currently parser depth is hard-coded to 1.")
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
if parser_maxout_pieces != 2:
raise ValueError("Currently parser_maxout_pieces is hard-coded "
"to 2")
token_vector_width = util.env_opt('token_vector_width',
cfg.get('token_vector_width', 128))
hidden_width = util.env_opt('hidden_width',
cfg.get('hidden_width', 200))
hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 200))
embed_size = util.env_opt('embed_size', cfg.get('embed_size', 7000))
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
@ -237,9 +242,10 @@ cdef class Parser:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=cfg.get('pretrained_dims', 0))
tok2vec = chain(tok2vec, flatten)
lower = PrecomputableMaxouts(hidden_width if depth >= 1 else nr_class,
nF=cls.nr_feature, nP=parser_maxout_pieces,
nI=token_vector_width)
lower = PrecomputableAffine(hidden_width,
nF=cls.nr_feature, nI=token_vector_width,
nP=parser_maxout_pieces)
lower.nP = parser_maxout_pieces
with Model.use_device('cpu'):
upper = chain(
@ -391,19 +397,20 @@ cdef class Parser:
hW = <float*>hidden_weights.data
hb = <float*>hidden_bias.data
bias = <float*>state2vec.bias.data
cdef int nr_hidden = hidden_weights.shape[0]
cdef int nr_task = states.size()
with nogil:
for i in cython.parallel.prange(nr_task, num_threads=2,
schedule='guided'):
for i in range(nr_task):
self._parseC(states[i],
feat_weights, hW, hb,
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals()
return state_objs
cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* hW, const float* hb,
const float* feat_weights, const float* bias,
const float* hW, const float* hb,
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
token_ids = <int*>calloc(nr_feat, sizeof(int))
is_valid = <int*>calloc(nr_class, sizeof(int))
@ -413,17 +420,24 @@ cdef class Parser:
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
cdef float feature
while not state.is_final():
state.set_context_tokens(token_ids, nr_feat)
memset(vectors, 0, nr_hidden * nr_piece * sizeof(float))
memset(scores, 0, nr_class * sizeof(float))
sum_state_features(vectors,
feat_weights, token_ids, 1, nr_feat, nr_hidden * nr_piece)
for i in range(nr_hidden * nr_piece):
vectors[i] += bias[i]
V = vectors
W = hW
for i in range(nr_hidden):
if nr_piece == 1:
feature = V[0] if V[0] >= 0. else 0.
elif nr_piece == 2:
feature = V[0] if V[0] >= V[1] else V[1]
else:
feature = Vec.max(V, nr_piece)
for j in range(nr_class):
scores[j] += feature * W[j]
W += nr_class
@ -644,9 +658,10 @@ cdef class Parser:
xp = get_array_module(d_tokvecs)
for ids, d_vector, bp_vector in backprops:
d_state_features = bp_vector(d_vector, sgd=sgd)
mask = ids >= 0
d_state_features *= mask.reshape(ids.shape + (1,))
self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.model[0].ops.scatter_add(d_tokvecs, ids,
d_state_features)
bp_tokvecs(d_tokvecs, sgd=sgd)
@ -665,7 +680,7 @@ cdef class Parser:
lower, stream, drop=0.0)
return (tokvecs, bp_tokvecs), state2vec, upper
nr_feature = 8
nr_feature = 13
def get_token_ids(self, states):
cdef StateClass state

View File

@ -40,6 +40,8 @@ def parser(vocab):
def test_init_parser(parser):
pass
# TODO: This is flakey, because it depends on what the parser first learns.
@pytest.mark.xfail
def test_add_label(parser):
doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd'])
doc = parser(doc)