Merge pull request #10103 from svlandeg/refactor/parser-gpu

Consolidate parser refactor branches
Daniël de Kok 2022-01-21 18:15:18 +01:00 committed by GitHub
commit ec0cae9db8
26 changed files with 761 additions and 1289 deletions


@@ -30,12 +30,9 @@ MOD_NAMES = [
     "spacy.vocab",
     "spacy.attrs",
     "spacy.kb",
-    "spacy.ml.parser_model",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline.morphologizer",
     "spacy.pipeline.multitask",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -205,7 +202,11 @@ def setup_package():
     for name in MOD_NAMES:
         mod_path = name.replace(".", "/") + ".pyx"
         ext = Extension(
-            name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
+            name,
+            [mod_path],
+            language="c++",
+            include_dirs=include_dirs,
+            extra_compile_args=["-std=c++11"],
         )
         ext_modules.append(ext)
     print("Cythonizing sources")


@@ -81,12 +81,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -102,12 +101,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -259,12 +257,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -277,12 +274,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
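
The net effect on a user config is small: the architecture name moves from v2 to v3 and the use_upper switch disappears. A minimal, self-contained sketch that parses such a block with thinc's Config (the concrete values are illustrative, not mandated by this diff):

```python
from thinc.api import Config

# Minimal sketch (illustrative, not part of this diff): a parser model block as
# it looks after the change -- architecture bumped to v3, no use_upper setting.
cfg = Config().from_str("""
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
nO = null
""")
print(cfg["components"]["parser"]["model"]["@architectures"])
```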


@@ -1,158 +1,2 @@
-from thinc.api import Model, normal_init
+class PrecomputableAffine:
+    pass
from ..util import registry
@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
attrs={"dropout_rate": dropout},
)
return model
def forward(model, X, is_train):
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nO, nP), and get back:
# (nB, nO, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nO, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
# nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
dWopfi = dWopfi.transpose((2, 0, 1, 3))
model.inc_grad("W", dWopfi)
return dXf.reshape((dXf.shape[0], nF, nI))
return Yf, backward
def _backprop_precomputable_affine_padding(model, dY, ids):
nB = dY.shape[0]
nF = model.get_dim("nF")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
# Backprop the "padding", used as a filler for missing values.
# Values that are missing are set to -1, and each state vector could
# have multiple missing values. The padding has different values for
# different missing features. The gradient of the padding vector is:
#
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
# d_pad[f] += dY[b]
#
# Which can be rewritten as:
#
# (ids < 0).T @ dY
mask = model.ops.asarray(ids < 0, dtype="f")
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
return d_pad.reshape((1, nF, nO, nP))
def init(model, X=None, Y=None):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if model.has_param("W") and model.get_param("W").any():
return
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nO, nP, nI)
b = model.ops.alloc2f(nO, nP)
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], nO, nP))
vectors += b
vectors = model.ops.asarray(vectors)
if nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = model.get_param("W").copy()
b = model.get_param("b").copy()
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("b", b)
else:
break
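
The deleted init() above follows a "layer sequential unit variance" style scheme, as its docstring says: run the layer on randomly generated whitened data and iteratively rescale the weights and shift the bias until the outputs are roughly whitened too. A standalone NumPy sketch of that idea (my illustration, not the spaCy code):

```python
import numpy as np

# LSUV-style initialization sketch (illustration only, not the spaCy code):
# rescale W and shift b until outputs on random whitened inputs have roughly
# unit variance and zero mean, mirroring the tol_var / tol_mean / t_max loop above.
rng = np.random.default_rng(0)
nI, nO = 64, 32
W = rng.normal(0.0, 1.0, (nO, nI))
b = np.zeros(nO)
X = rng.normal(0.0, 1.0, (5000, nI))  # whitened stand-in for real inputs

tol_var, tol_mean, t_max = 0.01, 0.01, 10
for _ in range(t_max):
    Y = X @ W.T + b
    var, mean = Y.var(), Y.mean()
    if abs(var - 1.0) >= tol_var:
        W /= np.sqrt(var)
    elif abs(mean) >= tol_mean:
        b -= mean
    else:
        break
```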


@@ -1,23 +1,42 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
 from ...errors import Errors
 from ...compat import Literal
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
+from ...tokens.doc import Doc
+
+TransitionSystem = Any  # TODO
+State = Any  # TODO
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
-@registry.architectures("spacy.TransitionBasedParser.v2")
 def build_tb_parser_model(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
     hidden_width: int,
     maxout_pieces: int,
-    use_upper: bool,
     nO: Optional[int] = None,
 ) -> Model:
     """
@@ -51,14 +70,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -69,106 +81,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        cast(Model[List["Floats2d"], Floats2d], list2array()),
-        Linear(hidden_width, t2v_width),
-    )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
+    )
def _define_upper(nO, nI):
return Linear(nO=nO, nI=nI, init_W=zero_init)
def _define_lower(nO, nF, nI, nP):
return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
def resize_output(model, new_nO):
if model.attrs["has_upper"]:
return _resize_upper(model, new_nO)
return _resize_lower(model, new_nO)
def _resize_upper(model, new_nO):
upper = model.get_ref("upper")
if upper.has_dim("nO") is None:
upper.set_dim("nO", new_nO)
return model
elif new_nO == upper.get_dim("nO"):
return model
smaller = upper
nI = smaller.maybe_get_dim("nI")
with use_ops("cpu"):
larger = _define_upper(nO=new_nO, nI=nI)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc2f(new_nO, nI)
larger_b = larger.ops.alloc1f(new_nO)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:old_nO] = smaller_W
larger_b[:old_nO] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
model._layers[-1] = larger
model.set_ref("upper", larger)
return model
def _resize_lower(model, new_nO):
lower = model.get_ref("lower")
if lower.has_dim("nO") is None:
lower.set_dim("nO", new_nO)
return model
smaller = lower
nI = smaller.maybe_get_dim("nI")
nF = smaller.maybe_get_dim("nF")
nP = smaller.maybe_get_dim("nP")
larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
larger_b = larger.ops.alloc2f(new_nO, nP)
larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
smaller_pad = smaller.get_param("pad")
# Copy the old weights and padding into the new layer
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:, 0:old_nO, :, :] = smaller_W
larger_pad[:, :, 0:old_nO, :] = smaller_pad
larger_b[0:old_nO, :] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
larger.set_param("pad", larger_pad)
model._layers[1] = larger
model.set_ref("lower", larger)
return model
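
On this branch the new architecture is looked up from the registry under its new name; a small hedged sketch of that lookup (nothing here beyond what the registrations above imply):

```python
from spacy.util import registry

# Hedged sketch: resolve the renamed architecture from the registry. This only
# works on a build that includes this PR, since the v3 name is registered above.
build_parser_model = registry.architectures.get("spacy.TransitionBasedParser.v3")
print(build_parser_model.__name__)  # transition_parser_v3, per the registration above
```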


@@ -1,48 +0,0 @@
from libc.string cimport memset, memcpy
from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC
cdef struct SizesC:
int states
int classes
int hiddens
int pieces
int feats
int embed_width
cdef struct WeightsC:
const float* feat_weights
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const float* seen_classes
cdef struct ActivationsC:
int* token_ids
float* unmaxed
float* scores
float* hiddens
int* is_valid
int _curr_size
int _max_size
cdef WeightsC get_c_weights(model) except *
cdef SizesC get_c_sizes(model, int batch_size) except *
cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores, int O) nogil


@@ -1,489 +0,0 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.backends.linalg cimport Vec, VecVec
cimport blis.cy
import numpy
import numpy.random
from thinc.api import Model, CupyOps, NumpyOps
from .. import util
from ..typedefs cimport weight_t, class_t, hash_t
from ..pipeline._parser_internals.stateclass cimport StateClass
cdef WeightsC get_c_weights(model) except *:
cdef WeightsC output
cdef precompute_hiddens state2vec = model.state2vec
output.feat_weights = state2vec.get_feat_weights()
output.feat_bias = <const float*>state2vec.bias.data
cdef np.ndarray vec2scores_W
cdef np.ndarray vec2scores_b
if model.vec2scores is None:
output.hidden_weights = NULL
output.hidden_bias = NULL
else:
vec2scores_W = model.vec2scores.get_param("W")
vec2scores_b = model.vec2scores.get_param("b")
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray class_mask = model._class_mask
output.seen_classes = <const float*>class_mask.data
return output
cdef SizesC get_c_sizes(model, int batch_size) except *:
cdef SizesC output
output.states = batch_size
if model.vec2scores is None:
output.classes = model.state2vec.get_dim("nO")
else:
output.classes = model.vec2scores.get_dim("nO")
output.hiddens = model.state2vec.get_dim("nO")
output.pieces = model.state2vec.get_dim("nP")
output.feats = model.state2vec.get_dim("nF")
output.embed_width = model.tokvecs.shape[1]
return output
cdef ActivationsC alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
resize_activations(&A, n)
return A
cdef void free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.scores)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
cdef void predict_states(ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil:
cdef double one = 1.0
resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
sum_state_features(A.unmaxed,
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
for i in range(n.states):
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
W.feat_bias, 1., n.hiddens * n.pieces)
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
memset(A.scores, 0, n.states * n.classes * sizeof(float))
if W.hidden_weights == NULL:
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
n.states, n.classes, n.hiddens, one,
<float*>A.hiddens, n.hiddens, 1,
<float*>W.hidden_weights, n.hiddens, 1,
one,
<float*>A.scores, n.classes, 1)
# Add bias
for i in range(n.states):
VecVec.add_i(&A.scores[i*n.classes],
W.hidden_bias, 1., n.classes)
# Set unseen classes to minimum value
i = 0
min_ = A.scores[0]
for i in range(1, n.states * n.classes):
if A.scores[i] < min_:
min_ = A.scores[i]
for i in range(n.states):
for j in range(n.classes):
if not W.seen_classes[j]:
A.scores[i*n.classes+j] = min_
cdef void sum_state_features(float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
padding = cached
cached += F * O
cdef int id_stride = F*O
cdef float one = 1.
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
feature = &padding[f*O]
else:
idx = token_ids[f] * id_stride + f*O
feature = &cached[idx]
blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
<float*>feature, 1,
&output[b*O], 1)
token_ids += F
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores,
int O) nogil:
"""Do multi-label log loss"""
cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O)
guess = Vec.arg_max(scores, O)
if best == -1 or guess == -1:
# These shouldn't happen, but if they do, we want to make sure we don't
# cause an OOB access.
return
Z = 1e-10
gZ = 1e-10
max_ = scores[guess]
gmax = scores[best]
for i in range(O):
Z += exp(scores[i] - max_)
if costs[i] <= costs[best]:
gZ += exp(scores[i] - gmax)
for i in range(O):
if costs[i] <= costs[best]:
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
else:
d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const int* is_valid, int n) nogil:
# Find minimum cost
cdef float cost = 1
for i in range(n):
if is_valid[i] and costs[i] < cost:
cost = costs[i]
# Now find best-scoring with that cost
cdef int best = -1
for i in range(n):
if costs[i] <= cost and is_valid[i]:
if best == -1 or scores[i] > scores[best]:
best = i
return best
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
class ParserStepModel(Model):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
dropout=0.1):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
elif has_upper:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, train=train)
if has_upper:
self.vec2scores = layers[-1]
else:
self.vec2scores = None
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
self.backprops = []
self._class_mask = numpy.zeros((self.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
self._class_mask[class_] = 0.
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
@property
def nO(self):
if self.attrs["has_upper"]:
return self.vec2scores.get_dim("nO")
else:
return self.state2vec.get_dim("nO")
def class_is_unseen(self, class_):
return self._class_mask[class_]
def mark_class_unseen(self, class_):
self._class_mask[class_] = 0
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
def get_token_ids(self, states):
cdef StateClass state
states = [state for state in states if not state.is_final()]
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
dtype='i', order='C')
ids.fill(-1)
c_ids = <int*>ids.data
for state in states:
state.c.set_context_tokens(c_ids, ids.shape[1])
c_ids += ids.shape[1]
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores: d_scores
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
def backprop_parser_step(d_scores):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if mask is not None:
d_vector *= mask
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
return None
return scores, backprop_parser_step
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents,
and compute vectors for each (token, position) pair. These vectors can then
be reused, especially for beam-search.
Let's say we're using 12 features for each state, e.g. word at start of
buffer, three words on stack, their children, etc. In the normal arc-eager
system, a document of length N is processed in 2*N states. This means we'll
create 2*N*12 feature vectors --- but if we pre-compute, we only need
N*12 vector computations. The saving for beam-search is much better:
if we have a beam of k, we'll normally make 2*N*12*K computations --
so we can save the factor k. This also gives a nice CPU/GPU division:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
cdef readonly int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef public object numpy_ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
cdef object _cuda_stream
cdef object _bp_hiddens
cdef object activation
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
activation="maxout", train=False):
gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
# cupy make the copy asynchronously.
# We then have to block before first use.
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
else:
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
if lower_model.has_dim("nP"):
self.nP = lower_model.get_dim("nP")
else:
self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.numpy_ops = NumpyOps()
assert activation in (None, "relu", "maxout")
self.activation = activation
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
def has_dim(self, name):
if name == "nF":
return self.nF if self.nF is not None else True
elif name == "nP":
return self.nP if self.nP is not None else True
elif name == "nO":
return self.nO if self.nO is not None else True
else:
return False
def get_dim(self, name):
if name == "nF":
return self.nF
elif name == "nP":
return self.nP
elif name == "nO":
return self.nO
else:
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
def set_dim(self, name, value):
if name == "nF":
self.nF = value
elif name == "nP":
self.nP = value
elif name == "nO":
self.nO = value
else:
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
def __call__(self, X, bint is_train):
if is_train:
return self.begin_update(X)
else:
return self.predict(X), lambda X: X
def predict(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming GPU available);
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
bp_hiddens = self._bp_hiddens
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(<float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
d_state_vector = bp_nonlinearity(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
def _nonlinearity(self, state_vector):
if self.activation == "maxout":
return self._maxout_nonlinearity(state_vector)
else:
return self._relu_nonlinearity(state_vector)
def _maxout_nonlinearity(self, state_vector):
state_vector, mask = self.numpy_ops.maxout(state_vector)
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_maxout(d_best):
return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_maxout
def _relu_nonlinearity(self, state_vector):
state_vector = state_vector.reshape((state_vector.shape[0], -1))
mask = state_vector >= 0.
state_vector *= mask
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_relu(d_best):
d_best *= mask
return d_best.reshape((d_best.shape + (1,)))
return state_vector, backprop_relu
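
For orientation, the cpu_log_loss routine being deleted computes the gradient of a "multi-label" log loss: a softmax over all scores minus a softmax restricted to the zero-cost (gold) actions. A NumPy sketch of the same computation (my illustration only; the is_valid masking is omitted for brevity):

```python
import numpy as np

# NumPy sketch of the multi-label log loss gradient from cpu_log_loss above
# (illustration only; the is_valid masking is omitted for brevity).
def multilabel_log_loss_grad(scores: np.ndarray, costs: np.ndarray) -> np.ndarray:
    gold_mask = costs <= costs.min()          # zero-cost actions count as gold
    p = np.exp(scores - scores.max())
    p /= p.sum()                              # softmax over all actions
    gp = np.where(gold_mask, np.exp(scores - scores[gold_mask].max()), 0.0)
    gp /= gp.sum()                            # softmax restricted to gold actions
    return p - gp

scores = np.array([2.0, 1.0, 0.5, -1.0])
costs = np.array([0.0, 0.0, 1.0, 2.0])        # two actions are equally gold
print(multilabel_log_loss_grad(scores, costs))
```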


@@ -1,50 +1,438 @@
-from thinc.api import Model, noop
-from .parser_model import ParserStepModel
+from typing import List, Tuple, Any, Optional, cast
+from thinc.api import Ops, Model, normal_init, chain, list2array, Linear
+from thinc.api import uniform_init, glorot_uniform_init, zero_init
+from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d
+import numpy
+from ..tokens.doc import Doc
 from ..util import registry
 
-@registry.layers("spacy.TransitionModel.v1")
+TransitionSystem = Any  # TODO
+State = Any  # TODO
+
+
+@registry.layers("spacy.TransitionModel.v2")
 def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
+    *,
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+    unseen_classes=set(),
+) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
+    """Set up a transition-based parsing model, using a maxout hidden
+    layer and a linear output layer.
+    """
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))  # type: ignore
+    tok2vec_projected.set_dim("nO", hidden_width)
     return Model(
         name="parser_model",
         forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
         init=init,
+        layers=[tok2vec_projected],
+        refs={"tok2vec": tok2vec_projected},
+        params={
+            "lower_W": None,  # Floats2d W for the hidden layer
+            "lower_b": None,  # Floats1d bias for the hidden layer
+            "lower_pad": None,  # Floats1d padding for the hidden layer
+            "upper_W": None,  # Floats2d W for the output layer
+            "upper_b": None,  # Floats1d bias for the output layer
+        },
+        dims={
+            "nO": None,  # Output size
+            "nP": maxout_pieces,
+            "nH": hidden_width,
+            "nI": tok2vec_projected.maybe_get_dim("nO"),
+            "nF": state_tokens,
+        },
         attrs={
-            "has_upper": has_upper,
             "unseen_classes": set(unseen_classes),
             "resize_output": resize_output,
         },
     )
 
 
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
+def resize_output(model: Model, new_nO: int) -> Model:
+    old_nO = model.maybe_get_dim("nO")
+    if old_nO is None:
+        model.set_dim("nO", new_nO)
+        return model
+    elif new_nO <= old_nO:
+        return model
elif model.has_param("upper_W"):
nH = model.get_dim("nH")
new_W = model.ops.alloc2f(new_nO, nH)
new_b = model.ops.alloc1f(new_nO)
old_W = model.get_param("upper_W")
old_b = model.get_param("upper_b")
new_W[:old_nO] = old_W # type: ignore
new_b[:old_nO] = old_b # type: ignore
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
model.set_param("upper_W", new_W)
model.set_param("upper_b", new_b)
# TODO: Avoid this private intrusion
model._dims["nO"] = new_nO
if model.has_grad("upper_W"):
model.set_grad("upper_W", model.get_param("upper_W") * 0)
if model.has_grad("upper_b"):
model.set_grad("upper_b", model.get_param("upper_b") * 0)
return model
def init(
model,
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
if X is not None:
docs, moves = X
model.get_ref("tok2vec").initialize(X=docs)
else:
model.get_ref("tok2vec").initialize()
inferred_nO = _infer_nO(Y)
if inferred_nO is not None:
current_nO = model.maybe_get_dim("nO")
if current_nO is None:
model.set_dim("nO", inferred_nO)
elif current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nH = model.get_dim("nH")
nI = model.get_dim("nI")
nF = model.get_dim("nF")
ops = model.ops
Wl = ops.alloc2f(nH * nP, nF * nI)
bl = ops.alloc1f(nH * nP)
padl = ops.alloc1f(nI)
Wu = ops.alloc2f(nO, nH)
bu = ops.alloc1f(nO)
Wu = zero_init(ops, Wu.shape)
# Wl = zero_init(ops, Wl.shape)
Wl = glorot_uniform_init(ops, Wl.shape)
padl = uniform_init(ops, padl.shape) # type: ignore
# TODO: Experiment with whether better to initialize upper_W
model.set_param("lower_W", Wl)
model.set_param("lower_b", bl)
model.set_param("lower_pad", padl)
model.set_param("upper_W", Wu)
model.set_param("upper_b", bu)
# model = _lsuv_init(model)
return model
def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool):
nF = model.get_dim("nF")
tok2vec = model.get_ref("tok2vec")
lower_pad = model.get_param("lower_pad")
lower_W = model.get_param("lower_W")
lower_b = model.get_param("lower_b")
upper_W = model.get_param("upper_W")
upper_b = model.get_param("upper_b")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
nI = model.get_dim("nI")
ops = model.ops
docs, moves = docs_moves
states = moves.init_batch(docs)
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
all_ids = []
all_which = []
all_statevecs = []
all_scores = []
next_states = [s for s in states if not s.is_final()]
unseen_mask = _get_unseen_mask(model)
ids = numpy.zeros((len(states), nF), dtype="i")
arange = model.ops.xp.arange(nF)
while next_states:
ids = ids[: len(next_states)]
for i, state in enumerate(next_states):
state.set_context_tokens(ids, i, nF)
# Sum the state features, add the bias and apply the activation (maxout)
# to create the state vectors.
preacts2f = feats[ids, arange].sum(axis=1) # type: ignore
preacts2f += lower_b
preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
assert preacts.shape[0] == len(next_states), preacts.shape
statevecs, which = ops.maxout(preacts)
# Multiply the state-vector by the scores weights and add the bias,
# to get the logits.
scores = model.ops.gemm(statevecs, upper_W, trans2=True)
scores += upper_b
scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores)
# Transition the states, filtering out any that are finished.
next_states = moves.transition_states(next_states, scores)
all_scores.append(scores)
if is_train:
# Remember intermediate results for the backprop.
all_ids.append(ids.copy())
all_statevecs.append(statevecs)
all_which.append(which)
def backprop_parser(d_states_d_scores):
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
ids = model.ops.xp.vstack(all_ids)
which = ops.xp.vstack(all_which)
statevecs = model.ops.xp.vstack(all_statevecs)
_, d_scores = d_states_d_scores
if model.attrs.get("unseen_classes"):
# If we have a negative gradient (i.e. the probability should
# increase) on any classes we filtered out as unseen, mark
# them as seen.
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= unseen_mask
# Calculate the gradients for the parameters of the upper layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
model.inc_grad("upper_b", d_scores.sum(axis=0))
model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True))
# Now calculate d_statevecs, by backproping through the upper linear layer.
# This gemm is (nS, nO) @ (nO, nH)
d_statevecs = model.ops.gemm(d_scores, upper_W)
# Backprop through the maxout activation
d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
# We don't need to backprop the summation, because we pass back the IDs instead
d_state_features = backprop_feats((d_preacts2f, ids))
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
model.ops.scatter_add(d_tokvecs, ids, d_state_features)
model.inc_grad("lower_pad", d_tokvecs[-1])
return (backprop_tok2vec(d_tokvecs[:-1]), None)
return (states, all_scores), backprop_parser
def _forward_reference(
model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool
):
"""Slow reference implementation, without the precomputation"""
nF = model.get_dim("nF")
tok2vec = model.get_ref("tok2vec")
lower_pad = model.get_param("lower_pad")
lower_W = model.get_param("lower_W")
lower_b = model.get_param("lower_b")
upper_W = model.get_param("upper_W")
upper_b = model.get_param("upper_b")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
nI = model.get_dim("nI")
ops = model.ops
docs, moves = docs_moves
states = moves.init_batch(docs)
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
all_ids = []
all_which = []
all_statevecs = []
all_scores = []
all_tokfeats = []
next_states = [s for s in states if not s.is_final()]
unseen_mask = _get_unseen_mask(model)
ids = numpy.zeros((len(states), nF), dtype="i")
while next_states:
ids = ids[: len(next_states)]
for i, state in enumerate(next_states):
state.set_context_tokens(ids, i, nF)
# Sum the state features, add the bias and apply the activation (maxout)
# to create the state vectors.
tokfeats3f = tokvecs[ids]
tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1)
preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True)
preacts2f += lower_b
preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
statevecs, which = ops.maxout(preacts)
# Multiply the state-vector by the scores weights and add the bias,
# to get the logits.
scores = model.ops.gemm(statevecs, upper_W, trans2=True)
scores += upper_b
scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores)
# Transition the states, filtering out any that are finished.
next_states = moves.transition_states(next_states, scores)
all_scores.append(scores)
if is_train:
# Remember intermediate results for the backprop.
all_tokfeats.append(tokfeats)
all_ids.append(ids.copy())
all_statevecs.append(statevecs)
all_which.append(which)
nS = sum(len(s.history) for s in states)
def backprop_parser(d_states_d_scores):
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
ids = model.ops.xp.vstack(all_ids)
which = ops.xp.vstack(all_which)
statevecs = model.ops.xp.vstack(all_statevecs)
tokfeats = model.ops.xp.vstack(all_tokfeats)
_, d_scores = d_states_d_scores
if model.attrs.get("unseen_classes"):
# If we have a negative gradient (i.e. the probability should
# increase) on any classes we filtered out as unseen, mark
# them as seen.
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= unseen_mask
assert statevecs.shape == (nS, nH), statevecs.shape
assert d_scores.shape == (nS, nO), d_scores.shape
# Calculate the gradients for the parameters of the upper layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
model.inc_grad("upper_b", d_scores.sum(axis=0))
model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True))
# Now calculate d_statevecs, by backproping through the upper linear layer.
# This gemm is (nS, nO) @ (nO, nH)
d_statevecs = model.ops.gemm(d_scores, upper_W)
# Backprop through the maxout activation
d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
# Now increment the gradients for the lower layer.
# The gemm here is (nS, nH*nP) @ (nS, nF*nI)
model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
# Calculate d_tokfeats
# The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI)
d_tokfeats = model.ops.gemm(d_preacts2f, lower_W)
# Get the gradients of the tokvecs and the padding
d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI)
model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f)
model.inc_grad("lower_pad", d_tokvecs[-1])
return (backprop_tok2vec(d_tokvecs[:-1]), None)
return (states, all_scores), backprop_parser
def _get_unseen_mask(model: Model) -> Floats1d:
mask = model.ops.alloc1f(model.get_dim("nO"))
mask.fill(1)
for class_ in model.attrs.get("unseen_classes", set()):
mask[class_] = 0
return mask
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
W: Floats2d = model.get_param("lower_W")
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
# The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
W3f = W3f.transpose((1, 0, 2))
W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
assert X.shape == (X.shape[0], nI), X.shape
Yf_ = model.ops.gemm(X, W2f, trans2=True)
Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
def backward(dY_ids: Tuple[Floats3d, Ints2d]):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nH, nP), and get back:
# (nB, nH, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nH, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
dXf = model.ops.gemm(dY, W)
Xf = X[ids].reshape((ids.shape[0], -1))
dW = model.ops.gemm(dY, Xf, trans1=True)
model.inc_grad("lower_W", dW)
return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
return Yf, backward
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
if Y is None:
return None
_, scores = Y
if len(scores) == 0:
return None
assert scores[0].shape[0] >= 1
assert len(scores[0].shape) == 2
return scores[0].shape[1]
def _lsuv_init(model: Model):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
W = model.maybe_get_param("lower_W")
if W is not None and W.any():
return
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nH, nP, nI)
b = model.ops.alloc2f(nH, nP)
pad = model.ops.alloc4f(1, nF, nH, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc_f((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc_f((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
+    )
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
+
+    def predict(ids, tokvecs):
+        # nS ids. nW tokvecs. Exclude the padding array.
+        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
+        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
+        # need nS vectors
+        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
+        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
+        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
+        vectors3f += b
+        return model.ops.maxout(vectors3f)[0]
+
+    tol_var = 0.01
+    tol_mean = 0.01
+    t_max = 10
+    W = cast(Floats4d, model.get_param("lower_W").copy())
+    b = cast(Floats2d, model.get_param("lower_b").copy())
+    for t_i in range(t_max):
+        acts1 = predict(ids, tokvecs)
+        var = model.ops.xp.var(acts1)
+        mean = model.ops.xp.mean(acts1)
+        if abs(var - 1.0) >= tol_var:
+            W /= model.ops.xp.sqrt(var)
+            model.set_param("lower_W", W)
+        elif abs(mean) >= tol_mean:
+            b -= mean
+            model.set_param("lower_b", b)
+        else:
+            break
+    return model
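
The core trick in _forward_precomputable_affine above is that the hidden-layer gemm runs once per token rather than once per state: every token vector is multiplied by the reshaped weights up front, and each state's pre-activation is then just a sum of nF precomputed blocks selected by its token ids. A NumPy sketch that checks the equivalence (illustration only; the dimensions are arbitrary):

```python
import numpy as np

# NumPy sketch of the precomputation trick (illustration only): precompute one
# (nH * nP)-sized block per (token, feature-slot) pair, then build each state's
# pre-activation by gathering and summing blocks instead of re-running the gemm.
rng = np.random.default_rng(0)
nF, nI, nH, nP = 3, 8, 4, 2
n_tokens, n_states = 10, 5
W = rng.normal(size=(nH * nP, nF * nI))            # plays the role of lower_W
tokvecs = rng.normal(size=(n_tokens, nI))
ids = rng.integers(0, n_tokens, size=(n_states, nF))

# Precompute: an (nF * nH * nP, nI) view of W, applied to every token once.
W2f = W.reshape(nH * nP, nF, nI).transpose(1, 0, 2).reshape(nF * nH * nP, nI)
feats = (tokvecs @ W2f.T).reshape(n_tokens, nF, nH * nP)

# Gather-and-sum per state ...
pre_fast = feats[ids, np.arange(nF)].sum(axis=1)
# ... equals the direct affine over the concatenated feature vectors.
pre_slow = tokvecs[ids].reshape(n_states, nF * nI) @ W.T
assert np.allclose(pre_fast, pre_slow)
```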


@@ -33,6 +33,7 @@ cdef cppclass StateC:
     vector[ArcC] _left_arcs
     vector[ArcC] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
@@ -387,3 +388,4 @@ cdef cppclass StateC:
         this._b_i = src._b_i
         this.offset = src.offset
         this._empty_token = src._empty_token
+        this.history = src.history


@@ -772,6 +772,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -857,6 +859,7 @@ cdef class ArcEager(TransitionSystem):
                    state.print_state()
                )))
            action.do(state.c, action.label)
+           state.c.history.push_back(i)
            break
        else:
            failed = False
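
The new guard in has_gold matters because a negative end means something different to Python slicing than "no bound"; a tiny illustration:

```python
# Why the `end < 0` normalization above matters (illustration only): a negative
# slice end trims tokens from the back, while end=None means "to the end".
words = ["a", "b", "c", "d"]
print(words[0:-1])    # ['a', 'b', 'c']       -- silently drops the last token
print(words[0:None])  # ['a', 'b', 'c', 'd']  -- the behaviour has_gold wants
```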


@@ -307,6 +307,8 @@ cdef class BiluoPushDown(TransitionSystem):
         for span in eg.y.spans.get(neg_key, []):
             if span.start >= start and span.end <= end:
                 return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True
@@ -387,9 +389,9 @@ cdef class Begin:
         elif st.B_(1).ent_iob == 3:
             # If the next word is B, we can't B now
             return False
-        elif st.B_(1).sent_start == 1:
-            # Don't allow entities to extend across sentence boundaries
-            return False
+        #elif st.B_(1).sent_start == 1:
+        #    # Don't allow entities to extend across sentence boundaries
+        #    return False
         # Don't allow entities to start on whitespace
         elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
             return False
@@ -465,9 +467,9 @@ cdef class In:
             # Otherwise, force acceptance, even if we're across a sentence
             # boundary or the token is whitespace.
             return True
-        elif st.B(1) != -1 and st.B_(1).sent_start == 1:
-            # Don't allow entities to extend across sentence boundaries
-            return False
+        #elif st.B(1) != -1 and st.B_(1).sent_start == 1:
+        #    # Don't allow entities to extend across sentence boundaries
+        #    return False
         else:
             return True


@@ -20,6 +20,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -176,3 +180,6 @@ cdef class StateClass:
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)


@@ -1,6 +1,8 @@
 # cython: infer_types=True
 from __future__ import print_function
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 import srsly
@@ -73,7 +75,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -85,6 +98,8 @@ cdef class TransitionSystem:
         return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -110,6 +125,7 @@ cdef class TransitionSystem:
                     "S0 head?", str(state.has_head(state.S(0))),
                 )))
             action.do(state.c, action.label)
+            state.c.history.push_back(i)
             break
         else:
             if _debug:
@@ -137,6 +153,17 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +277,31 @@ cdef class TransitionSystem:
             self.cfg.update(msg['cfg'])
             self.initialize_actions(labels)
         return self
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want to infinite loop. So, force to end state.
states[i].force_final()
else:
action = moves.c[guess]
action.do(states[i], action.label)
states[i].history.push_back(guess)
free(is_valid)
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
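
transition_states and c_transition_batch above pick, for each state, the best-scoring action that is still valid, apply it, and record it in the state's history so that a sequence can later be replayed with follow_history. A pure-Python sketch of that loop, using hypothetical stand-in helpers rather than the real Cython API:

```python
import numpy as np

# Pure-Python sketch of the batched transition step above (illustration only;
# `moves.get_valid_mask`, `moves.get_action` and the state methods here are
# hypothetical stand-ins, not the real Cython API).
def transition_batch(moves, states, scores: np.ndarray):
    for state, row in zip(states, scores):
        valid = moves.get_valid_mask(state)            # bool array, one per action
        masked = np.where(valid, row, -np.inf)
        guess = int(masked.argmax())
        if not np.isfinite(masked[guess]):
            state.force_final()                        # nothing valid: end the state
        else:
            action = moves.get_action(guess)
            action.do(state, action.label)
            state.history.append(guess)                # remember the chosen action
    return [s for s in states if not s.is_final()]
```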


@@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
-from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from .functions import merge_subtokens
 from ..language import Language
@@ -17,12 +17,11 @@ from ..util import registry
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -122,6 +121,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -227,6 +227,7 @@ def parser_score(examples, **kwargs):
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -234,8 +235,11 @@ def parser_score(examples, **kwargs):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -248,11 +252,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -272,8 +277,7 @@ cdef class DependencyParser(Parser):
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,


@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser from .transition_parser import Parser
from ._parser_internals.ner cimport BiluoPushDown from ._parser_internals.ner import BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import get_ner_prf, PRFScore from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry from ..util import registry
default_model_config = """ default_model_config = """
[model] [model]
@architectures = "spacy.TransitionBasedParser.v2" @architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner" state_type = "ner"
extra_state_tokens = false extra_state_tokens = false
hidden_width = 64 hidden_width = 64
maxout_pieces = 2 maxout_pieces = 2
use_upper = true
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2" @architectures = "spacy.HashEmbedCNN.v2"
@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"incorrect_spans_key": None, "incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"}, "scorer": {"@scorers": "spacy.ner_scorer.v1"},
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
) )
def make_ner( def make_ner(
nlp: Language, nlp: Language,
@ -98,6 +102,7 @@ def make_ner(
scorer=scorer, scorer=scorer,
) )
@Language.factory( @Language.factory(
"beam_ner", "beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"], assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@ -111,7 +116,12 @@ def make_ner(
"incorrect_spans_key": None, "incorrect_spans_key": None,
"scorer": None, "scorer": None,
}, },
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None}, default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
) )
def make_beam_ner( def make_beam_ner(
nlp: Language, nlp: Language,
@ -185,11 +195,12 @@ def make_ner_scorer():
return ner_score return ner_score
cdef class EntityRecognizer(Parser): class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition. """Pipeline component for named entity recognition.
DOCS: https://spacy.io/api/entityrecognizer DOCS: https://spacy.io/api/entityrecognizer
""" """
TransitionSystem = BiluoPushDown TransitionSystem = BiluoPushDown
def __init__( def __init__(
@ -207,8 +218,7 @@ cdef class EntityRecognizer(Parser):
incorrect_spans_key=None, incorrect_spans_key=None,
scorer=ner_score, scorer=ner_score,
): ):
"""Create an EntityRecognizer. """Create an EntityRecognizer."""
"""
super().__init__( super().__init__(
vocab, vocab,
model, model,
@ -242,8 +252,11 @@ cdef class EntityRecognizer(Parser):
def labels(self): def labels(self):
# Get the labels from the model by looking at the available moves, e.g. # Get the labels from the model by looking at the available moves, e.g.
# B-PERSON, I-PERSON, L-PERSON, U-PERSON # B-PERSON, I-PERSON, L-PERSON, U-PERSON
labels = set(move.split("-")[1] for move in self.move_names labels = set(
if move[0] in ("B", "I", "L", "U")) move.split("-")[1]
for move in self.move_names
if move[0] in ("B", "I", "L", "U")
)
return tuple(sorted(labels)) return tuple(sorted(labels))
def scored_ents(self, beams): def scored_ents(self, beams):
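
As a small illustration of the reformatted `labels` property in the hunk above, BILOU move names (made up here) collapse to their entity labels:

```python
# Hypothetical move names as produced by Parser.move_names.
move_names = ["B-PERSON", "I-PERSON", "L-PERSON", "U-GPE", "O"]
labels = set(
    move.split("-")[1]
    for move in move_names
    if move[0] in ("B", "I", "L", "U")
)
assert tuple(sorted(labels)) == ("GPE", "PERSON")
```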

View File

@ -1,19 +0,0 @@
from cymem.cymem cimport Pool
from ..vocab cimport Vocab
from .trainable_pipe cimport TrainablePipe
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ._parser_internals._state cimport StateC
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
cdef public object _multitasks
cdef void _parseC(self, StateC** states,
WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

View File

@ -7,30 +7,29 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free from libc.stdlib cimport calloc, free
import random import random
import contextlib
import srsly import srsly
from thinc.api import set_dropout_rate, CupyOps from thinc.api import set_dropout_rate, CupyOps, get_array_module
from thinc.extra.search cimport Beam from thinc.extra.search cimport Beam
import numpy.random import numpy.random
import numpy import numpy
import warnings import warnings
from ._parser_internals.stateclass cimport StateClass from ._parser_internals.stateclass cimport StateClass
from ..ml.parser_model cimport alloc_activations, free_activations
from ..ml.parser_model cimport predict_states, arg_max_if_valid
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe from .trainable_pipe import TrainablePipe
from ._parser_internals cimport _beam_utils from ._parser_internals cimport _beam_utils
from ._parser_internals import _beam_utils from ._parser_internals import _beam_utils
from ..vocab cimport Vocab
from ._parser_internals.transition_system cimport TransitionSystem
from ..training import validate_examples, validate_get_examples from ..training import validate_examples, validate_get_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .. import util from .. import util
cdef class Parser(TrainablePipe): class Parser(TrainablePipe):
""" """
Base class of the DependencyParser and EntityRecognizer. Base class of the DependencyParser and EntityRecognizer.
""" """
@ -129,8 +128,9 @@ cdef class Parser(TrainablePipe):
@property @property
def move_names(self): def move_names(self):
names = [] names = []
cdef TransitionSystem moves = self.moves
for i in range(self.moves.n_moves): for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label) name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
# Explicitly removing the internal "U-" token used for blocking entities # Explicitly removing the internal "U-" token used for blocking entities
if name != "U-": if name != "U-":
names.append(name) names.append(name)
@ -219,9 +219,6 @@ cdef class Parser(TrainablePipe):
stream: The sequence of documents to process. stream: The sequence of documents to process.
batch_size (int): Number of documents to accumulate into a working set. batch_size (int): Number of documents to accumulate into a working set.
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
deals with a failing batch of documents. The default function just reraises
the exception.
YIELDS (Doc): Documents, in order. YIELDS (Doc): Documents, in order.
""" """
@ -243,79 +240,27 @@ cdef class Parser(TrainablePipe):
def predict(self, docs): def predict(self, docs):
if isinstance(docs, Doc): if isinstance(docs, Doc):
docs = [docs] docs = [docs]
self._ensure_labels_are_added(docs)
if not any(len(doc) for doc in docs): if not any(len(doc) for doc in docs):
result = self.moves.init_batch(docs) result = self.moves.init_batch(docs)
return result return result
if self.cfg["beam_width"] == 1: with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
return self.greedy_parse(docs, drop=0.0) states_or_beams, _ = self.model.predict((docs, self.moves))
else: return states_or_beams
return self.beam_parse(
docs,
drop=0.0,
beam_width=self.cfg["beam_width"],
beam_density=self.cfg["beam_density"]
)
def greedy_parse(self, docs, drop=0.): def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states # TODO: Deprecated
cdef StateClass state self._resize()
self._ensure_labels_are_added(docs) with _change_attrs(self.model, beam_width=1):
set_dropout_rate(self.model, drop) states, _ = self.model.predict((docs, self.moves))
batch = self.moves.init_batch(docs) return states
model = self.model.predict(docs)
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
states.push_back(state.c)
sizes = get_c_sizes(model, states.size())
with nogil:
self._parseC(&states[0],
weights, sizes)
model.clear_memory()
del model
return batch
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.): def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
cdef Beam beam # TODO: Deprecated
cdef Doc doc self._resize()
self._ensure_labels_are_added(docs) with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
batch = _beam_utils.BeamBatch( beams, _ = self.model.predict((docs, self.moves))
self.moves, return beams
self.moves.init_batch(docs),
None,
beam_width,
density=beam_density
)
model = self.model.predict(docs)
while not batch.is_done:
states = batch.get_unfinished_states()
if not states:
break
scores = model.predict(states)
batch.advance(scores)
model.clear_memory()
del model
return list(batch)
cdef void _parseC(self, StateC** states,
WeightsC weights, SizesC sizes) nogil:
cdef int i, j
cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1:
predict_states(&activations,
states, &weights, sizes)
# Validate actions, argmax, take action.
self.c_transition_batch(states,
activations.scores, sizes.classes, sizes.states)
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
for i in range(unfinished.size()):
states[i] = unfinished[i]
sizes.states = unfinished.size()
unfinished.clear()
free_activations(&activations)
def set_annotations(self, docs, states_or_beams): def set_annotations(self, docs, states_or_beams):
cdef StateClass state cdef StateClass state
@ -327,35 +272,6 @@ cdef class Parser(TrainablePipe):
for hook in self.postprocesses: for hook in self.postprocesses:
hook(doc) hook(doc)
def transition_states(self, states, float[:, ::1] scores):
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
cdef vector[StateC*] c_states
for state in states:
c_states.push_back(state.c)
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
self.moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want to infinite loop. So, force to end state.
states[i].force_final()
else:
action = self.moves.c[guess]
action.do(states[i], action.label)
free(is_valid)
def update(self, examples, *, drop=0., sgd=None, losses=None): def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state cdef StateClass state
if losses is None: if losses is None:
@ -367,166 +283,88 @@ cdef class Parser(TrainablePipe):
) )
for multitask in self._multitasks: for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd) multitask.update(examples, drop=drop, sgd=sgd)
# We need to take care to act on the whole batch, because we might be
# getting vectors via a listener.
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)]) n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
if n_examples == 0: if n_examples == 0:
return losses return losses
set_dropout_rate(self.model, drop) set_dropout_rate(self.model, drop)
# The probability we use beam update, instead of falling back to docs = [eg.x for eg in examples if len(eg.x)]
# a greedy update (states, scores), backprop_scores = self.model.begin_update((docs, self.moves))
beam_update_prob = self.cfg["beam_update_prob"] if sum(s.shape[0] for s in scores) == 0:
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
return self.update_beam(
examples,
beam_width=self.cfg["beam_width"],
sgd=sgd,
losses=losses,
beam_density=self.cfg["beam_density"]
)
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
states, golds, _ = self._init_gold_batch(
examples,
max_length=max_moves
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses return losses
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples]) d_scores = self.get_loss((states, scores), examples)
backprop_scores((states, d_scores))
all_states = list(states)
states_golds = list(zip(states, golds))
n_moves = 0
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
# Note that the gradient isn't normalized by the batch size
# here, because our "samples" are really the states...But we
# can't normalize by the number of states either, as then we'd
# be getting smaller gradients for states in long sequences.
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
if max_moves >= 1 and n_moves >= max_moves:
break
n_moves += 1
backprop_tok2vec(golds)
if sgd not in (None, False): if sgd not in (None, False):
self.finish_update(sgd) self.finish_update(sgd)
losses[self.name] += (d_scores**2).sum()
# Ugh, this is annoying. If we're working on GPU, we want to free the # Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to # memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing. # removing these in time if we don't explicitly delete? It's confusing.
del backprop del backprop_scores
del backprop_tok2vec
model.clear_memory()
del model
return losses return losses
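
The reworked `update` above now hands scoring and backprop to the model in a single `begin_update((docs, self.moves))` call, but the component is still driven the same way from the outside. A minimal sketch using the public training API, with toy data and a plain English pipeline (assumptions, not part of this commit):

```python
import random
from spacy.lang.en import English
from spacy.training import Example

# Toy data in the usual spaCy format; offsets refer to "Shaka Khan".
TRAIN_DATA = [("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]})]

nlp = English()
ner = nlp.add_pipe("ner")  # EntityRecognizer, built on the Parser base class
ner.add_label("PERSON")
train_examples = [
    Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA
]
optimizer = nlp.initialize(get_examples=lambda: train_examples)
for _ in range(10):
    random.shuffle(train_examples)
    losses = {}
    nlp.update(train_examples, sgd=optimizer, losses=losses)
print(losses["ner"])
```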
def get_loss(self, states_scores, examples):
states, scores = states_scores
scores = self.model.ops.xp.vstack(scores)
costs = self._get_costs_from_histories(
examples,
[list(state.history) for state in states]
)
xp = get_array_module(scores)
best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
min_score = scores.min() - 1000
assert costs.shape == scores.shape, (costs.shape, scores.shape)
gscores[costs > best_costs] = min_score
max_ = scores.max(axis=1, keepdims=True)
gmax = gscores.max(axis=1, keepdims=True)
exp_scores = xp.exp(scores - max_)
exp_gscores = xp.exp(gscores - gmax)
Z = exp_scores.sum(axis=1, keepdims=True)
gZ = exp_gscores.sum(axis=1, keepdims=True)
d_scores = exp_scores / Z
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
return d_scores
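
The gradient computed in `get_loss` above is a softmax over all action scores minus a softmax restricted to the zero-cost ("gold") actions. A standalone NumPy sketch with made-up numbers, mirroring the lines above:

```python
import numpy as np

scores = np.array([[2.0, 1.0, 0.5]], dtype="f")
costs = np.array([[0.0, 1.0, 0.0]], dtype="f")   # actions 0 and 2 are gold

best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
gscores[costs > best_costs] = scores.min() - 1000  # mask out non-gold actions

exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
exp_gscores = np.exp(gscores - gscores.max(axis=1, keepdims=True))
d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)
d_scores -= (costs <= best_costs) * (
    exp_gscores / exp_gscores.sum(axis=1, keepdims=True)
)
# Each row sums to ~0: probability mass is pushed from non-gold onto gold actions.
```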
def _get_costs_from_histories(self, examples, histories):
cdef TransitionSystem moves = self.moves
cdef StateClass state
cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool()
is_valid = <int*>mem.alloc(nO, sizeof(int))
c_costs = <float*>mem.alloc(nO, sizeof(float))
states = moves.init_batch([eg.x for eg in examples])
batch = []
for eg, s, h in zip(examples, states, histories):
if not s.is_final():
gold = moves.init_gold(s, eg)
batch.append((eg, s, h, gold))
output = []
while batch:
costs = numpy.zeros((len(batch), nO), dtype="f")
for i, (eg, state, history, gold) in enumerate(batch):
clas = history.pop(0)
moves.set_costs(is_valid, c_costs, state.c, gold)
action = moves.c[clas]
action.do(state.c, action.label)
state.c.history.push_back(clas)
for j in range(nO):
costs[i, j] = c_costs[j]
output.append(costs)
batch = [(eg, s, h, g) for eg, s, h, g in batch if len(h) != 0]
return self.model.ops.xp.vstack(output)
def rehearse(self, examples, sgd=None, losses=None, **cfg): def rehearse(self, examples, sgd=None, losses=None, **cfg):
"""Perform a "rehearsal" update, to prevent catastrophic forgetting.""" """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
if losses is None: raise NotImplementedError
losses = {}
for multitask in self._multitasks:
if hasattr(multitask, 'rehearse'):
multitask.rehearse(examples, losses=losses, sgd=sgd)
if self._rehearsal_model is None:
return None
losses.setdefault(self.name, 0.)
validate_examples(examples, "Parser.rehearse")
docs = [eg.predicted for eg in examples]
states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self._resize()
# Prepare the stepwise model, and get the callback for finishing the batch
set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs)
model, backprop_tok2vec = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
targets, _ = tutor.begin_update(states)
guesses, backprop = model.begin_update(states)
d_scores = (guesses - targets) / targets.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
loss += (d_scores**2).sum()
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, guesses)
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
backprop_tok2vec(docs)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss / n_scores
del backprop
del backprop_tok2vec
model.clear_memory()
tutor.clear_memory()
del model
del tutor
return losses
def update_beam(self, examples, *, beam_width, def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0): drop=0., sgd=None, losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples) raise NotImplementedError
if not states:
return losses
# Prepare the stepwise model, and get the callback for finishing the batch
model, backprop_tok2vec = self.model.begin_update(
[eg.predicted for eg in examples])
loss = _beam_utils.update_beam(
self.moves,
states,
golds,
model,
beam_width,
beam_density=beam_density,
)
losses[self.name] += loss
backprop_tok2vec(golds)
if sgd is not None:
self.finish_update(sgd)
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
cdef StateClass state
cdef Pool mem = Pool()
cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
dtype='f', order='C')
c_d_scores = <float*>d_scores.data
unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)):
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
memset(costs, 0, self.moves.n_moves * sizeof(float))
self.moves.set_costs(is_valid, costs, state.c, gold)
for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in unseen_classes:
unseen_classes.remove(j)
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
return d_scores
def set_output(self, nO): def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO) self.model.attrs["resize_output"](self.model, nO)
@ -565,7 +403,7 @@ cdef class Parser(TrainablePipe):
for example in islice(get_examples(), 10): for example in islice(get_examples(), 10):
doc_sample.append(example.predicted) doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name) assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample) self.model.initialize((doc_sample, self.moves))
if nlp is not None: if nlp is not None:
self.init_multitask_objectives(get_examples, nlp.pipeline) self.init_multitask_objectives(get_examples, nlp.pipeline)
@ -622,44 +460,18 @@ cdef class Parser(TrainablePipe):
raise ValueError(Errors.E149) from None raise ValueError(Errors.E149) from None
return self return self
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition @contextlib.contextmanager
sequence or a cap. A long def _change_attrs(model, **kwargs):
doc will get multiple states. Let's say we have a doc of length 2*N, """Temporarily modify a thinc model's attributes."""
where N is the shortest doc. We'll make two states, one representing unset = object()
long_doc[:N], and another representing long_doc[N:].""" old_attrs = {}
cdef: for key, value in kwargs.items():
StateClass start_state old_attrs[key] = model.attrs.get(key, unset)
StateClass state model.attrs[key] = value
Transition action yield model
all_states = self.moves.init_batch([eg.predicted for eg in examples]) for key, value in old_attrs.items():
states = [] if value is unset:
golds = [] model.attrs.pop(key)
to_cut = []
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
if len(eg.x) < max_length:
states.append(state)
golds.append(gold)
else: else:
oracle_actions = self.moves.get_oracle_sequence_from_state( model.attrs[key] = value
state.copy(), gold)
to_cut.append((eg, state, gold, oracle_actions))
if not to_cut:
return states, golds, 0
cdef int clas
for eg, state, gold, oracle_actions in to_cut:
for i in range(0, len(oracle_actions), max_length):
start_state = state.copy()
for clas in oracle_actions[i:i+max_length]:
action = self.moves.c[clas]
action.do(state.c, action.label)
if state.is_final():
break
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
states.append(start_state)
golds.append(gold)
if state.is_final():
break
return states, golds, max_length
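
A minimal usage sketch for the `_change_attrs` helper introduced above (the toy Thinc model is only for illustration, and the helper is assumed to be in scope):

```python
from thinc.api import Model

# Hypothetical no-op model just to exercise the context manager.
model = Model("toy", lambda model, X, is_train: (X, lambda dY: dY))
model.attrs["beam_width"] = 1

with _change_attrs(model, beam_width=8, beam_density=0.01):
    assert model.attrs["beam_width"] == 8        # overridden inside the block
    assert model.attrs["beam_density"] == 0.01   # added inside the block
assert model.attrs["beam_width"] == 1            # original value restored
assert "beam_density" not in model.attrs         # temporary attribute removed
```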

View File

@ -123,6 +123,7 @@ def test_ner_labels_added_implicitly_on_predict():
assert "D" in ner.labels assert "D" in ner.labels
@pytest.mark.skip(reason="Not yet supported")
def test_ner_labels_added_implicitly_on_beam_parse(): def test_ner_labels_added_implicitly_on_beam_parse():
nlp = Language() nlp = Language()
ner = nlp.add_pipe("beam_ner") ner = nlp.add_pipe("beam_ner")
@ -134,6 +135,7 @@ def test_ner_labels_added_implicitly_on_beam_parse():
assert "D" in ner.labels assert "D" in ner.labels
@pytest.mark.skip(reason="greedy_parse is deprecated")
def test_ner_labels_added_implicitly_on_greedy_parse(): def test_ner_labels_added_implicitly_on_greedy_parse():
nlp = Language() nlp = Language()
ner = nlp.add_pipe("beam_ner") ner = nlp.add_pipe("beam_ner")

View File

@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo from spacy.training import Example, iob_to_biluo
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from thinc.api import fix_random_seed
import logging import logging
from ..util import make_tempdir from ..util import make_tempdir
@ -180,6 +181,7 @@ def test_issue4267():
assert token.ent_iob == 2 assert token.ent_iob == 2
@pytest.mark.xfail(reason="no beam parser yet")
@pytest.mark.issue(4313) @pytest.mark.issue(4313)
def test_issue4313(): def test_issue4313():
"""This should not crash or exit with some strange error code""" """This should not crash or exit with some strange error code"""
@ -391,7 +393,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True) ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON") ner.add_label("PERSON")
nlp.initialize() nlp.initialize(get_examples=lambda: train_examples)
for itn in range(2): for itn in range(2):
losses = {} losses = {}
batches = util.minibatch(train_examples, size=8) batches = util.minibatch(train_examples, size=8)
@ -518,11 +520,11 @@ def test_block_ner():
assert [token.ent_type_ for token in doc] == expected_types assert [token.ent_type_ for token in doc] == expected_types
@pytest.mark.parametrize("use_upper", [True, False]) def test_overfitting_IO():
def test_overfitting_IO(use_upper): fix_random_seed(1)
# Simple test to try and quickly overfit the NER component # Simple test to try and quickly overfit the NER component
nlp = English() nlp = English()
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}}) ner = nlp.add_pipe("ner", config={"model": {}})
train_examples = [] train_examples = []
for text, annotations in TRAIN_DATA: for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -533,7 +535,7 @@ def test_overfitting_IO(use_upper):
for i in range(50): for i in range(50):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.00001 assert losses["ner"] < 0.001
# test the trained model # test the trained model
test_text = "I like London." test_text = "I like London."
@ -554,7 +556,6 @@ def test_overfitting_IO(use_upper):
assert ents2[0].label_ == "LOC" assert ents2[0].label_ == "LOC"
# Ensure that the predictions are still the same, even after adding a new label # Ensure that the predictions are still the same, even after adding a new label
ner2 = nlp2.get_pipe("ner") ner2 = nlp2.get_pipe("ner")
assert ner2.model.attrs["has_upper"] == use_upper
ner2.add_label("RANDOM_NEW_LABEL") ner2.add_label("RANDOM_NEW_LABEL")
doc3 = nlp2(test_text) doc3 = nlp2(test_text)
ents3 = doc3.ents ents3 = doc3.ents
@ -596,6 +597,7 @@ def test_overfitting_IO(use_upper):
assert ents[1].kb_id == 0 assert ents[1].kb_id == 0
@pytest.mark.xfail(reason="no beam parser yet")
def test_beam_ner_scores(): def test_beam_ner_scores():
# Test that we can get confidence values out of the beam_ner pipe # Test that we can get confidence values out of the beam_ner pipe
beam_width = 16 beam_width = 16
@ -631,6 +633,7 @@ def test_beam_ner_scores():
assert 0 - eps <= score <= 1 + eps assert 0 - eps <= score <= 1 + eps
@pytest.mark.xfail(reason="no beam parser yet")
def test_beam_overfitting_IO(neg_key): def test_beam_overfitting_IO(neg_key):
# Simple test to try and quickly overfit the Beam NER component # Simple test to try and quickly overfit the Beam NER component
nlp = English() nlp = English()

View File

@ -118,6 +118,7 @@ def test_beam_advance_too_few_scores(beam, scores):
beam.advance(scores[:-1]) beam.advance(scores[:-1])
@pytest.mark.xfail(reason="no beam parser yet")
def test_beam_parse(examples, beam_width): def test_beam_parse(examples, beam_width):
nlp = Language() nlp = Language()
parser = nlp.add_pipe("beam_parser") parser = nlp.add_pipe("beam_parser")
@ -128,6 +129,7 @@ def test_beam_parse(examples, beam_width):
parser(doc) parser(doc)
@pytest.mark.xfail(reason="no beam parser yet")
@hypothesis.given(hyp=hypothesis.strategies.data()) @hypothesis.given(hyp=hypothesis.strategies.data())
def test_beam_density(moves, examples, beam_width, hyp): def test_beam_density(moves, examples, beam_width, hyp):
beam_density = float(hyp.draw(hypothesis.strategies.floats(0.0, 1.0, width=32))) beam_density = float(hyp.draw(hypothesis.strategies.floats(0.0, 1.0, width=32)))

View File

@ -5,9 +5,11 @@ from thinc.api import Adam
from spacy import registry, util from spacy import registry, util
from spacy.attrs import DEP, NORM from spacy.attrs import DEP, NORM
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example from spacy.training import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import util, registry
from thinc.api import fix_random_seed
from ...pipeline import DependencyParser from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
@ -58,6 +60,8 @@ PARTIAL_DATA = [
), ),
] ]
PARSERS = ["parser"] # TODO: Test beam_parser when ready
eps = 0.1 eps = 0.1
@ -318,7 +322,7 @@ def test_parser_constructor(en_vocab):
DependencyParser(en_vocab, model) DependencyParser(en_vocab, model)
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) @pytest.mark.parametrize("pipe_name", PARSERS)
def test_incomplete_data(pipe_name): def test_incomplete_data(pipe_name):
# Test that the parser works with incomplete information # Test that the parser works with incomplete information
nlp = English() nlp = English()
@ -344,8 +348,9 @@ def test_incomplete_data(pipe_name):
assert doc[2].head.i == 1 assert doc[2].head.i == 1
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"]) @pytest.mark.parametrize("pipe_name", PARSERS)
def test_overfitting_IO(pipe_name): def test_overfitting_IO(pipe_name):
fix_random_seed(0)
# Simple test to try and quickly overfit the dependency parser (normal or beam) # Simple test to try and quickly overfit the dependency parser (normal or beam)
nlp = English() nlp = English()
parser = nlp.add_pipe(pipe_name) parser = nlp.add_pipe(pipe_name)
@ -354,6 +359,7 @@ def test_overfitting_IO(pipe_name):
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for dep in annotations.get("deps", []): for dep in annotations.get("deps", []):
parser.add_label(dep) parser.add_label(dep)
# train_examples = train_examples[:1]
optimizer = nlp.initialize() optimizer = nlp.initialize()
# run overfitting # run overfitting
for i in range(200): for i in range(200):
@ -395,6 +401,7 @@ def test_overfitting_IO(pipe_name):
assert_equal(batch_deps_1, no_batch_deps) assert_equal(batch_deps_1, no_batch_deps)
@pytest.mark.xfail(reason="no beam parser yet")
def test_beam_parser_scores(): def test_beam_parser_scores():
# Test that we can get confidence values out of the beam_parser pipe # Test that we can get confidence values out of the beam_parser pipe
beam_width = 16 beam_width = 16
@ -433,6 +440,7 @@ def test_beam_parser_scores():
assert 0 - eps <= head_score <= 1 + eps assert 0 - eps <= head_score <= 1 + eps
@pytest.mark.xfail(reason="no beam parser yet")
def test_beam_overfitting_IO(): def test_beam_overfitting_IO():
# Simple test to try and quickly overfit the Beam dependency parser # Simple test to try and quickly overfit the Beam dependency parser
nlp = English() nlp = English()

View File

@ -255,7 +255,7 @@ cfg_string_multi = """
factory = "ner" factory = "ner"
[components.ner.model] [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2" @architectures = "spacy.TransitionBasedParser.v3"
[components.ner.model.tok2vec] [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"

View File

@ -122,33 +122,11 @@ width = ${components.tok2vec.model.width}
parser_config_string_upper = """ parser_config_string_upper = """
[model] [model]
@architectures = "spacy.TransitionBasedParser.v2" @architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser" state_type = "parser"
extra_state_tokens = false extra_state_tokens = false
hidden_width = 66 hidden_width = 66
maxout_pieces = 2 maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 333
depth = 4
embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
"""
parser_config_string_no_upper = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = false
[model.tok2vec] [model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1" @architectures = "spacy.HashEmbedCNN.v1"
@ -179,7 +157,6 @@ def my_parser():
extra_state_tokens=True, extra_state_tokens=True,
hidden_width=65, hidden_width=65,
maxout_pieces=5, maxout_pieces=5,
use_upper=True,
) )
return parser return parser
@ -285,15 +262,14 @@ def test_serialize_custom_nlp():
nlp.to_disk(d) nlp.to_disk(d)
nlp2 = spacy.load(d) nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec") assert model.get_ref("tok2vec") is not None
# check that we have the correct settings, not the default ones assert model.has_param("lower_W")
assert model.get_ref("upper").get_dim("nI") == 65 assert model.has_param("upper_W")
assert model.get_ref("lower").get_dim("nI") == 65 assert model.has_param("lower_b")
assert model.has_param("upper_b")
@pytest.mark.parametrize( @pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
def test_serialize_parser(parser_config_string): def test_serialize_parser(parser_config_string):
"""Create a non-default parser config to check nlp serializes it correctly""" """Create a non-default parser config to check nlp serializes it correctly"""
nlp = English() nlp = English()
@ -306,11 +282,11 @@ def test_serialize_parser(parser_config_string):
nlp.to_disk(d) nlp.to_disk(d)
nlp2 = spacy.load(d) nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec") assert model.get_ref("tok2vec") is not None
# check that we have the correct settings, not the default ones assert model.has_param("lower_W")
if model.attrs["has_upper"]: assert model.has_param("upper_W")
assert model.get_ref("upper").get_dim("nI") == 66 assert model.has_param("lower_b")
assert model.get_ref("lower").get_dim("nI") == 66 assert model.has_param("upper_b")
def test_config_nlp_roundtrip(): def test_config_nlp_roundtrip():
@ -457,9 +433,7 @@ def test_config_auto_fill_extra_fields():
load_model_from_config(nlp.config) load_model_from_config(nlp.config)
@pytest.mark.parametrize( @pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
def test_config_validate_literal(parser_config_string): def test_config_validate_literal(parser_config_string):
nlp = English() nlp = English()
config = Config().from_str(parser_config_string) config = Config().from_str(parser_config_string)

View File

@ -5,10 +5,7 @@ from pathlib import Path
from spacy.about import __version__ as spacy_version from spacy.about import __version__ as spacy_version
from spacy import util from spacy import util
from spacy import prefer_gpu, require_gpu, require_cpu from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int
from thinc.api import Config, Optimizer, ConfigValidationError from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import set_current_ops from thinc.api import set_current_ops
from spacy.training.batchers import minibatch_by_words from spacy.training.batchers import minibatch_by_words
@ -81,32 +78,33 @@ def test_util_get_package_path(package):
assert isinstance(path, Path) assert isinstance(path, Path)
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): # @pytest.mark.skip(reason="No precomputable affine")
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize() # def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
assert model.get_param("W").shape == (nF, nO, nP, nI) # model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
tensor = model.ops.alloc((10, nI)) # assert model.get_param("W").shape == (nF, nO, nP, nI)
Y, get_dX = model.begin_update(tensor) # tensor = model.ops.alloc((10, nI))
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP) # Y, get_dX = model.begin_update(tensor)
dY = model.ops.alloc((15, nO, nP)) # assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
ids = model.ops.alloc((15, nF)) # dY = model.ops.alloc((15, nO, nP))
ids[1, 2] = -1 # ids = model.ops.alloc((15, nF))
dY[1] = 1 # ids[1, 2] = -1
assert not model.has_grad("pad") # dY[1] = 1
d_pad = _backprop_precomputable_affine_padding(model, dY, ids) # assert not model.has_grad("pad")
assert d_pad[0, 2, 0, 0] == 1.0 # d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
ids.fill(0.0) # assert d_pad[0, 2, 0, 0] == 1.0
dY.fill(0.0) # ids.fill(0.0)
dY[0] = 0 # dY.fill(0.0)
ids[1, 2] = 0 # dY[0] = 0
ids[1, 1] = -1 # ids[1, 2] = 0
ids[1, 0] = -1 # ids[1, 1] = -1
dY[1] = 1 # ids[1, 0] = -1
ids[2, 0] = -1 # dY[1] = 1
dY[2] = 5 # ids[2, 0] = -1
d_pad = _backprop_precomputable_affine_padding(model, dY, ids) # dY[2] = 5
assert d_pad[0, 0, 0, 0] == 6 # d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 1, 0, 0] == 1 # assert d_pad[0, 0, 0, 0] == 6
assert d_pad[0, 2, 0, 0] == 0 # assert d_pad[0, 1, 0, 0] == 1
# assert d_pad[0, 2, 0, 0] == 0
def test_prefer_gpu(): def test_prefer_gpu():

View File

@ -1,5 +1,4 @@
from collections.abc import Iterable as IterableInstance from collections.abc import Iterable as IterableInstance
import warnings
import numpy import numpy
from murmurhash.mrmr cimport hash64 from murmurhash.mrmr cimport hash64

View File

@ -552,18 +552,17 @@ for a Tok2Vec layer.
## Parser & NER architectures {#parser} ## Parser & NER architectures {#parser}
### spacy.TransitionBasedParser.v2 {#TransitionBasedParser source="spacy/ml/models/parser.py"} ### spacy.TransitionBasedParser.v3 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
> #### Example Config > #### Example Config
> >
> ```ini > ```ini
> [model] > [model]
> @architectures = "spacy.TransitionBasedParser.v2" > @architectures = "spacy.TransitionBasedParser.v3"
> state_type = "ner" > state_type = "ner"
> extra_state_tokens = false > extra_state_tokens = false
> hidden_width = 64 > hidden_width = 64
> maxout_pieces = 2 > maxout_pieces = 2
> use_upper = true
> >
> [model.tok2vec] > [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2" > @architectures = "spacy.HashEmbedCNN.v2"
@ -594,13 +593,12 @@ consists of either two or three subnetworks:
as action scores directly. as action scores directly.
| Name | Description | | Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ | | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ | | `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ | | `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
| `hidden_width` | The width of the hidden layer. ~~int~~ | | `hidden_width` | The width of the hidden layer. ~~int~~ |
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ | | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ |
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
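
For reference, a `[model]` block like the one above can be resolved into a Thinc model directly from Python; a rough sketch (the tok2vec hyperparameters below are illustrative, not prescribed here, and the resolved model is uninitialized until the pipeline is initialized):

```python
from thinc.api import Config
from spacy import registry

cfg = """
[model]
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
pretrained_vectors = null
width = 96
depth = 4
embed_size = 2000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
# Resolve the config block into an (uninitialized) Thinc model.
model = registry.resolve(Config().from_str(cfg))["model"]
```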

View File

@ -141,7 +141,7 @@ factory = "tok2vec"
factory = "ner" factory = "ner"
[components.ner.model] [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1" @architectures = "spacy.TransitionBasedParser.v3"
[components.ner.model.tok2vec] [components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1" @architectures = "spacy.Tok2VecListener.v1"
@ -158,7 +158,7 @@ same. This makes them fully independent and doesn't require an upstream
factory = "ner" factory = "ner"
[components.ner.model] [components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1" @architectures = "spacy.TransitionBasedParser.v3"
[components.ner.model.tok2vec] [components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2" @architectures = "spacy.Tok2Vec.v2"
@ -482,7 +482,7 @@ sneakily delegates to the `Transformer` pipeline component.
factory = "ner" factory = "ner"
[nlp.pipeline.ner.model] [nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1" @architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner" state_type = "ner"
extra_state_tokens = false extra_state_tokens = false
hidden_width = 128 hidden_width = 128