In order to support Python 3.13, we had to migrate to Cython 3.0. This caused some tricky interactions with our Pydantic usage, because Cython 3 uses the from __future__ import annotations semantics, which causes type annotations to be saved as strings. The end result is that we can't have Language.factory-decorated functions in Cython modules anymore, as the Language.factory decorator expects to inspect the signature of the functions and build a Pydantic model. If the function is implemented in Cython, an error is raised because the type is not resolved. To address this I've moved the factory functions into a new module, spacy.pipeline.factories. I've added __getattr__ importlib hooks to the previous locations, in case anyone was importing these functions directly. The change should have no backwards-compatibility implications.

Along the way I've also refactored the registration of functions for the config. Previously these ran as import-time side effects, using the registry decorator. Instead, I've created a new module, spacy.registrations. When the registry is accessed it calls a function ensure_populated(), which causes the registrations to occur. I've made a similar change to the Language.factory registrations in the new spacy.pipeline.factories module. I want to remove these import-time side effects so that we can speed up the loading time of the library, which can be especially painful on the CLI. I also often find myself tracking down the implementations of functions referenced by strings in the config. Having the registrations all happen in one place will make this easier.

With these changes I've fortunately avoided the need to migrate to Pydantic v2 properly --- we're still using the v1 compatibility shim. We might not be able to hold out forever though: Pydantic (reasonably) aren't actively supporting the v1 shims. I put a lot of work into v2 migration when investigating the 3.13 support, and it's definitely challenging. In any case, it's a relief that we don't have to do the v2 migration at the same time as the Cython 3.0/Python 3.13 support.
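As a rough illustration of the two patterns described above, here is a minimal, self-contained sketch of a lazily populated registry plus a module-level __getattr__ shim at the old import location. The module and function names used here (my_package, factories, make_tagger, ensure_populated, get) are placeholders for illustration, not the actual spaCy internals.

    # registrations.py (sketch) -- string-name registrations live in one place.
    _REGISTRY: dict = {}
    _POPULATED = False


    def ensure_populated() -> None:
        # Called whenever the registry is accessed, so registration happens on
        # first use instead of as an import-time side effect.
        global _POPULATED
        if _POPULATED:
            return
        _POPULATED = True
        from my_package.pipeline import factories  # placeholder module
        _REGISTRY["make_tagger"] = factories.make_tagger


    def get(name: str):
        # Look up a registered function by its config string name.
        ensure_populated()
        return _REGISTRY[name]


    # old_location.py (sketch) -- a module-level __getattr__ (PEP 562) keeps
    # direct imports of the moved functions working after they relocate.
    def __getattr__(name: str):
        if name == "make_tagger":
            from my_package.pipeline.factories import make_tagger
            return make_tagger
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this shape, importing either module stays cheap: the factories module is only imported the first time something actually looks up a registered name or accesses a moved attribute.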
534 lines · 18 KiB · Cython
# cython: infer_types=True, cdivision=True, boundscheck=False
# cython: profile=False
cimport numpy as np
from libc.math cimport exp
from libc.stdlib cimport calloc, free, realloc
from libc.string cimport memcpy, memset
from thinc.backends.cblas cimport saxpy, sgemm
from thinc.backends.linalg cimport Vec, VecVec

import numpy
import numpy.random
from thinc.api import CupyOps, Model, NumpyOps

from .. import util
from ..errors import Errors

from ..pipeline._parser_internals.stateclass cimport StateClass
from ..typedefs cimport weight_t


cdef WeightsC get_c_weights(model) except *:
    cdef WeightsC output
    cdef precompute_hiddens state2vec = model.state2vec
    output.feat_weights = state2vec.get_feat_weights()
    output.feat_bias = <const float*>state2vec.bias.data
    cdef np.ndarray vec2scores_W
    cdef np.ndarray vec2scores_b
    if model.vec2scores is None:
        output.hidden_weights = NULL
        output.hidden_bias = NULL
    else:
        vec2scores_W = model.vec2scores.get_param("W")
        vec2scores_b = model.vec2scores.get_param("b")
        output.hidden_weights = <const float*>vec2scores_W.data
        output.hidden_bias = <const float*>vec2scores_b.data
    cdef np.ndarray class_mask = model._class_mask
    output.seen_classes = <const float*>class_mask.data
    return output


cdef SizesC get_c_sizes(model, int batch_size) except *:
    cdef SizesC output
    output.states = batch_size
    if model.vec2scores is None:
        output.classes = model.state2vec.get_dim("nO")
    else:
        output.classes = model.vec2scores.get_dim("nO")
    output.hiddens = model.state2vec.get_dim("nO")
    output.pieces = model.state2vec.get_dim("nP")
    output.feats = model.state2vec.get_dim("nF")
    output.embed_width = model.tokvecs.shape[1]
    return output


cdef ActivationsC alloc_activations(SizesC n) noexcept nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    resize_activations(&A, n)
    return A


cdef void free_activations(const ActivationsC* A) noexcept nogil:
    free(A.token_ids)
    free(A.scores)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)


cdef void resize_activations(ActivationsC* A, SizesC n) noexcept nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(
            A.token_ids, n.states * n.feats * sizeof(A.token_ids[0])
        )
        A.scores = <float*>realloc(
            A.scores, n.states * n.classes * sizeof(A.scores[0])
        )
        A.unmaxed = <float*>realloc(
            A.unmaxed, n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0])
        )
        A.hiddens = <float*>realloc(
            A.hiddens, n.states * n.hiddens * sizeof(A.hiddens[0])
        )
        A.is_valid = <int*>realloc(
            A.is_valid, n.states * n.classes * sizeof(A.is_valid[0])
        )
        A._max_size = n.states
    A._curr_size = n.states


cdef void predict_states(
    CBlas cblas, ActivationsC* A, StateC** states, const WeightsC* W, SizesC n
) noexcept nogil:
    resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
    sum_state_features(
        cblas,
        A.unmaxed,
        W.feat_weights,
        A.token_ids,
        n.states,
        n.feats,
        n.hiddens * n.pieces
    )
    for i in range(n.states):
        VecVec.add_i(
            &A.unmaxed[i*n.hiddens*n.pieces],
            W.feat_bias, 1.,
            n.hiddens * n.pieces
        )
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = Vec.arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    memset(A.scores, 0, n.states * n.classes * sizeof(float))
    if W.hidden_weights == NULL:
        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(
            False, True, n.states, n.classes, n.hiddens,
            1.0, <const float *>A.hiddens, n.hiddens,
            <const float *>W.hidden_weights, n.hiddens,
            0.0, A.scores, n.classes
        )
        # Add bias
        for i in range(n.states):
            VecVec.add_i(&A.scores[i*n.classes], W.hidden_bias, 1., n.classes)
    # Set unseen classes to minimum value
    i = 0
    min_ = A.scores[0]
    for i in range(1, n.states * n.classes):
        if A.scores[i] < min_:
            min_ = A.scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if not W.seen_classes[j]:
                A.scores[i*n.classes+j] = min_


cdef void sum_state_features(
    CBlas cblas,
    float* output,
    const float* cached,
    const int* token_ids,
    int B,
    int F,
    int O
) noexcept nogil:
    cdef int idx, b, f
    cdef const float* feature
    padding = cached
    cached += F * O
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F


cdef void cpu_log_loss(
    float* d_scores,
    const float* costs,
    const int* is_valid,
    const float* scores,
    int O
) noexcept nogil:
    """Do multi-label log loss"""
    cdef double max_, gmax, Z, gZ
    best = arg_max_if_gold(scores, costs, is_valid, O)
    guess = Vec.arg_max(scores, O)
    if best == -1 or guess == -1:
        # These shouldn't happen, but if they do, we want to make sure we don't
        # cause an OOB access.
        return
    Z = 1e-10
    gZ = 1e-10
    max_ = scores[guess]
    gmax = scores[best]
    for i in range(O):
        Z += exp(scores[i] - max_)
        if costs[i] <= costs[best]:
            gZ += exp(scores[i] - gmax)
    for i in range(O):
        if costs[i] <= costs[best]:
            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
        else:
            d_scores[i] = exp(scores[i]-max_) / Z


cdef int arg_max_if_gold(
    const weight_t* scores, const weight_t* costs, const int* is_valid, int n
) noexcept nogil:
    # Find minimum cost
    cdef float cost = 1
    for i in range(n):
        if is_valid[i] and costs[i] < cost:
            cost = costs[i]
    # Now find best-scoring with that cost
    cdef int best = -1
    for i in range(n):
        if costs[i] <= cost and is_valid[i]:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) noexcept nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best


class ParserStepModel(Model):
    def __init__(
        self,
        docs,
        layers,
        *,
        has_upper,
        unseen_classes=None,
        train=True,
        dropout=0.1
    ):
        Model.__init__(self, name="parser_step_model", forward=step_forward)
        self.attrs["has_upper"] = has_upper
        self.attrs["dropout_rate"] = dropout
        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
        if layers[1].get_dim("nP") >= 2:
            activation = "maxout"
        elif has_upper:
            activation = None
        else:
            activation = "relu"
        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
                                            activation=activation, train=train)
        if has_upper:
            self.vec2scores = layers[-1]
        else:
            self.vec2scores = None
        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
        self.backprops = []
        self._class_mask = numpy.zeros((self.nO,), dtype='f')
        self._class_mask.fill(1)
        if unseen_classes is not None:
            for class_ in unseen_classes:
                self._class_mask[class_] = 0.

    def clear_memory(self):
        del self.tokvecs
        del self.bp_tokvecs
        del self.state2vec
        del self.backprops
        del self._class_mask

    @property
    def nO(self):
        if self.attrs["has_upper"]:
            return self.vec2scores.get_dim("nO")
        else:
            return self.state2vec.get_dim("nO")

    def class_is_unseen(self, class_):
        return self._class_mask[class_]

    def mark_class_unseen(self, class_):
        self._class_mask[class_] = 0

    def mark_class_seen(self, class_):
        self._class_mask[class_] = 1

    def get_token_ids(self, states):
        cdef StateClass state
        states = [state for state in states if not state.is_final()]
        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
                                          dtype='i', order='C')
        ids.fill(-1)
        c_ids = <int*>ids.data
        for state in states:
            state.c.set_context_tokens(c_ids, ids.shape[1])
            c_ids += ids.shape[1]
        return ids

    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
        if (
            isinstance(self.state2vec.ops, CupyOps)
            and not isinstance(token_ids, self.state2vec.ops.xp.ndarray)
        ):
            # Move token_ids and d_vector to GPU, asynchronously
            self.backprops.append((
                util.get_async(self.cuda_stream, token_ids),
                util.get_async(self.cuda_stream, d_vector),
                get_d_tokvecs
            ))
        else:
            self.backprops.append((token_ids, d_vector, get_d_tokvecs))

    def finish_steps(self, golds):
        # Add a padding vector to the d_tokvecs gradient, so that missing
        # values don't affect the real gradient.
        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
        # Tells CUDA to block, so our async copies complete.
        if self.cuda_stream is not None:
            self.cuda_stream.synchronize()
        for ids, d_vector, bp_vector in self.backprops:
            d_state_features = bp_vector((d_vector, ids))
            ids = ids.flatten()
            d_state_features = d_state_features.reshape(
                (ids.size, d_state_features.shape[2]))
            self.ops.scatter_add(d_tokvecs, ids, d_state_features)
        # Padded -- see update()
        self.bp_tokvecs(d_tokvecs[:-1])
        return d_tokvecs


NUMPY_OPS = NumpyOps()


def step_forward(model: ParserStepModel, states, is_train):
    token_ids = model.get_token_ids(states)
    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
    mask = None
    if model.attrs["has_upper"]:
        dropout_rate = model.attrs["dropout_rate"]
        if is_train and dropout_rate > 0:
            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
            vector *= mask
        scores, get_d_vector = model.vec2scores(vector, is_train)
    else:
        scores = NumpyOps().asarray(vector)
        get_d_vector = lambda d_scores: d_scores  # no-cython-lint: E731
    # If the class is unseen, make sure its score is minimum
    scores[:, model._class_mask == 0] = numpy.nanmin(scores)

    def backprop_parser_step(d_scores):
        # Zero vectors for unseen classes
        d_scores *= model._class_mask
        d_vector = get_d_vector(d_scores)
        if mask is not None:
            d_vector *= mask
        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
        return None
    return scores, backprop_parser_step


cdef class precompute_hiddens:
    """Allow a model to be "primed" by pre-computing input features in bulk.

    This is used for the parser, where we want to take a batch of documents,
    and compute vectors for each (token, position) pair. These vectors can then
    be reused, especially for beam-search.

    Let's say we're using 12 features for each state, e.g. word at start of
    buffer, three words on stack, their children, etc. In the normal arc-eager
    system, a document of length N is processed in 2*N states. This means we'll
    create 2*N*12 feature vectors --- but if we pre-compute, we only need
    N*12 vector computations. The saving for beam-search is much better:
    if we have a beam of k, we'll normally make 2*N*12*K computations --
    so we can save the factor k. This also gives a nice CPU/GPU division:
    we can do all our hard maths up front, packed into large multiplications,
    and do the hard-to-program parsing on the CPU.
    """
    cdef readonly int nF, nO, nP
    cdef bint _is_synchronized
    cdef public object ops
    cdef public object numpy_ops
    cdef np.ndarray _features
    cdef np.ndarray _cached
    cdef np.ndarray bias
    cdef object _cuda_stream
    cdef object _bp_hiddens
    cdef object activation

    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
                 activation="maxout", train=False):
        gpu_cached, bp_features = lower_model(tokvecs, train)
        cdef np.ndarray cached
        if not isinstance(gpu_cached, numpy.ndarray):
            # Note the passing of cuda_stream here: it lets
            # cupy make the copy asynchronously.
            # We then have to block before first use.
            cached = gpu_cached.get(stream=cuda_stream)
        else:
            cached = gpu_cached
        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
        else:
            self.bias = lower_model.get_param("b")
        self.nF = cached.shape[1]
        if lower_model.has_dim("nP"):
            self.nP = lower_model.get_dim("nP")
        else:
            self.nP = 1
        self.nO = cached.shape[2]
        self.ops = lower_model.ops
        self.numpy_ops = NumpyOps()
        assert activation in (None, "relu", "maxout")
        self.activation = activation
        self._is_synchronized = False
        self._cuda_stream = cuda_stream
        self._cached = cached
        self._bp_hiddens = bp_features

    cdef const float* get_feat_weights(self) except NULL:
        if not self._is_synchronized and self._cuda_stream is not None:
            self._cuda_stream.synchronize()
            self._is_synchronized = True
        return <float*>self._cached.data

    def has_dim(self, name):
        if name == "nF":
            return self.nF if self.nF is not None else True
        elif name == "nP":
            return self.nP if self.nP is not None else True
        elif name == "nO":
            return self.nO if self.nO is not None else True
        else:
            return False

    def get_dim(self, name):
        if name == "nF":
            return self.nF
        elif name == "nP":
            return self.nP
        elif name == "nO":
            return self.nO
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def set_dim(self, name, value):
        if name == "nF":
            self.nF = value
        elif name == "nP":
            self.nP = value
        elif name == "nO":
            self.nO = value
        else:
            raise ValueError(Errors.E1033.format(name=name))

    def __call__(self, X, bint is_train):
        if is_train:
            return self.begin_update(X)
        else:
            return self.predict(X), lambda X: X

    def predict(self, X):
        return self.begin_update(X)[0]

    def begin_update(self, token_ids):
        cdef np.ndarray state_vector = numpy.zeros(
            (token_ids.shape[0], self.nO, self.nP), dtype='f')
        # This is tricky, but (assuming GPU available):
        # - Input to forward on CPU
        # - Output from forward on CPU
        # - Input to backward on GPU!
        # - Output from backward on GPU
        bp_hiddens = self._bp_hiddens

        cdef CBlas cblas
        if isinstance(self.ops, CupyOps):
            cblas = NUMPY_OPS.cblas()
        else:
            cblas = self.ops.cblas()

        feat_weights = self.get_feat_weights()
        cdef int[:, ::1] ids = token_ids
        sum_state_features(
            cblas, <float*>state_vector.data,
            feat_weights, &ids[0, 0],
            token_ids.shape[0], self.nF, self.nO*self.nP
        )
        state_vector += self.bias
        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)

        def backward(d_state_vector_ids):
            d_state_vector, token_ids = d_state_vector_ids
            d_state_vector = bp_nonlinearity(d_state_vector)
            d_tokens = bp_hiddens((d_state_vector, token_ids))
            return d_tokens
        return state_vector, backward

    def _nonlinearity(self, state_vector):
        if self.activation == "maxout":
            return self._maxout_nonlinearity(state_vector)
        else:
            return self._relu_nonlinearity(state_vector)

    def _maxout_nonlinearity(self, state_vector):
        state_vector, mask = self.numpy_ops.maxout(state_vector)
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_maxout(d_best):
            return self.ops.backprop_maxout(d_best, mask, self.nP)

        return state_vector, backprop_maxout

    def _relu_nonlinearity(self, state_vector):
        state_vector = state_vector.reshape((state_vector.shape[0], -1))
        mask = state_vector >= 0.
        state_vector *= mask
        # We're outputting to CPU, but we need this variable on GPU for the
        # backward pass.
        mask = self.ops.asarray(mask)

        def backprop_relu(d_best):
            d_best *= mask
            return d_best.reshape((d_best.shape + (1,)))

        return state_vector, backprop_relu