mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-01 04:46:38 +03:00
622 lines
23 KiB
Cython
622 lines
23 KiB
Cython
|
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||
|
from typing import List, Tuple, Any, Optional, TypeVar, cast
|
||
|
from libc.string cimport memset, memcpy
|
||
|
from libc.stdlib cimport calloc, free, realloc
|
||
|
from libcpp.vector cimport vector
|
||
|
import numpy
|
||
|
cimport numpy as np
|
||
|
from thinc.api import Model, normal_init, chain, list2array, Linear
|
||
|
from thinc.api import uniform_init, glorot_uniform_init, zero_init
|
||
|
from thinc.api import NumpyOps
|
||
|
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
|
||
|
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
|
||
|
from thinc.types import Ints1d, Ints2d
|
||
|
|
||
|
from ..errors import Errors
|
||
|
from ..pipeline._parser_internals import _beam_utils
|
||
|
from ..pipeline._parser_internals.batch import GreedyBatch
|
||
|
from ..pipeline._parser_internals._parser_utils cimport arg_max
|
||
|
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
|
||
|
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
|
||
|
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
|
||
|
from ..tokens.doc import Doc
|
||
|
from ..util import registry
|
||
|
|
||
|
|
||
|
# Placeholder alias for the parser-state type used in this module's
# annotations; it stays `Any` until a concrete type is declared.
State = Any # TODO
|
||
|
|
||
|
|
||
|
@registry.layers("spacy.TransitionModel.v2")
def TransitionModel(
    *,
    tok2vec: Model[List[Doc], List[Floats2d]],
    beam_width: int = 1,
    beam_density: float = 0.0,
    state_tokens: int,
    hidden_width: int,
    maxout_pieces: int,
    nO: Optional[int] = None,
    unseen_classes=set(),
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
    """Set up a transition-based parsing model, using a maxout hidden
    layer and a linear output layer.

    tok2vec: token-to-vector model. Its output is flattened with
        `list2array` and projected to `hidden_width` by a `Linear` layer.
    beam_width: stored in the model attrs under "beam_width".
    beam_density: stored in the model attrs under "beam_density".
    state_tokens: stored as the "nF" dimension.
    hidden_width: stored as the "nH" dimension; also the projection width.
    maxout_pieces: stored as the "nP" dimension.
    nO: accepted but not read by this function; the "nO" dimension starts
        unset and is filled in through the "resize_output" attr.
    unseen_classes: copied into a fresh set stored in the model attrs.
    RETURNS: the assembled "parser_model" `Model`.
    """
    if tok2vec.has_dim("nO"):
        t2v_width = tok2vec.get_dim("nO")
    else:
        t2v_width = None
    # Build the layers in the same left-to-right order as a single chain()
    # call would evaluate them.
    to_array = list2array()
    projection = Linear(hidden_width, t2v_width)
    tok2vec_projected = chain(tok2vec, to_array, projection)  # type: ignore
    tok2vec_projected.set_dim("nO", hidden_width)

    # FIXME: we use `output` as a container for the output layer's
    # weights and biases. Thinc optimizers cannot handle resizing
    # of parameters. So, when the parser model is resized, we
    # construct a new `output` layer, which has a different key in
    # the optimizer. Once the optimizer supports parameter resizing,
    # we can replace the `output` layer by `output_W` and `output_b`
    # parameters in this model.
    output = Linear(nO=None, nI=hidden_width, init_W=zero_init)

    layer_refs = {
        "tok2vec": tok2vec_projected,
        "output": output,
    }
    hidden_params = {
        "hidden_W": None,  # Floats2d W for the hidden layer
        "hidden_b": None,  # Floats1d bias for the hidden layer
        "hidden_pad": None,  # Floats1d padding for the hidden layer
    }
    model_dims = {
        "nO": None,  # Output size
        "nP": maxout_pieces,
        "nH": hidden_width,
        "nI": tok2vec_projected.maybe_get_dim("nO"),
        "nF": state_tokens,
    }
    model_attrs = {
        "beam_width": beam_width,
        "beam_density": beam_density,
        "unseen_classes": set(unseen_classes),
        "resize_output": resize_output,
    }
    return Model(
        name="parser_model",
        forward=forward,
        init=init,
        layers=[tok2vec_projected, output],
        refs=layer_refs,
        params=hidden_params,
        dims=model_dims,
        attrs=model_attrs,
    )
|
||
|
|
||
|
|
||
|
def resize_output(model: Model, new_nO: int) -> Model:
    """Set or grow the number of output classes of the parser model.

    model: the parser model; its "output" ref holds the output layer.
    new_nO: the requested number of output classes.
    RETURNS: the same `model` object.

    If "nO" is still unset, both the model and the output layer get
    `new_nO` and the output layer is initialized. Requests that do not
    increase "nO" are a no-op. Otherwise, when the output layer already
    has a "W" parameter, a larger zero-initialized Linear layer is built,
    the old rows are copied into it, and the added class indices are
    recorded in `model.attrs["unseen_classes"]`.
    """
    current_nO = model.maybe_get_dim("nO")
    output_layer = model.get_ref("output")

    if current_nO is None:
        # First-time sizing: no weights to carry over.
        model.set_dim("nO", new_nO)
        output_layer.set_dim("nO", new_nO)
        output_layer.initialize()
        return model

    if new_nO <= current_nO:
        # The output is never shrunk.
        return model

    if output_layer.has_param("W"):
        hidden_width = model.get_dim("nH")
        grown = Linear(nO=new_nO, nI=hidden_width, init_W=zero_init)
        grown.initialize()
        grown_W = grown.get_param("W")
        grown_b = grown.get_param("b")
        # Keep the weights learned for the existing classes.
        grown_W[:current_nO] = output_layer.get_param("W")  # type: ignore
        grown_b[:current_nO] = output_layer.get_param("b")  # type: ignore
        # The added classes have not been seen in training yet.
        model.attrs["unseen_classes"].update(range(current_nO, new_nO))
        model.layers[-1] = grown
        model.set_ref("output", grown)

    # TODO: Avoid this private intrusion
    model._dims["nO"] = new_nO
    return model
|
||
|
|
||
|
|
||
|
def init(
    model,
    X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
    Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
    """Initialize the tok2vec sublayer and the hidden-layer parameters.

    model: the parser model to initialize.
    X: optional (docs, moves) sample; only the docs are used, to
        initialize the "tok2vec" ref.
    Y: optional (states, scores) sample; when given, the width of the
        first score array is used to size the output layer through the
        "resize_output" attr.
    RETURNS: the same `model` object, with "hidden_W", "hidden_b" and
        "hidden_pad" set.
    """
    tok2vec = model.get_ref("tok2vec")
    if X is not None:
        docs, _ = X
        tok2vec.initialize(X=docs)
    else:
        tok2vec.initialize()

    inferred_nO = _infer_nO(Y)
    # `inferred_nO` is a real int here, so an unset (None) "nO" also
    # compares unequal and triggers the resize.
    if inferred_nO is not None and model.maybe_get_dim("nO") != inferred_nO:
        model.attrs["resize_output"](model, inferred_nO)

    # The value is not used below, but the lookup is kept.
    # NOTE(review): presumably get_dim raises while "nO" is unset, making
    # this a guard against initializing an unsized model -- confirm.
    model.get_dim("nO")
    nP = model.get_dim("nP")
    nH = model.get_dim("nH")
    nI = model.get_dim("nI")
    nF = model.get_dim("nF")
    ops = model.ops

    # Pass the shapes directly instead of allocating throwaway zero arrays
    # only to read their `.shape`.
    hidden_W = glorot_uniform_init(ops, (nH * nP, nF * nI))
    hidden_b = ops.alloc1f(nH * nP)
    hidden_pad = uniform_init(ops, (nI,))  # type: ignore
    # TODO: Experiment with whether better to initialize output_W
    model.set_param("hidden_W", hidden_W)
    model.set_param("hidden_b", hidden_b)
    model.set_param("hidden_pad", hidden_pad)
    # model = _lsuv_init(model)
    return model
|
||
|
|
||
|
|
||
|
class TransitionModelInputs:
    """
    Input to transition model.
    """

    # dataclass annotation is not yet supported in Cython 0.29.x,
    # so, we'll do something close to it: annotated fields plus __slots__.

    actions: Optional[List[Ints1d]]
    docs: List[Doc]
    max_moves: int
    moves: TransitionSystem
    states: Optional[List[State]]

    __slots__ = [
        "actions",
        "docs",
        "max_moves",
        "moves",
        "states",
    ]

    def __init__(
        self,
        docs: List[Doc],
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]] = None,
        max_moves: int = 0,
        states: Optional[List[State]] = None,
    ):
        """
        docs (List[Doc]): Docs to predict transition sequences for.
        moves (TransitionSystem): the transition system to use when
            predicting the transition sequences.
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        max_moves (int): the maximum number of moves to apply, values less
            than 1 will apply moves to states until they are final states.
        states (Optional[List[State]]): the initial states to predict the
            transition sequences for. When absent, the initial states are
            initialized from the provided Docs.
        """
        self.docs, self.moves = docs, moves
        self.actions, self.states = actions, states
        self.max_moves = max_moves
|
||
|
|
||
|
|
||
|
def forward(model, inputs: TransitionModelInputs, is_train: bool):
    """Run the parser model over a batch of docs.

    model: the parser model.
    inputs: docs, transition system, and optional actions / initial
        states / move limit.
    is_train: whether a usable backprop callback is required.
    RETURNS: whatever the selected forward path returns: the greedy CPU
        path for inference with beam width 1 on NumpyOps, the generic
        fallback otherwise.
    """
    docs, moves, actions = inputs.docs, inputs.moves, inputs.actions

    beam_width = model.attrs["beam_width"]
    hidden_pad = model.get_param("hidden_pad")
    tok2vec = model.get_ref("tok2vec")

    if inputs.states is None:
        states = moves.init_batch(docs)
    else:
        states = inputs.states
    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
    # Append the padding vector as an extra, final row.
    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
    seen_mask = _get_seen_mask(model)

    needs_fallback = is_train or beam_width != 1 or not isinstance(model.ops, NumpyOps)
    if needs_fallback:
        return _forward_fallback(
            model, moves, states, tokvecs, backprop_tok2vec,
            feats, backprop_feats, seen_mask, is_train,
            actions=actions, max_moves=inputs.max_moves,
        )
    # Note: max_moves is only used during training, so we don't need to
    # pass it to the greedy inference path.
    return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
|
||
|
|
||
|
|
||
|
def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
                        np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
    """Greedy, inference-only forward pass that runs on raw C state pointers.

    model: the parser model (source of weights and dimensions).
    moves: the transition system used to advance the states.
    states: the batch of states; states that are already final are skipped.
    feats: precomputed features; its buffer is handed to C as float data.
    seen_mask: boolean mask over the classes, forwarded to the C weights.
    actions: optional per-step actions to apply instead of predicting.
    RETURNS: ((states, scores), backprop), where `scores` holds one array
        per step and `backprop` always raises, because this path keeps
        nothing for a backward pass.
    """
    cdef vector[StateC*] c_states
    cdef StateClass state
    for state in states:
        if not state.is_final():
            c_states.push_back(state.c)
    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
    # Precomputed features have rows for each token, plus one for padding.
    cdef int n_tokens = feats.shape[0] - 1
    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
    cdef CBlas cblas = model.ops.cblas()
    # Use data() rather than &c_states[0]: indexing an empty vector (every
    # state already final) is undefined behaviour, while data() is valid.
    scores = _parse_batch(cblas, moves, c_states.data(), weights, sizes, actions=actions)

    def backprop(dY):
        raise ValueError(Errors.E4004)

    return (states, scores), backprop
|
||
|
|
||
|
# Advance every state in `states` until all are final, one step at a time.
#
# cblas:   BLAS handle forwarded to the scoring routine.
# moves:   transition system used to apply the chosen actions.
# states:  array of `sizes.states` state pointers; compacted in place so
#          that unfinished states stay at the front after each step.
# weights: C view of the model weights.
# sizes:   batch and model dimensions; `sizes.states` shrinks as states
#          finish.
# actions: optional per-step action arrays; when given they are applied
#          instead of the predicted actions.
#
# Returns a list with one (n_states, n_classes) score array per step.
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
    cdef int i
    cdef int step = 0
    cdef bint follow_actions = actions is not None
    cdef vector[StateC *] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
    cdef np.ndarray step_actions

    scores = []
    # The activation buffers are C allocations: release them even if a step
    # raises (for example when `actions` has fewer entries than steps).
    try:
        while sizes.states >= 1:
            step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
            # Index by step instead of re-slicing the list on every step.
            step_actions = actions[step] if follow_actions else None
            with nogil:
                _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
                if not follow_actions:
                    # Validate actions, argmax, take action.
                    c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
                                       sizes.states)
                else:
                    c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
                # Keep only the states that still need more transitions.
                for i in range(sizes.states):
                    if not states[i].is_final():
                        unfinished.push_back(states[i])
                for i in range(unfinished.size()):
                    states[i] = unfinished[i]
            sizes.states = unfinished.size()
            scores.append(step_scores)
            unfinished.clear()
            step += 1
    finally:
        _free_activations(&activations)

    return scores
|
||
|
|
||
|
|
||
|
def _forward_fallback(
    model: Model,
    moves: TransitionSystem,
    states: List[StateClass],
    tokvecs, backprop_tok2vec,
    feats,
    backprop_feats,
    seen_mask,
    is_train: bool,
    actions: Optional[List[Ints1d]]=None,
    max_moves: int=0,
):
    """Generic forward pass, used for training, beam search and non-Numpy ops.

    model: the parser model.
    moves: the transition system.
    states: the initial states of the batch.
    tokvecs: token vectors with the padding row appended.
    backprop_tok2vec: backprop callback of the tok2vec layer.
    feats: precomputed hidden-layer features, indexed by token id and
        feature slot.
    backprop_feats: backprop callback of the precomputable affine layer.
    seen_mask: boolean class mask; masked columns are overwritten with the
        minimum score.
    is_train: whether to record the intermediates needed by the backprop.
    actions: optional per-step actions to apply instead of predictions.
    max_moves: move limit; values below 1 mean "until all states are final".
    RETURNS: ((list(batch), per-step scores), backprop callback).
    """
    nF = model.get_dim("nF")
    output = model.get_ref("output")
    hidden_b = model.get_param("hidden_b")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")

    beam_width = model.attrs["beam_width"]
    beam_density = model.attrs["beam_density"]

    ops = model.ops

    # Per-step intermediates; everything except the scores is only filled
    # while training.
    all_ids = []
    all_which = []
    all_statevecs = []
    all_scores = []
    if beam_width == 1:
        batch = GreedyBatch(moves, states, None)
    else:
        batch = _beam_utils.BeamBatch(
            moves, states, None, width=beam_width, density=beam_density
        )
    feat_slots = ops.xp.arange(nF)
    n_moves = 0
    while not batch.is_done:
        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
        for row, state in enumerate(batch.get_unfinished_states()):
            state.set_context_tokens(ids, row, nF)
        # Sum the state features, add the bias and apply the activation (maxout)
        # to create the state vectors.
        preacts2f = feats[ids, feat_slots].sum(axis=1)  # type: ignore
        preacts2f += hidden_b
        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
        statevecs, which = ops.maxout(preacts)
        # We don't use output's backprop, since we want to backprop for
        # all states at once, rather than a single state.
        scores = output.predict(statevecs)
        scores[:, seen_mask] = ops.xp.nanmin(scores)
        # Transition the states, filtering out any that are finished.
        cpu_scores = ops.to_numpy(scores)
        if actions is not None:
            batch.advance_with_actions(actions[0])
            actions = actions[1:]
        else:
            batch.advance(cpu_scores)
        all_scores.append(scores)
        if is_train:
            # Remember intermediate results for the backprop.
            all_ids.append(ids)
            all_statevecs.append(statevecs)
            all_which.append(which)
        # NOTE(review): the limit is tested after the step and before the
        # increment, so the loop appears to run max_moves + 1 steps before
        # stopping -- confirm this is intended.
        if max_moves >= 1 and n_moves >= max_moves:
            break
        n_moves += 1

    def backprop_parser(d_states_d_scores):
        ids = ops.xp.vstack(all_ids)
        which = ops.xp.vstack(all_which)
        statevecs = ops.xp.vstack(all_statevecs)
        _, d_scores = d_states_d_scores
        unseen = model.attrs.get("unseen_classes")
        if unseen:
            # If we have a negative gradient (i.e. the probability should
            # increase) on any classes we filtered out as unseen, mark
            # them as seen. Iterate over a copy because the set is edited.
            for clas in set(unseen):
                if (d_scores[:, clas] < 0).any():
                    unseen.remove(clas)
        # Zero the gradient of the masked classes (in place).
        d_scores *= seen_mask == False
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
        # Now calculate d_statevecs, by backproping through the output linear layer.
        # This gemm is (nS, nO) @ (nO, nH)
        d_statevecs = ops.gemm(d_scores, output.get_param("W"))
        # Backprop through the maxout activation
        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
        # We don't need to backprop the summation, because we pass back the IDs instead
        d_state_features = backprop_feats((d_preacts2f, ids))
        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
        ops.scatter_add(d_tokvecs, ids, d_state_features)
        # The last row belongs to the padding vector, not to a token.
        model.inc_grad("hidden_pad", d_tokvecs[-1])
        return (backprop_tok2vec(d_tokvecs[:-1]), None)

    return (list(batch), all_scores), backprop_parser
|
||
|
|
||
|
|
||
|
def _get_seen_mask(model: Model) -> numpy.ndarray:
    """Build the boolean class mask from the model's "unseen_classes" attr.

    model: the parser model; its "nO" dimension gives the mask length.
    RETURNS: a 1d boolean array of length nO, allocated with the model's
        array module, that is True at every index listed in
        `model.attrs["unseen_classes"]` and False elsewhere. A missing
        attr yields an all-False mask.
    """
    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
    for class_ in model.attrs.get("unseen_classes", set()):
        mask[class_] = True
    return mask
|
||
|
|
||
|
|
||
|
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
    """Precompute the hidden-layer contribution of every input row for
    every feature slot.

    model: the parser model (provides "hidden_W" and the dims).
    X: input rows of width nI.
    is_train: not read by this function.
    RETURNS: (Yf, backward), where Yf has shape (X.shape[0], nF, nH * nP)
        and `backward` takes (dY, ids), increments the "hidden_W"
        gradient and returns an array of shape (dY.shape[0], nF, nI).
    """
    ops = model.ops
    W: Floats2d = model.get_param("hidden_W")
    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
    per_slot_W = ops.reshape3f(W, nH * nP, nF, nI).transpose((1, 0, 2))
    flat_W = ops.reshape2f(per_slot_W, nF * nH * nP, nI)
    assert X.shape == (X.shape[0], nI), X.shape
    flat_Y = ops.gemm(X, flat_W, trans2=True)
    Yf = ops.reshape3f(flat_Y, flat_Y.shape[0], nF, nH * nP)

    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nH, nP), and get back:
        # (nB, nH, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nH, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        dXf = model.ops.gemm(dY, W)
        # Gather the input rows each state looked at, one flat row per state.
        gathered_X = X[ids].reshape((ids.shape[0], -1))
        model.inc_grad("hidden_W", model.ops.gemm(dY, gathered_X, trans1=True))
        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)

    return Yf, backward
|
||
|
|
||
|
|
||
|
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
    """Read the number of output classes off a (states, scores) sample.

    Y: optional (states, scores) pair; only the scores are inspected.
    RETURNS: the second dimension of the first score array, or None when
        `Y` is None or holds no score arrays.
    """
    if Y is None:
        return None
    _, step_scores = Y
    if len(step_scores) == 0:
        return None
    first_shape = step_scores[0].shape
    assert first_shape[0] >= 1
    assert len(first_shape) == 2
    return first_shape[1]
|
||
|
|
||
|
|
||
|
def _lsuv_init(model: Model):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.

    model: the parser model whose hidden-layer parameters are rescaled.
    RETURNS: the same `model` object on every path, including the early
        exit taken when "hidden_W" already holds non-zero values.
    """
    W = model.maybe_get_param("hidden_W")
    if W is not None and W.any():
        # Already initialized with non-zero weights: leave them alone.
        # Return the model (not None) so `model = _lsuv_init(model)` is safe.
        return model

    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nH, nP, nI)
    b = model.ops.alloc2f(nH, nP)
    pad = model.ops.alloc4f(1, nF, nH, nP)

    ops = model.ops
    # NOTE(review): `1.0 / nF * nI` evaluates as (1.0 / nF) * nI; if
    # 1.0 / (nF * nI) was intended this needs parentheses -- confirm.
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    # NOTE(review): these names ("W", "b", "pad") differ from the
    # "hidden_W" / "hidden_b" / "hidden_pad" parameters read further down,
    # and the shapes differ from what `init` stores -- verify before
    # re-enabling this function.
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    # Random feature ids and whitened token vectors to probe the layer with.
    ids = ops.alloc_f((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc_f((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
        # `b` is looked up when predict() runs, i.e. after it has been
        # rebound to the copy of "hidden_b" below.
        vectors3f += b
        return model.ops.maxout(vectors3f)[0]

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = cast(Floats4d, model.get_param("hidden_W").copy())
    b = cast(Floats2d, model.get_param("hidden_b").copy())
    # Alternate between fixing the variance (rescale W) and the mean
    # (shift b) until both are within tolerance or t_max rounds have run.
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("hidden_W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("hidden_b", b)
        else:
            break
    return model
|
||
|
|
||
|
|
||
|
# Collect raw pointers to the model's weights in a WeightsC struct.
#
# model:     the parser model; "hidden_b" and the "output" ref's "W" and "b"
#            are read from it.
# feats:     pointer to the precomputed feature data.
# seen_mask: boolean class mask; its buffer is exposed as int8 data.
#
# The struct only stores pointers into the arrays' buffers and copies
# nothing, so the arrays have to outlive every use of the struct.
cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
    output_layer = model.get_ref("output")
    cdef np.ndarray bias_hidden = model.get_param("hidden_b")
    cdef np.ndarray W_out = output_layer.get_param("W")
    cdef np.ndarray bias_out = output_layer.get_param("b")

    cdef WeightsC c_weights
    c_weights.feat_weights = feats
    c_weights.feat_bias = <const float*>bias_hidden.data
    c_weights.hidden_weights = <const float *> W_out.data
    c_weights.hidden_bias = <const float *> bias_out.data
    c_weights.seen_mask = <const int8_t*> seen_mask.data

    return c_weights
|
||
|
|
||
|
|
||
|
# Fill a SizesC struct from the model dimensions and the batch shape.
#
# model:      the parser model; nO, nH, nP, nF and nI are read from it.
# batch_size: number of states in the batch.
# tokens:     number of tokens covered by the precomputed features.
cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
    cdef SizesC dims
    dims.states = batch_size
    dims.classes = model.get_dim("nO")
    dims.hiddens = model.get_dim("nH")
    dims.pieces = model.get_dim("nP")
    dims.feats = model.get_dim("nF")
    dims.embed_width = model.get_dim("nI")
    dims.tokens = tokens
    return dims
|
||
|
|
||
|
|
||
|
# Create a zeroed ActivationsC and size its buffers for `n.states` states.
# The buffers are released with _free_activations.
cdef ActivationsC _alloc_activations(SizesC n) nogil:
    cdef ActivationsC acts
    # Start from all-zero fields so the resize sees no existing capacity.
    memset(&acts, 0, sizeof(acts))
    _resize_activations(&acts, n)
    return acts
|
||
|
|
||
|
|
||
|
# Release the four heap buffers owned by an ActivationsC. The struct's
# fields are left untouched, so it must not be used afterwards.
cdef void _free_activations(const ActivationsC* A) nogil:
    free(A.is_valid)
    free(A.hiddens)
    free(A.unmaxed)
    free(A.token_ids)
|
||
|
|
||
|
|
||
|
# Make sure the activation buffers can hold `n.states` states.
#
# The buffers only ever grow: a request within the current capacity just
# updates `_curr_size`. The first allocation uses calloc (zeroed memory);
# later growth uses realloc, which does not zero the added part.
# NOTE(review): the calloc/realloc results are not checked for NULL here.
cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size != 0:
        A.token_ids = <int*>realloc(A.token_ids,
                                    n.states * n.feats * sizeof(A.token_ids[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
                                    n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
                                    n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
                                   n.states * n.classes * sizeof(A.is_valid[0]))
    else:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
    # Both growth paths end with the same bookkeeping.
    A._max_size = n.states
    A._curr_size = n.states
|
||
|
|
||
|
|
||
|
# Score every state in the batch and write the result into `scores`
# (n.states rows of n.classes floats).
#
# Steps: collect each state's context token ids, sum the precomputed
# features for those ids, add the hidden bias, take the maxout over the
# pieces, apply the output layer, then overwrite the masked classes with
# the smallest score in the batch.
cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
    _resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    # The feature sums are accumulated in place, so start from zero.
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
    for i in range(n.states):
        row_start = i * n.hiddens * n.pieces
        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[row_start], 1)
        # Maxout: keep the largest piece of every hidden unit.
        for j in range(n.hiddens):
            piece_start = row_start + j * n.pieces
            best = arg_max(&A.unmaxed[piece_start], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[piece_start + best]
    if W.hidden_weights != NULL:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
                     1.0, <const float *>A.hiddens, n.hiddens,
                     <const float *>W.hidden_weights, n.hiddens,
                     0.0, scores, n.classes)
        # Add bias
        for i in range(n.states):
            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
    else:
        # No output layer: the hidden activations are the scores.
        # NOTE(review): this copies n.states * n.classes floats out of a
        # buffer sized n.states * n.hiddens -- confirm the sizes agree.
        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
    # Set unseen classes to minimum value
    lowest = scores[0]
    for k in range(1, n.states * n.classes):
        if scores[k] < lowest:
            lowest = scores[k]
    for j in range(n.classes):
        if W.seen_mask[j]:
            for i in range(n.states):
                scores[i*n.classes+j] = lowest
|
||
|
|
||
|
|
||
|
# Accumulate the precomputed feature vectors selected by each state's
# token ids into `output`.
#
# cblas:     BLAS handle used for the vector additions.
# output:    n.states rows of O = n.hiddens * n.pieces floats; added to in
#            place, so the caller is responsible for its initial contents.
# cached:    precomputed features, read as (token, feature slot, O); the
#            block right after the n.tokens token rows is used for padding.
# token_ids: n.states rows of n.feats ids; a negative id selects the
#            padding vector of that feature slot.
# n:         batch and model dimensions.
cdef void _sum_state_features(CBlas cblas, float* output,
                              const float* cached, const int* token_ids, SizesC n) nogil:
    cdef int b, f
    cdef const float* feature
    cdef int B = n.states
    cdef int O = n.hiddens * n.pieces
    cdef int F = n.feats
    cdef int T = n.tokens
    # Offsets into `cached` grow with tokens * feats * O, which can exceed
    # the range of a 32-bit int on large batches; compute them in
    # Py_ssize_t so the pointer arithmetic cannot wrap around.
    cdef Py_ssize_t id_stride = <Py_ssize_t>F * O
    cdef const float* padding = cached + T * id_stride
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f * O]
            else:
                feature = &cached[token_ids[f] * id_stride + f * O]
            saxpy(cblas)(O, one, feature, 1, &output[<Py_ssize_t>b * O], 1)
        # Step to the next state's row of token ids.
        token_ids += F
|
||
|
|