mirror of
https://github.com/explosion/spaCy.git
synced 2025-08-04 20:30:24 +03:00
Merge pull request #10103 from svlandeg/refactor/parser-gpu
Consolidate parser refactor branches
This commit is contained in:
commit
ec0cae9db8
9
setup.py
9
setup.py
|
@ -30,12 +30,9 @@ MOD_NAMES = [
|
|||
"spacy.vocab",
|
||||
"spacy.attrs",
|
||||
"spacy.kb",
|
||||
"spacy.ml.parser_model",
|
||||
"spacy.morphology",
|
||||
"spacy.pipeline.dep_parser",
|
||||
"spacy.pipeline.morphologizer",
|
||||
"spacy.pipeline.multitask",
|
||||
"spacy.pipeline.ner",
|
||||
"spacy.pipeline.pipe",
|
||||
"spacy.pipeline.trainable_pipe",
|
||||
"spacy.pipeline.sentencizer",
|
||||
|
@ -205,7 +202,11 @@ def setup_package():
|
|||
for name in MOD_NAMES:
|
||||
mod_path = name.replace(".", "/") + ".pyx"
|
||||
ext = Extension(
|
||||
name, [mod_path], language="c++", include_dirs=include_dirs, extra_compile_args=["-std=c++11"]
|
||||
name,
|
||||
[mod_path],
|
||||
language="c++",
|
||||
include_dirs=include_dirs,
|
||||
extra_compile_args=["-std=c++11"],
|
||||
)
|
||||
ext_modules.append(ext)
|
||||
print("Cythonizing sources")
|
||||
|
|
|
@ -81,12 +81,11 @@ grad_factor = 1.0
|
|||
factory = "parser"
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
|
@ -102,12 +101,11 @@ grad_factor = 1.0
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = false
|
||||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
|
@ -259,12 +257,11 @@ width = ${components.tok2vec.model.encode.width}
|
|||
factory = "parser"
|
||||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.parser.model.tok2vec]
|
||||
|
@ -277,12 +274,11 @@ width = ${components.tok2vec.model.encode.width}
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
nO = null
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
|
|
|
@ -1,158 +1,2 @@
|
|||
from thinc.api import Model, normal_init
|
||||
|
||||
from ..util import registry
|
||||
|
||||
|
||||
@registry.layers("spacy.PrecomputableAffine.v1")
|
||||
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
|
||||
model = Model(
|
||||
"precomputable_affine",
|
||||
forward,
|
||||
init=init,
|
||||
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
|
||||
params={"W": None, "b": None, "pad": None},
|
||||
attrs={"dropout_rate": dropout},
|
||||
)
|
||||
return model
|
||||
|
||||
|
||||
def forward(model, X, is_train):
|
||||
nF = model.get_dim("nF")
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.get_param("W")
|
||||
Yf = model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True)
|
||||
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
|
||||
Yf = model.ops.xp.vstack((model.get_param("pad"), Yf))
|
||||
|
||||
def backward(dY_ids):
|
||||
# This backprop is particularly tricky, because we get back a different
|
||||
# thing from what we put out. We put out an array of shape:
|
||||
# (nB, nF, nO, nP), and get back:
|
||||
# (nB, nO, nP) and ids (nB, nF)
|
||||
# The ids tell us the values of nF, so we would have:
|
||||
#
|
||||
# dYf = zeros((nB, nF, nO, nP))
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# dYf[b, ids[b, f]] += dY[b]
|
||||
#
|
||||
# However, we avoid building that array for efficiency -- and just pass
|
||||
# in the indices.
|
||||
dY, ids = dY_ids
|
||||
assert dY.ndim == 3
|
||||
assert dY.shape[1] == nO, dY.shape
|
||||
assert dY.shape[2] == nP, dY.shape
|
||||
# nB = dY.shape[0]
|
||||
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
|
||||
Xf = X[ids]
|
||||
Xf = Xf.reshape((Xf.shape[0], nF * nI))
|
||||
|
||||
model.inc_grad("b", dY.sum(axis=0))
|
||||
dY = dY.reshape((dY.shape[0], nO * nP))
|
||||
|
||||
Wopfi = W.transpose((1, 2, 0, 3))
|
||||
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
|
||||
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
|
||||
|
||||
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
|
||||
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
|
||||
# (o, p, f, i) --> (f, o, p, i)
|
||||
dWopfi = dWopfi.transpose((2, 0, 1, 3))
|
||||
model.inc_grad("W", dWopfi)
|
||||
return dXf.reshape((dXf.shape[0], nF, nI))
|
||||
|
||||
return Yf, backward
|
||||
|
||||
|
||||
def _backprop_precomputable_affine_padding(model, dY, ids):
|
||||
nB = dY.shape[0]
|
||||
nF = model.get_dim("nF")
|
||||
nP = model.get_dim("nP")
|
||||
nO = model.get_dim("nO")
|
||||
# Backprop the "padding", used as a filler for missing values.
|
||||
# Values that are missing are set to -1, and each state vector could
|
||||
# have multiple missing values. The padding has different values for
|
||||
# different missing features. The gradient of the padding vector is:
|
||||
#
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# if ids[b, f] < 0:
|
||||
# d_pad[f] += dY[b]
|
||||
#
|
||||
# Which can be rewritten as:
|
||||
#
|
||||
# (ids < 0).T @ dY
|
||||
mask = model.ops.asarray(ids < 0, dtype="f")
|
||||
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
|
||||
return d_pad.reshape((1, nF, nO, nP))
|
||||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
"""This is like the 'layer sequential unit variance', but instead
|
||||
of taking the actual inputs, we randomly generate whitened data.
|
||||
|
||||
Why's this all so complicated? We have a huge number of inputs,
|
||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||
we set the maxout weights to values that empirically result in
|
||||
whitened outputs given whitened inputs.
|
||||
"""
|
||||
if model.has_param("W") and model.get_param("W").any():
|
||||
return
|
||||
|
||||
nF = model.get_dim("nF")
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.ops.alloc4f(nF, nO, nP, nI)
|
||||
b = model.ops.alloc2f(nO, nP)
|
||||
pad = model.ops.alloc4f(1, nF, nO, nP)
|
||||
|
||||
ops = model.ops
|
||||
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
|
||||
pad = normal_init(ops, pad.shape, mean=1.0)
|
||||
model.set_param("W", W)
|
||||
model.set_param("b", b)
|
||||
model.set_param("pad", pad)
|
||||
|
||||
ids = ops.alloc((5000, nF), dtype="f")
|
||||
ids += ops.xp.random.uniform(0, 1000, ids.shape)
|
||||
ids = ops.asarray(ids, dtype="i")
|
||||
tokvecs = ops.alloc((5000, nI), dtype="f")
|
||||
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||
tokvecs.shape
|
||||
)
|
||||
|
||||
def predict(ids, tokvecs):
|
||||
# nS ids. nW tokvecs. Exclude the padding array.
|
||||
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
|
||||
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
|
||||
# need nS vectors
|
||||
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
|
||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||
vectors = vectors.reshape((vectors.shape[0], nO, nP))
|
||||
vectors += b
|
||||
vectors = model.ops.asarray(vectors)
|
||||
if nP >= 2:
|
||||
return model.ops.maxout(vectors)[0]
|
||||
else:
|
||||
return vectors * (vectors >= 0)
|
||||
|
||||
tol_var = 0.01
|
||||
tol_mean = 0.01
|
||||
t_max = 10
|
||||
W = model.get_param("W").copy()
|
||||
b = model.get_param("b").copy()
|
||||
for t_i in range(t_max):
|
||||
acts1 = predict(ids, tokvecs)
|
||||
var = model.ops.xp.var(acts1)
|
||||
mean = model.ops.xp.mean(acts1)
|
||||
if abs(var - 1.0) >= tol_var:
|
||||
W /= model.ops.xp.sqrt(var)
|
||||
model.set_param("W", W)
|
||||
elif abs(mean) >= tol_mean:
|
||||
b -= mean
|
||||
model.set_param("b", b)
|
||||
else:
|
||||
break
|
||||
class PrecomputableAffine:
|
||||
pass
|
||||
|
|
|
@ -1,23 +1,42 @@
|
|||
from typing import Optional, List, cast
|
||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||
from typing import Optional, List, Tuple, Any
|
||||
from thinc.types import Floats2d
|
||||
from thinc.api import Model
|
||||
|
||||
from ...errors import Errors
|
||||
from ...compat import Literal
|
||||
from ...util import registry
|
||||
from .._precomputable_affine import PrecomputableAffine
|
||||
from ..tb_framework import TransitionModel
|
||||
from ...tokens import Doc
|
||||
from ...tokens.doc import Doc
|
||||
|
||||
TransitionSystem = Any # TODO
|
||||
State = Any # TODO
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TransitionBasedParser.v3")
|
||||
def transition_parser_v3(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
return build_tb_parser_model(
|
||||
tok2vec,
|
||||
state_type,
|
||||
extra_state_tokens,
|
||||
hidden_width,
|
||||
maxout_pieces,
|
||||
nO=nO,
|
||||
)
|
||||
|
||||
|
||||
@registry.architectures("spacy.TransitionBasedParser.v2")
|
||||
def build_tb_parser_model(
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_type: Literal["parser", "ner"],
|
||||
extra_state_tokens: bool,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
use_upper: bool,
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
"""
|
||||
|
@ -51,14 +70,7 @@ def build_tb_parser_model(
|
|||
feature sets (for the NER) or 13 (for the parser).
|
||||
hidden_width (int): The width of the hidden layer.
|
||||
maxout_pieces (int): How many pieces to use in the state prediction layer.
|
||||
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
|
||||
is replaced with a ReLu non-linearity if use_upper=True, and no
|
||||
non-linearity if use_upper=False.
|
||||
use_upper (bool): Whether to use an additional hidden layer after the state
|
||||
vector in order to predict the action scores. It is recommended to set
|
||||
this to False for large pretrained models such as transformers, and True
|
||||
for smaller networks. The upper layer is computed on CPU, which becomes
|
||||
a bottleneck on larger GPU-based models, where it's also less necessary.
|
||||
Recommended values are 1, 2 or 3.
|
||||
nO (int or None): The number of actions the model will predict between.
|
||||
Usually inferred from data at the beginning of training, or loaded from
|
||||
disk.
|
||||
|
@ -69,106 +81,11 @@ def build_tb_parser_model(
|
|||
nr_feature_tokens = 6 if extra_state_tokens else 3
|
||||
else:
|
||||
raise ValueError(Errors.E917.format(value=state_type))
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(
|
||||
tok2vec,
|
||||
cast(Model[List["Floats2d"], Floats2d], list2array()),
|
||||
Linear(hidden_width, t2v_width),
|
||||
return TransitionModel(
|
||||
tok2vec=tok2vec,
|
||||
state_tokens=nr_feature_tokens,
|
||||
hidden_width=hidden_width,
|
||||
maxout_pieces=maxout_pieces,
|
||||
nO=nO,
|
||||
unseen_classes=set(),
|
||||
)
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
lower = _define_lower(
|
||||
nO=hidden_width if use_upper else nO,
|
||||
nF=nr_feature_tokens,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
nP=maxout_pieces,
|
||||
)
|
||||
upper = None
|
||||
if use_upper:
|
||||
with use_ops("cpu"):
|
||||
# Initialize weights at zero, as it's a classification layer.
|
||||
upper = _define_upper(nO=nO, nI=None)
|
||||
return TransitionModel(tok2vec, lower, upper, resize_output)
|
||||
|
||||
|
||||
def _define_upper(nO, nI):
|
||||
return Linear(nO=nO, nI=nI, init_W=zero_init)
|
||||
|
||||
|
||||
def _define_lower(nO, nF, nI, nP):
|
||||
return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
|
||||
|
||||
|
||||
def resize_output(model, new_nO):
|
||||
if model.attrs["has_upper"]:
|
||||
return _resize_upper(model, new_nO)
|
||||
return _resize_lower(model, new_nO)
|
||||
|
||||
|
||||
def _resize_upper(model, new_nO):
|
||||
upper = model.get_ref("upper")
|
||||
if upper.has_dim("nO") is None:
|
||||
upper.set_dim("nO", new_nO)
|
||||
return model
|
||||
elif new_nO == upper.get_dim("nO"):
|
||||
return model
|
||||
|
||||
smaller = upper
|
||||
nI = smaller.maybe_get_dim("nI")
|
||||
with use_ops("cpu"):
|
||||
larger = _define_upper(nO=new_nO, nI=nI)
|
||||
# it could be that the model is not initialized yet, then skip this bit
|
||||
if smaller.has_param("W"):
|
||||
larger_W = larger.ops.alloc2f(new_nO, nI)
|
||||
larger_b = larger.ops.alloc1f(new_nO)
|
||||
smaller_W = smaller.get_param("W")
|
||||
smaller_b = smaller.get_param("b")
|
||||
# Weights are stored in (nr_out, nr_in) format, so we're basically
|
||||
# just adding rows here.
|
||||
if smaller.has_dim("nO"):
|
||||
old_nO = smaller.get_dim("nO")
|
||||
larger_W[:old_nO] = smaller_W
|
||||
larger_b[:old_nO] = smaller_b
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
|
||||
larger.set_param("W", larger_W)
|
||||
larger.set_param("b", larger_b)
|
||||
model._layers[-1] = larger
|
||||
model.set_ref("upper", larger)
|
||||
return model
|
||||
|
||||
|
||||
def _resize_lower(model, new_nO):
|
||||
lower = model.get_ref("lower")
|
||||
if lower.has_dim("nO") is None:
|
||||
lower.set_dim("nO", new_nO)
|
||||
return model
|
||||
|
||||
smaller = lower
|
||||
nI = smaller.maybe_get_dim("nI")
|
||||
nF = smaller.maybe_get_dim("nF")
|
||||
nP = smaller.maybe_get_dim("nP")
|
||||
larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
|
||||
# it could be that the model is not initialized yet, then skip this bit
|
||||
if smaller.has_param("W"):
|
||||
larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
|
||||
larger_b = larger.ops.alloc2f(new_nO, nP)
|
||||
larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
|
||||
smaller_W = smaller.get_param("W")
|
||||
smaller_b = smaller.get_param("b")
|
||||
smaller_pad = smaller.get_param("pad")
|
||||
# Copy the old weights and padding into the new layer
|
||||
if smaller.has_dim("nO"):
|
||||
old_nO = smaller.get_dim("nO")
|
||||
larger_W[:, 0:old_nO, :, :] = smaller_W
|
||||
larger_pad[:, :, 0:old_nO, :] = smaller_pad
|
||||
larger_b[0:old_nO, :] = smaller_b
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
|
||||
larger.set_param("W", larger_W)
|
||||
larger.set_param("b", larger_b)
|
||||
larger.set_param("pad", larger_pad)
|
||||
model._layers[1] = larger
|
||||
model.set_ref("lower", larger)
|
||||
return model
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
from libc.string cimport memset, memcpy
|
||||
from ..typedefs cimport weight_t, hash_t
|
||||
from ..pipeline._parser_internals._state cimport StateC
|
||||
|
||||
|
||||
cdef struct SizesC:
|
||||
int states
|
||||
int classes
|
||||
int hiddens
|
||||
int pieces
|
||||
int feats
|
||||
int embed_width
|
||||
|
||||
|
||||
cdef struct WeightsC:
|
||||
const float* feat_weights
|
||||
const float* feat_bias
|
||||
const float* hidden_bias
|
||||
const float* hidden_weights
|
||||
const float* seen_classes
|
||||
|
||||
|
||||
cdef struct ActivationsC:
|
||||
int* token_ids
|
||||
float* unmaxed
|
||||
float* scores
|
||||
float* hiddens
|
||||
int* is_valid
|
||||
int _curr_size
|
||||
int _max_size
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *
|
||||
|
||||
cdef ActivationsC alloc_activations(SizesC n) nogil
|
||||
|
||||
cdef void free_activations(const ActivationsC* A) nogil
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores, int O) nogil
|
||||
|
|
@ -1,489 +0,0 @@
|
|||
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||
cimport numpy as np
|
||||
from libc.math cimport exp
|
||||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free, realloc
|
||||
from thinc.backends.linalg cimport Vec, VecVec
|
||||
cimport blis.cy
|
||||
|
||||
import numpy
|
||||
import numpy.random
|
||||
from thinc.api import Model, CupyOps, NumpyOps
|
||||
|
||||
from .. import util
|
||||
from ..typedefs cimport weight_t, class_t, hash_t
|
||||
from ..pipeline._parser_internals.stateclass cimport StateClass
|
||||
|
||||
|
||||
cdef WeightsC get_c_weights(model) except *:
|
||||
cdef WeightsC output
|
||||
cdef precompute_hiddens state2vec = model.state2vec
|
||||
output.feat_weights = state2vec.get_feat_weights()
|
||||
output.feat_bias = <const float*>state2vec.bias.data
|
||||
cdef np.ndarray vec2scores_W
|
||||
cdef np.ndarray vec2scores_b
|
||||
if model.vec2scores is None:
|
||||
output.hidden_weights = NULL
|
||||
output.hidden_bias = NULL
|
||||
else:
|
||||
vec2scores_W = model.vec2scores.get_param("W")
|
||||
vec2scores_b = model.vec2scores.get_param("b")
|
||||
output.hidden_weights = <const float*>vec2scores_W.data
|
||||
output.hidden_bias = <const float*>vec2scores_b.data
|
||||
cdef np.ndarray class_mask = model._class_mask
|
||||
output.seen_classes = <const float*>class_mask.data
|
||||
return output
|
||||
|
||||
|
||||
cdef SizesC get_c_sizes(model, int batch_size) except *:
|
||||
cdef SizesC output
|
||||
output.states = batch_size
|
||||
if model.vec2scores is None:
|
||||
output.classes = model.state2vec.get_dim("nO")
|
||||
else:
|
||||
output.classes = model.vec2scores.get_dim("nO")
|
||||
output.hiddens = model.state2vec.get_dim("nO")
|
||||
output.pieces = model.state2vec.get_dim("nP")
|
||||
output.feats = model.state2vec.get_dim("nF")
|
||||
output.embed_width = model.tokvecs.shape[1]
|
||||
return output
|
||||
|
||||
|
||||
cdef ActivationsC alloc_activations(SizesC n) nogil:
|
||||
cdef ActivationsC A
|
||||
memset(&A, 0, sizeof(A))
|
||||
resize_activations(&A, n)
|
||||
return A
|
||||
|
||||
|
||||
cdef void free_activations(const ActivationsC* A) nogil:
|
||||
free(A.token_ids)
|
||||
free(A.scores)
|
||||
free(A.unmaxed)
|
||||
free(A.hiddens)
|
||||
free(A.is_valid)
|
||||
|
||||
|
||||
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
|
||||
if n.states <= A._max_size:
|
||||
A._curr_size = n.states
|
||||
return
|
||||
if A._max_size == 0:
|
||||
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
else:
|
||||
A.token_ids = <int*>realloc(A.token_ids,
|
||||
n.states * n.feats * sizeof(A.token_ids[0]))
|
||||
A.scores = <float*>realloc(A.scores,
|
||||
n.states * n.classes * sizeof(A.scores[0]))
|
||||
A.unmaxed = <float*>realloc(A.unmaxed,
|
||||
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
|
||||
A.hiddens = <float*>realloc(A.hiddens,
|
||||
n.states * n.hiddens * sizeof(A.hiddens[0]))
|
||||
A.is_valid = <int*>realloc(A.is_valid,
|
||||
n.states * n.classes * sizeof(A.is_valid[0]))
|
||||
A._max_size = n.states
|
||||
A._curr_size = n.states
|
||||
|
||||
|
||||
cdef void predict_states(ActivationsC* A, StateC** states,
|
||||
const WeightsC* W, SizesC n) nogil:
|
||||
cdef double one = 1.0
|
||||
resize_activations(A, n)
|
||||
for i in range(n.states):
|
||||
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
|
||||
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
|
||||
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
|
||||
sum_state_features(A.unmaxed,
|
||||
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.unmaxed[i*n.hiddens*n.pieces],
|
||||
W.feat_bias, 1., n.hiddens * n.pieces)
|
||||
for j in range(n.hiddens):
|
||||
index = i * n.hiddens * n.pieces + j * n.pieces
|
||||
which = Vec.arg_max(&A.unmaxed[index], n.pieces)
|
||||
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
|
||||
memset(A.scores, 0, n.states * n.classes * sizeof(float))
|
||||
if W.hidden_weights == NULL:
|
||||
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
|
||||
else:
|
||||
# Compute hidden-to-output
|
||||
blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
|
||||
n.states, n.classes, n.hiddens, one,
|
||||
<float*>A.hiddens, n.hiddens, 1,
|
||||
<float*>W.hidden_weights, n.hiddens, 1,
|
||||
one,
|
||||
<float*>A.scores, n.classes, 1)
|
||||
# Add bias
|
||||
for i in range(n.states):
|
||||
VecVec.add_i(&A.scores[i*n.classes],
|
||||
W.hidden_bias, 1., n.classes)
|
||||
# Set unseen classes to minimum value
|
||||
i = 0
|
||||
min_ = A.scores[0]
|
||||
for i in range(1, n.states * n.classes):
|
||||
if A.scores[i] < min_:
|
||||
min_ = A.scores[i]
|
||||
for i in range(n.states):
|
||||
for j in range(n.classes):
|
||||
if not W.seen_classes[j]:
|
||||
A.scores[i*n.classes+j] = min_
|
||||
|
||||
|
||||
cdef void sum_state_features(float* output,
|
||||
const float* cached, const int* token_ids, int B, int F, int O) nogil:
|
||||
cdef int idx, b, f, i
|
||||
cdef const float* feature
|
||||
padding = cached
|
||||
cached += F * O
|
||||
cdef int id_stride = F*O
|
||||
cdef float one = 1.
|
||||
for b in range(B):
|
||||
for f in range(F):
|
||||
if token_ids[f] < 0:
|
||||
feature = &padding[f*O]
|
||||
else:
|
||||
idx = token_ids[f] * id_stride + f*O
|
||||
feature = &cached[idx]
|
||||
blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
|
||||
<float*>feature, 1,
|
||||
&output[b*O], 1)
|
||||
token_ids += F
|
||||
|
||||
|
||||
cdef void cpu_log_loss(float* d_scores,
|
||||
const float* costs, const int* is_valid, const float* scores,
|
||||
int O) nogil:
|
||||
"""Do multi-label log loss"""
|
||||
cdef double max_, gmax, Z, gZ
|
||||
best = arg_max_if_gold(scores, costs, is_valid, O)
|
||||
guess = Vec.arg_max(scores, O)
|
||||
if best == -1 or guess == -1:
|
||||
# These shouldn't happen, but if they do, we want to make sure we don't
|
||||
# cause an OOB access.
|
||||
return
|
||||
Z = 1e-10
|
||||
gZ = 1e-10
|
||||
max_ = scores[guess]
|
||||
gmax = scores[best]
|
||||
for i in range(O):
|
||||
Z += exp(scores[i] - max_)
|
||||
if costs[i] <= costs[best]:
|
||||
gZ += exp(scores[i] - gmax)
|
||||
for i in range(O):
|
||||
if costs[i] <= costs[best]:
|
||||
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
|
||||
else:
|
||||
d_scores[i] = exp(scores[i]-max_) / Z
|
||||
|
||||
|
||||
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
|
||||
const int* is_valid, int n) nogil:
|
||||
# Find minimum cost
|
||||
cdef float cost = 1
|
||||
for i in range(n):
|
||||
if is_valid[i] and costs[i] < cost:
|
||||
cost = costs[i]
|
||||
# Now find best-scoring with that cost
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if costs[i] <= cost and is_valid[i]:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
||||
|
||||
|
||||
class ParserStepModel(Model):
|
||||
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
|
||||
dropout=0.1):
|
||||
Model.__init__(self, name="parser_step_model", forward=step_forward)
|
||||
self.attrs["has_upper"] = has_upper
|
||||
self.attrs["dropout_rate"] = dropout
|
||||
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
|
||||
if layers[1].get_dim("nP") >= 2:
|
||||
activation = "maxout"
|
||||
elif has_upper:
|
||||
activation = None
|
||||
else:
|
||||
activation = "relu"
|
||||
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
|
||||
activation=activation, train=train)
|
||||
if has_upper:
|
||||
self.vec2scores = layers[-1]
|
||||
else:
|
||||
self.vec2scores = None
|
||||
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
|
||||
self.backprops = []
|
||||
self._class_mask = numpy.zeros((self.nO,), dtype='f')
|
||||
self._class_mask.fill(1)
|
||||
if unseen_classes is not None:
|
||||
for class_ in unseen_classes:
|
||||
self._class_mask[class_] = 0.
|
||||
|
||||
def clear_memory(self):
|
||||
del self.tokvecs
|
||||
del self.bp_tokvecs
|
||||
del self.state2vec
|
||||
del self.backprops
|
||||
del self._class_mask
|
||||
|
||||
@property
|
||||
def nO(self):
|
||||
if self.attrs["has_upper"]:
|
||||
return self.vec2scores.get_dim("nO")
|
||||
else:
|
||||
return self.state2vec.get_dim("nO")
|
||||
|
||||
def class_is_unseen(self, class_):
|
||||
return self._class_mask[class_]
|
||||
|
||||
def mark_class_unseen(self, class_):
|
||||
self._class_mask[class_] = 0
|
||||
|
||||
def mark_class_seen(self, class_):
|
||||
self._class_mask[class_] = 1
|
||||
|
||||
def get_token_ids(self, states):
|
||||
cdef StateClass state
|
||||
states = [state for state in states if not state.is_final()]
|
||||
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
|
||||
dtype='i', order='C')
|
||||
ids.fill(-1)
|
||||
c_ids = <int*>ids.data
|
||||
for state in states:
|
||||
state.c.set_context_tokens(c_ids, ids.shape[1])
|
||||
c_ids += ids.shape[1]
|
||||
return ids
|
||||
|
||||
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
|
||||
if isinstance(self.state2vec.ops, CupyOps) \
|
||||
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
|
||||
# Move token_ids and d_vector to GPU, asynchronously
|
||||
self.backprops.append((
|
||||
util.get_async(self.cuda_stream, token_ids),
|
||||
util.get_async(self.cuda_stream, d_vector),
|
||||
get_d_tokvecs
|
||||
))
|
||||
else:
|
||||
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
|
||||
|
||||
|
||||
def finish_steps(self, golds):
|
||||
# Add a padding vector to the d_tokvecs gradient, so that missing
|
||||
# values don't affect the real gradient.
|
||||
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
|
||||
# Tells CUDA to block, so our async copies complete.
|
||||
if self.cuda_stream is not None:
|
||||
self.cuda_stream.synchronize()
|
||||
for ids, d_vector, bp_vector in self.backprops:
|
||||
d_state_features = bp_vector((d_vector, ids))
|
||||
ids = ids.flatten()
|
||||
d_state_features = d_state_features.reshape(
|
||||
(ids.size, d_state_features.shape[2]))
|
||||
self.ops.scatter_add(d_tokvecs, ids,
|
||||
d_state_features)
|
||||
# Padded -- see update()
|
||||
self.bp_tokvecs(d_tokvecs[:-1])
|
||||
return d_tokvecs
|
||||
|
||||
NUMPY_OPS = NumpyOps()
|
||||
|
||||
def step_forward(model: ParserStepModel, states, is_train):
|
||||
token_ids = model.get_token_ids(states)
|
||||
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
|
||||
mask = None
|
||||
if model.attrs["has_upper"]:
|
||||
dropout_rate = model.attrs["dropout_rate"]
|
||||
if is_train and dropout_rate > 0:
|
||||
mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
|
||||
vector *= mask
|
||||
scores, get_d_vector = model.vec2scores(vector, is_train)
|
||||
else:
|
||||
scores = NumpyOps().asarray(vector)
|
||||
get_d_vector = lambda d_scores: d_scores
|
||||
# If the class is unseen, make sure its score is minimum
|
||||
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
|
||||
|
||||
def backprop_parser_step(d_scores):
|
||||
# Zero vectors for unseen classes
|
||||
d_scores *= model._class_mask
|
||||
d_vector = get_d_vector(d_scores)
|
||||
if mask is not None:
|
||||
d_vector *= mask
|
||||
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
|
||||
return None
|
||||
return scores, backprop_parser_step
|
||||
|
||||
|
||||
cdef class precompute_hiddens:
|
||||
"""Allow a model to be "primed" by pre-computing input features in bulk.
|
||||
|
||||
This is used for the parser, where we want to take a batch of documents,
|
||||
and compute vectors for each (token, position) pair. These vectors can then
|
||||
be reused, especially for beam-search.
|
||||
|
||||
Let's say we're using 12 features for each state, e.g. word at start of
|
||||
buffer, three words on stack, their children, etc. In the normal arc-eager
|
||||
system, a document of length N is processed in 2*N states. This means we'll
|
||||
create 2*N*12 feature vectors --- but if we pre-compute, we only need
|
||||
N*12 vector computations. The saving for beam-search is much better:
|
||||
if we have a beam of k, we'll normally make 2*N*12*K computations --
|
||||
so we can save the factor k. This also gives a nice CPU/GPU division:
|
||||
we can do all our hard maths up front, packed into large multiplications,
|
||||
and do the hard-to-program parsing on the CPU.
|
||||
"""
|
||||
cdef readonly int nF, nO, nP
|
||||
cdef bint _is_synchronized
|
||||
cdef public object ops
|
||||
cdef public object numpy_ops
|
||||
cdef np.ndarray _features
|
||||
cdef np.ndarray _cached
|
||||
cdef np.ndarray bias
|
||||
cdef object _cuda_stream
|
||||
cdef object _bp_hiddens
|
||||
cdef object activation
|
||||
|
||||
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
|
||||
activation="maxout", train=False):
|
||||
gpu_cached, bp_features = lower_model(tokvecs, train)
|
||||
cdef np.ndarray cached
|
||||
if not isinstance(gpu_cached, numpy.ndarray):
|
||||
# Note the passing of cuda_stream here: it lets
|
||||
# cupy make the copy asynchronously.
|
||||
# We then have to block before first use.
|
||||
cached = gpu_cached.get(stream=cuda_stream)
|
||||
else:
|
||||
cached = gpu_cached
|
||||
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
|
||||
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
|
||||
else:
|
||||
self.bias = lower_model.get_param("b")
|
||||
self.nF = cached.shape[1]
|
||||
if lower_model.has_dim("nP"):
|
||||
self.nP = lower_model.get_dim("nP")
|
||||
else:
|
||||
self.nP = 1
|
||||
self.nO = cached.shape[2]
|
||||
self.ops = lower_model.ops
|
||||
self.numpy_ops = NumpyOps()
|
||||
assert activation in (None, "relu", "maxout")
|
||||
self.activation = activation
|
||||
self._is_synchronized = False
|
||||
self._cuda_stream = cuda_stream
|
||||
self._cached = cached
|
||||
self._bp_hiddens = bp_features
|
||||
|
||||
cdef const float* get_feat_weights(self) except NULL:
|
||||
if not self._is_synchronized and self._cuda_stream is not None:
|
||||
self._cuda_stream.synchronize()
|
||||
self._is_synchronized = True
|
||||
return <float*>self._cached.data
|
||||
|
||||
def has_dim(self, name):
|
||||
if name == "nF":
|
||||
return self.nF if self.nF is not None else True
|
||||
elif name == "nP":
|
||||
return self.nP if self.nP is not None else True
|
||||
elif name == "nO":
|
||||
return self.nO if self.nO is not None else True
|
||||
else:
|
||||
return False
|
||||
|
||||
def get_dim(self, name):
|
||||
if name == "nF":
|
||||
return self.nF
|
||||
elif name == "nP":
|
||||
return self.nP
|
||||
elif name == "nO":
|
||||
return self.nO
|
||||
else:
|
||||
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
|
||||
|
||||
def set_dim(self, name, value):
|
||||
if name == "nF":
|
||||
self.nF = value
|
||||
elif name == "nP":
|
||||
self.nP = value
|
||||
elif name == "nO":
|
||||
self.nO = value
|
||||
else:
|
||||
raise ValueError(f"Dimension {name} invalid -- only nO, nF, nP")
|
||||
|
||||
def __call__(self, X, bint is_train):
|
||||
if is_train:
|
||||
return self.begin_update(X)
|
||||
else:
|
||||
return self.predict(X), lambda X: X
|
||||
|
||||
def predict(self, X):
|
||||
return self.begin_update(X)[0]
|
||||
|
||||
def begin_update(self, token_ids):
|
||||
cdef np.ndarray state_vector = numpy.zeros(
|
||||
(token_ids.shape[0], self.nO, self.nP), dtype='f')
|
||||
# This is tricky, but (assuming GPU available);
|
||||
# - Input to forward on CPU
|
||||
# - Output from forward on CPU
|
||||
# - Input to backward on GPU!
|
||||
# - Output from backward on GPU
|
||||
bp_hiddens = self._bp_hiddens
|
||||
|
||||
feat_weights = self.get_feat_weights()
|
||||
cdef int[:, ::1] ids = token_ids
|
||||
sum_state_features(<float*>state_vector.data,
|
||||
feat_weights, &ids[0,0],
|
||||
token_ids.shape[0], self.nF, self.nO*self.nP)
|
||||
state_vector += self.bias
|
||||
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
|
||||
|
||||
def backward(d_state_vector_ids):
|
||||
d_state_vector, token_ids = d_state_vector_ids
|
||||
d_state_vector = bp_nonlinearity(d_state_vector)
|
||||
d_tokens = bp_hiddens((d_state_vector, token_ids))
|
||||
return d_tokens
|
||||
return state_vector, backward
|
||||
|
||||
def _nonlinearity(self, state_vector):
|
||||
if self.activation == "maxout":
|
||||
return self._maxout_nonlinearity(state_vector)
|
||||
else:
|
||||
return self._relu_nonlinearity(state_vector)
|
||||
|
||||
def _maxout_nonlinearity(self, state_vector):
|
||||
state_vector, mask = self.numpy_ops.maxout(state_vector)
|
||||
# We're outputting to CPU, but we need this variable on GPU for the
|
||||
# backward pass.
|
||||
mask = self.ops.asarray(mask)
|
||||
|
||||
def backprop_maxout(d_best):
|
||||
return self.ops.backprop_maxout(d_best, mask, self.nP)
|
||||
|
||||
return state_vector, backprop_maxout
|
||||
|
||||
def _relu_nonlinearity(self, state_vector):
|
||||
state_vector = state_vector.reshape((state_vector.shape[0], -1))
|
||||
mask = state_vector >= 0.
|
||||
state_vector *= mask
|
||||
# We're outputting to CPU, but we need this variable on GPU for the
|
||||
# backward pass.
|
||||
mask = self.ops.asarray(mask)
|
||||
|
||||
def backprop_relu(d_best):
|
||||
d_best *= mask
|
||||
return d_best.reshape((d_best.shape + (1,)))
|
||||
|
||||
return state_vector, backprop_relu
|
|
@ -1,50 +1,438 @@
|
|||
from thinc.api import Model, noop
|
||||
from .parser_model import ParserStepModel
|
||||
from typing import List, Tuple, Any, Optional, cast
|
||||
from thinc.api import Ops, Model, normal_init, chain, list2array, Linear
|
||||
from thinc.api import uniform_init, glorot_uniform_init, zero_init
|
||||
from thinc.types import Floats1d, Floats2d, Floats3d, Ints2d, Floats4d
|
||||
import numpy
|
||||
from ..tokens.doc import Doc
|
||||
from ..util import registry
|
||||
|
||||
|
||||
@registry.layers("spacy.TransitionModel.v1")
|
||||
TransitionSystem = Any # TODO
|
||||
State = Any # TODO
|
||||
|
||||
|
||||
@registry.layers("spacy.TransitionModel.v2")
|
||||
def TransitionModel(
|
||||
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
|
||||
):
|
||||
"""Set up a stepwise transition-based model"""
|
||||
if upper is None:
|
||||
has_upper = False
|
||||
upper = noop()
|
||||
else:
|
||||
has_upper = True
|
||||
# don't define nO for this object, because we can't dynamically change it
|
||||
*,
|
||||
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||
state_tokens: int,
|
||||
hidden_width: int,
|
||||
maxout_pieces: int,
|
||||
nO: Optional[int] = None,
|
||||
unseen_classes=set(),
|
||||
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
|
||||
"""Set up a transition-based parsing model, using a maxout hidden
|
||||
layer and a linear output layer.
|
||||
"""
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore
|
||||
tok2vec_projected.set_dim("nO", hidden_width)
|
||||
|
||||
return Model(
|
||||
name="parser_model",
|
||||
forward=forward,
|
||||
dims={"nI": tok2vec.maybe_get_dim("nI")},
|
||||
layers=[tok2vec, lower, upper],
|
||||
refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
|
||||
init=init,
|
||||
layers=[tok2vec_projected],
|
||||
refs={"tok2vec": tok2vec_projected},
|
||||
params={
|
||||
"lower_W": None, # Floats2d W for the hidden layer
|
||||
"lower_b": None, # Floats1d bias for the hidden layer
|
||||
"lower_pad": None, # Floats1d padding for the hidden layer
|
||||
"upper_W": None, # Floats2d W for the output layer
|
||||
"upper_b": None, # Floats1d bias for the output layer
|
||||
},
|
||||
dims={
|
||||
"nO": None, # Output size
|
||||
"nP": maxout_pieces,
|
||||
"nH": hidden_width,
|
||||
"nI": tok2vec_projected.maybe_get_dim("nO"),
|
||||
"nF": state_tokens,
|
||||
},
|
||||
attrs={
|
||||
"has_upper": has_upper,
|
||||
"unseen_classes": set(unseen_classes),
|
||||
"resize_output": resize_output,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def forward(model, X, is_train):
|
||||
step_model = ParserStepModel(
|
||||
X,
|
||||
model.layers,
|
||||
unseen_classes=model.attrs["unseen_classes"],
|
||||
train=is_train,
|
||||
has_upper=model.attrs["has_upper"],
|
||||
def resize_output(model: Model, new_nO: int) -> Model:
|
||||
old_nO = model.maybe_get_dim("nO")
|
||||
if old_nO is None:
|
||||
model.set_dim("nO", new_nO)
|
||||
return model
|
||||
elif new_nO <= old_nO:
|
||||
return model
|
||||
elif model.has_param("upper_W"):
|
||||
nH = model.get_dim("nH")
|
||||
new_W = model.ops.alloc2f(new_nO, nH)
|
||||
new_b = model.ops.alloc1f(new_nO)
|
||||
old_W = model.get_param("upper_W")
|
||||
old_b = model.get_param("upper_b")
|
||||
new_W[:old_nO] = old_W # type: ignore
|
||||
new_b[:old_nO] = old_b # type: ignore
|
||||
for i in range(old_nO, new_nO):
|
||||
model.attrs["unseen_classes"].add(i)
|
||||
model.set_param("upper_W", new_W)
|
||||
model.set_param("upper_b", new_b)
|
||||
# TODO: Avoid this private intrusion
|
||||
model._dims["nO"] = new_nO
|
||||
if model.has_grad("upper_W"):
|
||||
model.set_grad("upper_W", model.get_param("upper_W") * 0)
|
||||
if model.has_grad("upper_b"):
|
||||
model.set_grad("upper_b", model.get_param("upper_b") * 0)
|
||||
return model
|
||||
|
||||
|
||||
def init(
|
||||
model,
|
||||
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
|
||||
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
|
||||
):
|
||||
if X is not None:
|
||||
docs, moves = X
|
||||
model.get_ref("tok2vec").initialize(X=docs)
|
||||
else:
|
||||
model.get_ref("tok2vec").initialize()
|
||||
inferred_nO = _infer_nO(Y)
|
||||
if inferred_nO is not None:
|
||||
current_nO = model.maybe_get_dim("nO")
|
||||
if current_nO is None:
|
||||
model.set_dim("nO", inferred_nO)
|
||||
elif current_nO != inferred_nO:
|
||||
model.attrs["resize_output"](model, inferred_nO)
|
||||
nO = model.get_dim("nO")
|
||||
nP = model.get_dim("nP")
|
||||
nH = model.get_dim("nH")
|
||||
nI = model.get_dim("nI")
|
||||
nF = model.get_dim("nF")
|
||||
ops = model.ops
|
||||
|
||||
Wl = ops.alloc2f(nH * nP, nF * nI)
|
||||
bl = ops.alloc1f(nH * nP)
|
||||
padl = ops.alloc1f(nI)
|
||||
Wu = ops.alloc2f(nO, nH)
|
||||
bu = ops.alloc1f(nO)
|
||||
Wu = zero_init(ops, Wu.shape)
|
||||
# Wl = zero_init(ops, Wl.shape)
|
||||
Wl = glorot_uniform_init(ops, Wl.shape)
|
||||
padl = uniform_init(ops, padl.shape) # type: ignore
|
||||
# TODO: Experiment with whether better to initialize upper_W
|
||||
model.set_param("lower_W", Wl)
|
||||
model.set_param("lower_b", bl)
|
||||
model.set_param("lower_pad", padl)
|
||||
model.set_param("upper_W", Wu)
|
||||
model.set_param("upper_b", bu)
|
||||
# model = _lsuv_init(model)
|
||||
return model
|
||||
|
||||
|
||||
def forward(model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool):
|
||||
nF = model.get_dim("nF")
|
||||
tok2vec = model.get_ref("tok2vec")
|
||||
lower_pad = model.get_param("lower_pad")
|
||||
lower_W = model.get_param("lower_W")
|
||||
lower_b = model.get_param("lower_b")
|
||||
upper_W = model.get_param("upper_W")
|
||||
upper_b = model.get_param("upper_b")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nO = model.get_dim("nO")
|
||||
nI = model.get_dim("nI")
|
||||
|
||||
ops = model.ops
|
||||
docs, moves = docs_moves
|
||||
states = moves.init_batch(docs)
|
||||
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
|
||||
tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
|
||||
feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
|
||||
all_ids = []
|
||||
all_which = []
|
||||
all_statevecs = []
|
||||
all_scores = []
|
||||
next_states = [s for s in states if not s.is_final()]
|
||||
unseen_mask = _get_unseen_mask(model)
|
||||
ids = numpy.zeros((len(states), nF), dtype="i")
|
||||
arange = model.ops.xp.arange(nF)
|
||||
while next_states:
|
||||
ids = ids[: len(next_states)]
|
||||
for i, state in enumerate(next_states):
|
||||
state.set_context_tokens(ids, i, nF)
|
||||
# Sum the state features, add the bias and apply the activation (maxout)
|
||||
# to create the state vectors.
|
||||
preacts2f = feats[ids, arange].sum(axis=1) # type: ignore
|
||||
preacts2f += lower_b
|
||||
preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
|
||||
assert preacts.shape[0] == len(next_states), preacts.shape
|
||||
statevecs, which = ops.maxout(preacts)
|
||||
# Multiply the state-vector by the scores weights and add the bias,
|
||||
# to get the logits.
|
||||
scores = model.ops.gemm(statevecs, upper_W, trans2=True)
|
||||
scores += upper_b
|
||||
scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores)
|
||||
# Transition the states, filtering out any that are finished.
|
||||
next_states = moves.transition_states(next_states, scores)
|
||||
all_scores.append(scores)
|
||||
if is_train:
|
||||
# Remember intermediate results for the backprop.
|
||||
all_ids.append(ids.copy())
|
||||
all_statevecs.append(statevecs)
|
||||
all_which.append(which)
|
||||
|
||||
def backprop_parser(d_states_d_scores):
|
||||
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
|
||||
ids = model.ops.xp.vstack(all_ids)
|
||||
which = ops.xp.vstack(all_which)
|
||||
statevecs = model.ops.xp.vstack(all_statevecs)
|
||||
_, d_scores = d_states_d_scores
|
||||
if model.attrs.get("unseen_classes"):
|
||||
# If we have a negative gradient (i.e. the probability should
|
||||
# increase) on any classes we filtered out as unseen, mark
|
||||
# them as seen.
|
||||
for clas in set(model.attrs["unseen_classes"]):
|
||||
if (d_scores[:, clas] < 0).any():
|
||||
model.attrs["unseen_classes"].remove(clas)
|
||||
d_scores *= unseen_mask
|
||||
# Calculate the gradients for the parameters of the upper layer.
|
||||
# The weight gemm is (nS, nO) @ (nS, nH).T
|
||||
model.inc_grad("upper_b", d_scores.sum(axis=0))
|
||||
model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True))
|
||||
# Now calculate d_statevecs, by backproping through the upper linear layer.
|
||||
# This gemm is (nS, nO) @ (nO, nH)
|
||||
d_statevecs = model.ops.gemm(d_scores, upper_W)
|
||||
# Backprop through the maxout activation
|
||||
d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
|
||||
d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
|
||||
model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
|
||||
# We don't need to backprop the summation, because we pass back the IDs instead
|
||||
d_state_features = backprop_feats((d_preacts2f, ids))
|
||||
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
|
||||
model.ops.scatter_add(d_tokvecs, ids, d_state_features)
|
||||
model.inc_grad("lower_pad", d_tokvecs[-1])
|
||||
return (backprop_tok2vec(d_tokvecs[:-1]), None)
|
||||
|
||||
return (states, all_scores), backprop_parser
|
||||
|
||||
|
||||
def _forward_reference(
|
||||
model, docs_moves: Tuple[List[Doc], TransitionSystem], is_train: bool
|
||||
):
|
||||
"""Slow reference implementation, without the precomputation"""
|
||||
nF = model.get_dim("nF")
|
||||
tok2vec = model.get_ref("tok2vec")
|
||||
lower_pad = model.get_param("lower_pad")
|
||||
lower_W = model.get_param("lower_W")
|
||||
lower_b = model.get_param("lower_b")
|
||||
upper_W = model.get_param("upper_W")
|
||||
upper_b = model.get_param("upper_b")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nO = model.get_dim("nO")
|
||||
nI = model.get_dim("nI")
|
||||
|
||||
ops = model.ops
|
||||
docs, moves = docs_moves
|
||||
states = moves.init_batch(docs)
|
||||
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
|
||||
tokvecs = model.ops.xp.vstack((tokvecs, lower_pad))
|
||||
all_ids = []
|
||||
all_which = []
|
||||
all_statevecs = []
|
||||
all_scores = []
|
||||
all_tokfeats = []
|
||||
next_states = [s for s in states if not s.is_final()]
|
||||
unseen_mask = _get_unseen_mask(model)
|
||||
ids = numpy.zeros((len(states), nF), dtype="i")
|
||||
while next_states:
|
||||
ids = ids[: len(next_states)]
|
||||
for i, state in enumerate(next_states):
|
||||
state.set_context_tokens(ids, i, nF)
|
||||
# Sum the state features, add the bias and apply the activation (maxout)
|
||||
# to create the state vectors.
|
||||
tokfeats3f = tokvecs[ids]
|
||||
tokfeats = model.ops.reshape2f(tokfeats3f, tokfeats3f.shape[0], -1)
|
||||
preacts2f = model.ops.gemm(tokfeats, lower_W, trans2=True)
|
||||
preacts2f += lower_b
|
||||
preacts = model.ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
|
||||
statevecs, which = ops.maxout(preacts)
|
||||
# Multiply the state-vector by the scores weights and add the bias,
|
||||
# to get the logits.
|
||||
scores = model.ops.gemm(statevecs, upper_W, trans2=True)
|
||||
scores += upper_b
|
||||
scores[:, unseen_mask == 0] = model.ops.xp.nanmin(scores)
|
||||
# Transition the states, filtering out any that are finished.
|
||||
next_states = moves.transition_states(next_states, scores)
|
||||
all_scores.append(scores)
|
||||
if is_train:
|
||||
# Remember intermediate results for the backprop.
|
||||
all_tokfeats.append(tokfeats)
|
||||
all_ids.append(ids.copy())
|
||||
all_statevecs.append(statevecs)
|
||||
all_which.append(which)
|
||||
|
||||
nS = sum(len(s.history) for s in states)
|
||||
|
||||
def backprop_parser(d_states_d_scores):
|
||||
d_tokvecs = model.ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
|
||||
ids = model.ops.xp.vstack(all_ids)
|
||||
which = ops.xp.vstack(all_which)
|
||||
statevecs = model.ops.xp.vstack(all_statevecs)
|
||||
tokfeats = model.ops.xp.vstack(all_tokfeats)
|
||||
_, d_scores = d_states_d_scores
|
||||
if model.attrs.get("unseen_classes"):
|
||||
# If we have a negative gradient (i.e. the probability should
|
||||
# increase) on any classes we filtered out as unseen, mark
|
||||
# them as seen.
|
||||
for clas in set(model.attrs["unseen_classes"]):
|
||||
if (d_scores[:, clas] < 0).any():
|
||||
model.attrs["unseen_classes"].remove(clas)
|
||||
d_scores *= unseen_mask
|
||||
assert statevecs.shape == (nS, nH), statevecs.shape
|
||||
assert d_scores.shape == (nS, nO), d_scores.shape
|
||||
# Calculate the gradients for the parameters of the upper layer.
|
||||
# The weight gemm is (nS, nO) @ (nS, nH).T
|
||||
model.inc_grad("upper_b", d_scores.sum(axis=0))
|
||||
model.inc_grad("upper_W", model.ops.gemm(d_scores, statevecs, trans1=True))
|
||||
# Now calculate d_statevecs, by backproping through the upper linear layer.
|
||||
# This gemm is (nS, nO) @ (nO, nH)
|
||||
d_statevecs = model.ops.gemm(d_scores, upper_W)
|
||||
# Backprop through the maxout activation
|
||||
d_preacts = model.ops.backprop_maxout(d_statevecs, which, nP)
|
||||
d_preacts2f = model.ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
|
||||
# Now increment the gradients for the lower layer.
|
||||
# The gemm here is (nS, nH*nP) @ (nS, nF*nI)
|
||||
model.inc_grad("lower_b", d_preacts2f.sum(axis=0))
|
||||
model.inc_grad("lower_W", model.ops.gemm(d_preacts2f, tokfeats, trans1=True))
|
||||
# Caclulate d_tokfeats
|
||||
# The gemm here is (nS, nH*nP) @ (nH*nP, nF*nI)
|
||||
d_tokfeats = model.ops.gemm(d_preacts2f, lower_W)
|
||||
# Get the gradients of the tokvecs and the padding
|
||||
d_tokfeats3f = model.ops.reshape3f(d_tokfeats, nS, nF, nI)
|
||||
model.ops.scatter_add(d_tokvecs, ids, d_tokfeats3f)
|
||||
model.inc_grad("lower_pad", d_tokvecs[-1])
|
||||
return (backprop_tok2vec(d_tokvecs[:-1]), None)
|
||||
|
||||
return (states, all_scores), backprop_parser
|
||||
|
||||
|
||||
def _get_unseen_mask(model: Model) -> Floats1d:
|
||||
mask = model.ops.alloc1f(model.get_dim("nO"))
|
||||
mask.fill(1)
|
||||
for class_ in model.attrs.get("unseen_classes", set()):
|
||||
mask[class_] = 0
|
||||
return mask
|
||||
|
||||
|
||||
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
|
||||
W: Floats2d = model.get_param("lower_W")
|
||||
nF = model.get_dim("nF")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
# The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH *nP, nI)
|
||||
W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
|
||||
W3f = W3f.transpose((1, 0, 2))
|
||||
W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
|
||||
assert X.shape == (X.shape[0], nI), X.shape
|
||||
Yf_ = model.ops.gemm(X, W2f, trans2=True)
|
||||
Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
|
||||
|
||||
def backward(dY_ids: Tuple[Floats3d, Ints2d]):
|
||||
# This backprop is particularly tricky, because we get back a different
|
||||
# thing from what we put out. We put out an array of shape:
|
||||
# (nB, nF, nH, nP), and get back:
|
||||
# (nB, nH, nP) and ids (nB, nF)
|
||||
# The ids tell us the values of nF, so we would have:
|
||||
#
|
||||
# dYf = zeros((nB, nF, nH, nP))
|
||||
# for b in range(nB):
|
||||
# for f in range(nF):
|
||||
# dYf[b, ids[b, f]] += dY[b]
|
||||
#
|
||||
# However, we avoid building that array for efficiency -- and just pass
|
||||
# in the indices.
|
||||
dY, ids = dY_ids
|
||||
dXf = model.ops.gemm(dY, W)
|
||||
Xf = X[ids].reshape((ids.shape[0], -1))
|
||||
dW = model.ops.gemm(dY, Xf, trans1=True)
|
||||
model.inc_grad("lower_W", dW)
|
||||
return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
|
||||
|
||||
return Yf, backward
|
||||
|
||||
|
||||
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
|
||||
if Y is None:
|
||||
return None
|
||||
_, scores = Y
|
||||
if len(scores) == 0:
|
||||
return None
|
||||
assert scores[0].shape[0] >= 1
|
||||
assert len(scores[0].shape) == 2
|
||||
return scores[0].shape[1]
|
||||
|
||||
|
||||
def _lsuv_init(model: Model):
|
||||
"""This is like the 'layer sequential unit variance', but instead
|
||||
of taking the actual inputs, we randomly generate whitened data.
|
||||
|
||||
Why's this all so complicated? We have a huge number of inputs,
|
||||
and the maxout unit makes guessing the dynamics tricky. Instead
|
||||
we set the maxout weights to values that empirically result in
|
||||
whitened outputs given whitened inputs.
|
||||
"""
|
||||
W = model.maybe_get_param("lower_W")
|
||||
if W is not None and W.any():
|
||||
return
|
||||
|
||||
nF = model.get_dim("nF")
|
||||
nH = model.get_dim("nH")
|
||||
nP = model.get_dim("nP")
|
||||
nI = model.get_dim("nI")
|
||||
W = model.ops.alloc4f(nF, nH, nP, nI)
|
||||
b = model.ops.alloc2f(nH, nP)
|
||||
pad = model.ops.alloc4f(1, nF, nH, nP)
|
||||
|
||||
ops = model.ops
|
||||
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
|
||||
pad = normal_init(ops, pad.shape, mean=1.0)
|
||||
model.set_param("W", W)
|
||||
model.set_param("b", b)
|
||||
model.set_param("pad", pad)
|
||||
|
||||
ids = ops.alloc_f((5000, nF), dtype="f")
|
||||
ids += ops.xp.random.uniform(0, 1000, ids.shape)
|
||||
ids = ops.asarray(ids, dtype="i")
|
||||
tokvecs = ops.alloc_f((5000, nI), dtype="f")
|
||||
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
|
||||
tokvecs.shape
|
||||
)
|
||||
|
||||
return step_model, step_model.finish_steps
|
||||
def predict(ids, tokvecs):
|
||||
# nS ids. nW tokvecs. Exclude the padding array.
|
||||
hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
|
||||
vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
|
||||
# need nS vectors
|
||||
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
|
||||
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
|
||||
vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
|
||||
vectors3f += b
|
||||
return model.ops.maxout(vectors3f)[0]
|
||||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
model.get_ref("tok2vec").initialize(X=X)
|
||||
lower = model.get_ref("lower")
|
||||
lower.initialize()
|
||||
if model.attrs["has_upper"]:
|
||||
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
||||
model.get_ref("upper").initialize(X=statevecs)
|
||||
tol_var = 0.01
|
||||
tol_mean = 0.01
|
||||
t_max = 10
|
||||
W = cast(Floats4d, model.get_param("lower_W").copy())
|
||||
b = cast(Floats2d, model.get_param("lower_b").copy())
|
||||
for t_i in range(t_max):
|
||||
acts1 = predict(ids, tokvecs)
|
||||
var = model.ops.xp.var(acts1)
|
||||
mean = model.ops.xp.mean(acts1)
|
||||
if abs(var - 1.0) >= tol_var:
|
||||
W /= model.ops.xp.sqrt(var)
|
||||
model.set_param("lower_W", W)
|
||||
elif abs(mean) >= tol_mean:
|
||||
b -= mean
|
||||
model.set_param("lower_b", b)
|
||||
else:
|
||||
break
|
||||
return model
|
||||
|
|
|
@ -33,6 +33,7 @@ cdef cppclass StateC:
|
|||
vector[ArcC] _left_arcs
|
||||
vector[ArcC] _right_arcs
|
||||
vector[libcpp.bool] _unshiftable
|
||||
vector[int] history
|
||||
set[int] _sent_starts
|
||||
TokenC _empty_token
|
||||
int length
|
||||
|
@ -387,3 +388,4 @@ cdef cppclass StateC:
|
|||
this._b_i = src._b_i
|
||||
this.offset = src.offset
|
||||
this._empty_token = src._empty_token
|
||||
this.history = src.history
|
||||
|
|
|
@ -772,6 +772,8 @@ cdef class ArcEager(TransitionSystem):
|
|||
return list(arcs)
|
||||
|
||||
def has_gold(self, Example eg, start=0, end=None):
|
||||
if end is not None and end < 0:
|
||||
end = None
|
||||
for word in eg.y[start:end]:
|
||||
if word.dep != 0:
|
||||
return True
|
||||
|
@ -857,6 +859,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
state.print_state()
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
failed = False
|
||||
|
|
|
@ -157,7 +157,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if token.ent_type:
|
||||
labels.add(token.ent_type_)
|
||||
return labels
|
||||
|
||||
|
||||
def move_name(self, int move, attr_t label):
|
||||
if move == OUT:
|
||||
return 'O'
|
||||
|
@ -307,6 +307,8 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
for span in eg.y.spans.get(neg_key, []):
|
||||
if span.start >= start and span.end <= end:
|
||||
return True
|
||||
if end is not None and end < 0:
|
||||
end = None
|
||||
for word in eg.y[start:end]:
|
||||
if word.ent_iob != 0:
|
||||
return True
|
||||
|
@ -387,9 +389,9 @@ cdef class Begin:
|
|||
elif st.B_(1).ent_iob == 3:
|
||||
# If the next word is B, we can't B now
|
||||
return False
|
||||
elif st.B_(1).sent_start == 1:
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
return False
|
||||
#elif st.B_(1).sent_start == 1:
|
||||
# # Don't allow entities to extend across sentence boundaries
|
||||
# return False
|
||||
# Don't allow entities to start on whitespace
|
||||
elif Lexeme.get_struct_attr(st.B_(0).lex, IS_SPACE):
|
||||
return False
|
||||
|
@ -465,9 +467,9 @@ cdef class In:
|
|||
# Otherwise, force acceptance, even if we're across a sentence
|
||||
# boundary or the token is whitespace.
|
||||
return True
|
||||
elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
||||
# Don't allow entities to extend across sentence boundaries
|
||||
return False
|
||||
#elif st.B(1) != -1 and st.B_(1).sent_start == 1:
|
||||
# # Don't allow entities to extend across sentence boundaries
|
||||
# return False
|
||||
else:
|
||||
return True
|
||||
|
||||
|
@ -643,7 +645,7 @@ cdef class Unit:
|
|||
cost += 1
|
||||
break
|
||||
return cost
|
||||
|
||||
|
||||
|
||||
|
||||
cdef class Out:
|
||||
|
|
|
@ -20,6 +20,10 @@ cdef class StateClass:
|
|||
if self._borrowed != 1:
|
||||
del self.c
|
||||
|
||||
@property
|
||||
def history(self):
|
||||
return list(self.c.history)
|
||||
|
||||
@property
|
||||
def stack(self):
|
||||
return [self.S(i) for i in range(self.c.stack_depth())]
|
||||
|
@ -176,3 +180,6 @@ cdef class StateClass:
|
|||
|
||||
def clone(self, StateClass src):
|
||||
self.c.clone(src.c)
|
||||
|
||||
def set_context_tokens(self, int[:, :] output, int row, int n_feats):
|
||||
self.c.set_context_tokens(&output[row, 0], n_feats)
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
# cython: infer_types=True
|
||||
from __future__ import print_function
|
||||
from cymem.cymem cimport Pool
|
||||
from libc.stdlib cimport calloc, free
|
||||
from libcpp.vector cimport vector
|
||||
|
||||
from collections import Counter
|
||||
import srsly
|
||||
|
@ -73,7 +75,18 @@ cdef class TransitionSystem:
|
|||
offset += len(doc)
|
||||
return states
|
||||
|
||||
def follow_history(self, doc, history):
|
||||
cdef int clas
|
||||
cdef StateClass state = StateClass(doc)
|
||||
for clas in history:
|
||||
action = self.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
return state
|
||||
|
||||
def get_oracle_sequence(self, Example example, _debug=False):
|
||||
if not self.has_gold(example):
|
||||
return []
|
||||
states, golds, _ = self.init_gold_batch([example])
|
||||
if not states:
|
||||
return []
|
||||
|
@ -85,6 +98,8 @@ cdef class TransitionSystem:
|
|||
return self.get_oracle_sequence_from_state(state, gold)
|
||||
|
||||
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
|
||||
if state.is_final():
|
||||
return []
|
||||
cdef Pool mem = Pool()
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.n_moves > 0
|
||||
|
@ -110,6 +125,7 @@ cdef class TransitionSystem:
|
|||
"S0 head?", str(state.has_head(state.S(0))),
|
||||
)))
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(i)
|
||||
break
|
||||
else:
|
||||
if _debug:
|
||||
|
@ -137,6 +153,17 @@ cdef class TransitionSystem:
|
|||
raise ValueError(Errors.E170.format(name=name))
|
||||
action = self.lookup_transition(name)
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(action.clas)
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
assert len(states) == scores.shape[0]
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef Transition lookup_transition(self, object name) except *:
|
||||
raise NotImplementedError
|
||||
|
@ -250,3 +277,31 @@ cdef class TransitionSystem:
|
|||
self.cfg.update(msg['cfg'])
|
||||
self.initialize_actions(labels)
|
||||
return self
|
||||
|
||||
|
||||
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
states[i].history.push_back(guess)
|
||||
free(is_valid)
|
||||
|
||||
|
||||
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
|
||||
cdef int best = -1
|
||||
for i in range(n):
|
||||
if is_valid[i] >= 1:
|
||||
if best == -1 or scores[i] > scores[best]:
|
||||
best = i
|
||||
return best
|
||||
|
|
|
@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.arc_eager cimport ArcEager
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.arc_eager import ArcEager
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language
|
||||
|
@ -17,12 +17,11 @@ from ..util import registry
|
|||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -122,6 +121,7 @@ def make_parser(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_parser",
|
||||
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
|
||||
|
@ -227,6 +227,7 @@ def parser_score(examples, **kwargs):
|
|||
|
||||
DOCS: https://spacy.io/api/dependencyparser#score
|
||||
"""
|
||||
|
||||
def has_sents(doc):
|
||||
return doc.has_annotation("SENT_START")
|
||||
|
||||
|
@ -234,8 +235,11 @@ def parser_score(examples, **kwargs):
|
|||
dep = getattr(token, attr)
|
||||
dep = token.vocab.strings.as_string(dep).lower()
|
||||
return dep
|
||||
|
||||
results = {}
|
||||
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
|
||||
results.update(
|
||||
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
|
||||
)
|
||||
kwargs.setdefault("getter", dep_getter)
|
||||
kwargs.setdefault("ignore_labels", ("p", "punct"))
|
||||
results.update(Scorer.score_deps(examples, "dep", **kwargs))
|
||||
|
@ -248,11 +252,12 @@ def make_parser_scorer():
|
|||
return parser_score
|
||||
|
||||
|
||||
cdef class DependencyParser(Parser):
|
||||
class DependencyParser(Parser):
|
||||
"""Pipeline component for dependency parsing.
|
||||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
"""
|
||||
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
def __init__(
|
||||
|
@ -272,8 +277,7 @@ cdef class DependencyParser(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=parser_score,
|
||||
):
|
||||
"""Create a DependencyParser.
|
||||
"""
|
||||
"""Create a DependencyParser."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
|
@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
|
|||
from thinc.api import Model, Config
|
||||
|
||||
from ._parser_internals.transition_system import TransitionSystem
|
||||
from .transition_parser cimport Parser
|
||||
from ._parser_internals.ner cimport BiluoPushDown
|
||||
from .transition_parser import Parser
|
||||
from ._parser_internals.ner import BiluoPushDown
|
||||
|
||||
from ..language import Language
|
||||
from ..scorer import get_ner_prf, PRFScore
|
||||
from ..training import validate_examples
|
||||
from ..util import registry
|
||||
|
||||
|
||||
default_model_config = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"incorrect_spans_key": None,
|
||||
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_ner(
|
||||
nlp: Language,
|
||||
|
@ -98,6 +102,7 @@ def make_ner(
|
|||
scorer=scorer,
|
||||
)
|
||||
|
||||
|
||||
@Language.factory(
|
||||
"beam_ner",
|
||||
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
|
@ -111,7 +116,12 @@ def make_ner(
|
|||
"incorrect_spans_key": None,
|
||||
"scorer": None,
|
||||
},
|
||||
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
|
||||
default_score_weights={
|
||||
"ents_f": 1.0,
|
||||
"ents_p": 0.0,
|
||||
"ents_r": 0.0,
|
||||
"ents_per_type": None,
|
||||
},
|
||||
)
|
||||
def make_beam_ner(
|
||||
nlp: Language,
|
||||
|
@ -185,11 +195,12 @@ def make_ner_scorer():
|
|||
return ner_score
|
||||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
class EntityRecognizer(Parser):
|
||||
"""Pipeline component for named entity recognition.
|
||||
|
||||
DOCS: https://spacy.io/api/entityrecognizer
|
||||
"""
|
||||
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
def __init__(
|
||||
|
@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
|
|||
incorrect_spans_key=None,
|
||||
scorer=ner_score,
|
||||
):
|
||||
"""Create an EntityRecognizer.
|
||||
"""
|
||||
"""Create an EntityRecognizer."""
|
||||
super().__init__(
|
||||
vocab,
|
||||
model,
|
||||
name,
|
||||
moves,
|
||||
update_with_oracle_cut_size=update_with_oracle_cut_size,
|
||||
min_action_freq=1, # not relevant for NER
|
||||
min_action_freq=1, # not relevant for NER
|
||||
learn_tokens=False, # not relevant for NER
|
||||
beam_width=beam_width,
|
||||
beam_density=beam_density,
|
||||
|
@ -242,8 +252,11 @@ cdef class EntityRecognizer(Parser):
|
|||
def labels(self):
|
||||
# Get the labels from the model by looking at the available moves, e.g.
|
||||
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
||||
labels = set(move.split("-")[1] for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U"))
|
||||
labels = set(
|
||||
move.split("-")[1]
|
||||
for move in self.move_names
|
||||
if move[0] in ("B", "I", "L", "U")
|
||||
)
|
||||
return tuple(sorted(labels))
|
||||
|
||||
def scored_ents(self, beams):
|
|
@ -1,19 +0,0 @@
|
|||
from cymem.cymem cimport Pool
|
||||
|
||||
from ..vocab cimport Vocab
|
||||
from .trainable_pipe cimport TrainablePipe
|
||||
from ._parser_internals.transition_system cimport Transition, TransitionSystem
|
||||
from ._parser_internals._state cimport StateC
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
cdef public object _rehearsal_model
|
||||
cdef readonly TransitionSystem moves
|
||||
cdef public object _multitasks
|
||||
|
||||
cdef void _parseC(self, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil
|
|
@ -7,30 +7,29 @@ from libcpp.vector cimport vector
|
|||
from libc.string cimport memset, memcpy
|
||||
from libc.stdlib cimport calloc, free
|
||||
import random
|
||||
import contextlib
|
||||
|
||||
import srsly
|
||||
from thinc.api import set_dropout_rate, CupyOps
|
||||
from thinc.api import set_dropout_rate, CupyOps, get_array_module
|
||||
from thinc.extra.search cimport Beam
|
||||
import numpy.random
|
||||
import numpy
|
||||
import warnings
|
||||
|
||||
from ._parser_internals.stateclass cimport StateClass
|
||||
from ..ml.parser_model cimport alloc_activations, free_activations
|
||||
from ..ml.parser_model cimport predict_states, arg_max_if_valid
|
||||
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
|
||||
from ..ml.parser_model cimport get_c_weights, get_c_sizes
|
||||
from ..tokens.doc cimport Doc
|
||||
from .trainable_pipe import TrainablePipe
|
||||
from ._parser_internals cimport _beam_utils
|
||||
from ._parser_internals import _beam_utils
|
||||
from ..vocab cimport Vocab
|
||||
from ._parser_internals.transition_system cimport TransitionSystem
|
||||
|
||||
from ..training import validate_examples, validate_get_examples
|
||||
from ..errors import Errors, Warnings
|
||||
from .. import util
|
||||
|
||||
|
||||
cdef class Parser(TrainablePipe):
|
||||
class Parser(TrainablePipe):
|
||||
"""
|
||||
Base class of the DependencyParser and EntityRecognizer.
|
||||
"""
|
||||
|
@ -129,8 +128,9 @@ cdef class Parser(TrainablePipe):
|
|||
@property
|
||||
def move_names(self):
|
||||
names = []
|
||||
cdef TransitionSystem moves = self.moves
|
||||
for i in range(self.moves.n_moves):
|
||||
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
|
||||
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
|
||||
# Explicitly removing the internal "U-" token used for blocking entities
|
||||
if name != "U-":
|
||||
names.append(name)
|
||||
|
@ -219,9 +219,6 @@ cdef class Parser(TrainablePipe):
|
|||
|
||||
stream: The sequence of documents to process.
|
||||
batch_size (int): Number of documents to accumulate into a working set.
|
||||
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
|
||||
deals with a failing batch of documents. The default function just reraises
|
||||
the exception.
|
||||
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
|
@ -243,79 +240,27 @@ cdef class Parser(TrainablePipe):
|
|||
def predict(self, docs):
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
self._ensure_labels_are_added(docs)
|
||||
if not any(len(doc) for doc in docs):
|
||||
result = self.moves.init_batch(docs)
|
||||
return result
|
||||
if self.cfg["beam_width"] == 1:
|
||||
return self.greedy_parse(docs, drop=0.0)
|
||||
else:
|
||||
return self.beam_parse(
|
||||
docs,
|
||||
drop=0.0,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
states_or_beams, _ = self.model.predict((docs, self.moves))
|
||||
return states_or_beams
|
||||
|
||||
def greedy_parse(self, docs, drop=0.):
|
||||
cdef vector[StateC*] states
|
||||
cdef StateClass state
|
||||
self._ensure_labels_are_added(docs)
|
||||
set_dropout_rate(self.model, drop)
|
||||
batch = self.moves.init_batch(docs)
|
||||
model = self.model.predict(docs)
|
||||
weights = get_c_weights(model)
|
||||
for state in batch:
|
||||
if not state.is_final():
|
||||
states.push_back(state.c)
|
||||
sizes = get_c_sizes(model, states.size())
|
||||
with nogil:
|
||||
self._parseC(&states[0],
|
||||
weights, sizes)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return batch
|
||||
# TODO: Deprecated
|
||||
self._resize()
|
||||
with _change_attrs(self.model, beam_width=1):
|
||||
states, _ = self.model.predict((docs, self.moves))
|
||||
return states
|
||||
|
||||
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
|
||||
cdef Beam beam
|
||||
cdef Doc doc
|
||||
self._ensure_labels_are_added(docs)
|
||||
batch = _beam_utils.BeamBatch(
|
||||
self.moves,
|
||||
self.moves.init_batch(docs),
|
||||
None,
|
||||
beam_width,
|
||||
density=beam_density
|
||||
)
|
||||
model = self.model.predict(docs)
|
||||
while not batch.is_done:
|
||||
states = batch.get_unfinished_states()
|
||||
if not states:
|
||||
break
|
||||
scores = model.predict(states)
|
||||
batch.advance(scores)
|
||||
model.clear_memory()
|
||||
del model
|
||||
return list(batch)
|
||||
|
||||
cdef void _parseC(self, StateC** states,
|
||||
WeightsC weights, SizesC sizes) nogil:
|
||||
cdef int i, j
|
||||
cdef vector[StateC*] unfinished
|
||||
cdef ActivationsC activations = alloc_activations(sizes)
|
||||
while sizes.states >= 1:
|
||||
predict_states(&activations,
|
||||
states, &weights, sizes)
|
||||
# Validate actions, argmax, take action.
|
||||
self.c_transition_batch(states,
|
||||
activations.scores, sizes.classes, sizes.states)
|
||||
for i in range(sizes.states):
|
||||
if not states[i].is_final():
|
||||
unfinished.push_back(states[i])
|
||||
for i in range(unfinished.size()):
|
||||
states[i] = unfinished[i]
|
||||
sizes.states = unfinished.size()
|
||||
unfinished.clear()
|
||||
free_activations(&activations)
|
||||
# TODO: Deprecated
|
||||
self._resize()
|
||||
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
|
||||
beams, _ = self.model.predict((docs, self.moves))
|
||||
return beams
|
||||
|
||||
def set_annotations(self, docs, states_or_beams):
|
||||
cdef StateClass state
|
||||
|
@ -327,35 +272,6 @@ cdef class Parser(TrainablePipe):
|
|||
for hook in self.postprocesses:
|
||||
hook(doc)
|
||||
|
||||
def transition_states(self, states, float[:, ::1] scores):
|
||||
cdef StateClass state
|
||||
cdef float* c_scores = &scores[0, 0]
|
||||
cdef vector[StateC*] c_states
|
||||
for state in states:
|
||||
c_states.push_back(state.c)
|
||||
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
|
||||
return [state for state in states if not state.c.is_final()]
|
||||
|
||||
cdef void c_transition_batch(self, StateC** states, const float* scores,
|
||||
int nr_class, int batch_size) nogil:
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
with gil:
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
|
||||
cdef int i, guess
|
||||
cdef Transition action
|
||||
for i in range(batch_size):
|
||||
self.moves.set_valid(is_valid, states[i])
|
||||
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
|
||||
if guess == -1:
|
||||
# This shouldn't happen, but it's hard to raise an error here,
|
||||
# and we don't want to infinite loop. So, force to end state.
|
||||
states[i].force_final()
|
||||
else:
|
||||
action = self.moves.c[guess]
|
||||
action.do(states[i], action.label)
|
||||
free(is_valid)
|
||||
|
||||
def update(self, examples, *, drop=0., sgd=None, losses=None):
|
||||
cdef StateClass state
|
||||
if losses is None:
|
||||
|
@ -367,166 +283,88 @@ cdef class Parser(TrainablePipe):
|
|||
)
|
||||
for multitask in self._multitasks:
|
||||
multitask.update(examples, drop=drop, sgd=sgd)
|
||||
# We need to take care to act on the whole batch, because we might be
|
||||
# getting vectors via a listener.
|
||||
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
|
||||
if n_examples == 0:
|
||||
return losses
|
||||
set_dropout_rate(self.model, drop)
|
||||
# The probability we use beam update, instead of falling back to
|
||||
# a greedy update
|
||||
beam_update_prob = self.cfg["beam_update_prob"]
|
||||
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
|
||||
return self.update_beam(
|
||||
examples,
|
||||
beam_width=self.cfg["beam_width"],
|
||||
sgd=sgd,
|
||||
losses=losses,
|
||||
beam_density=self.cfg["beam_density"]
|
||||
)
|
||||
max_moves = self.cfg["update_with_oracle_cut_size"]
|
||||
if max_moves >= 1:
|
||||
# Chop sequences into lengths of this many words, to make the
|
||||
# batch uniform length.
|
||||
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
|
||||
states, golds, _ = self._init_gold_batch(
|
||||
examples,
|
||||
max_length=max_moves
|
||||
)
|
||||
else:
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
docs = [eg.x for eg in examples if len(eg.x)]
|
||||
(states, scores), backprop_scores = self.model.begin_update((docs, self.moves))
|
||||
if sum(s.shape[0] for s in scores) == 0:
|
||||
return losses
|
||||
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
|
||||
|
||||
all_states = list(states)
|
||||
states_golds = list(zip(states, golds))
|
||||
n_moves = 0
|
||||
while states_golds:
|
||||
states, golds = zip(*states_golds)
|
||||
scores, backprop = model.begin_update(states)
|
||||
d_scores = self.get_batch_loss(states, golds, scores, losses)
|
||||
# Note that the gradient isn't normalized by the batch size
|
||||
# here, because our "samples" are really the states...But we
|
||||
# can't normalize by the number of states either, as then we'd
|
||||
# be getting smaller gradients for states in long sequences.
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, scores)
|
||||
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
|
||||
if max_moves >= 1 and n_moves >= max_moves:
|
||||
break
|
||||
n_moves += 1
|
||||
|
||||
backprop_tok2vec(golds)
|
||||
d_scores = self.get_loss((states, scores), examples)
|
||||
backprop_scores((states, d_scores))
|
||||
if sgd not in (None, False):
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
# Ugh, this is annoying. If we're working on GPU, we want to free the
|
||||
# memory ASAP. It seems that Python doesn't necessarily get around to
|
||||
# removing these in time if we don't explicitly delete? It's confusing.
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
del model
|
||||
del backprop_scores
|
||||
return losses
|
||||
|
||||
def get_loss(self, states_scores, examples):
|
||||
states, scores = states_scores
|
||||
scores = self.model.ops.xp.vstack(scores)
|
||||
costs = self._get_costs_from_histories(
|
||||
examples,
|
||||
[list(state.history) for state in states]
|
||||
)
|
||||
xp = get_array_module(scores)
|
||||
best_costs = costs.min(axis=1, keepdims=True)
|
||||
gscores = scores.copy()
|
||||
min_score = scores.min() - 1000
|
||||
assert costs.shape == scores.shape, (costs.shape, scores.shape)
|
||||
gscores[costs > best_costs] = min_score
|
||||
max_ = scores.max(axis=1, keepdims=True)
|
||||
gmax = gscores.max(axis=1, keepdims=True)
|
||||
exp_scores = xp.exp(scores - max_)
|
||||
exp_gscores = xp.exp(gscores - gmax)
|
||||
Z = exp_scores.sum(axis=1, keepdims=True)
|
||||
gZ = exp_gscores.sum(axis=1, keepdims=True)
|
||||
d_scores = exp_scores / Z
|
||||
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
|
||||
return d_scores
|
||||
|
||||
def _get_costs_from_histories(self, examples, histories):
|
||||
cdef TransitionSystem moves = self.moves
|
||||
cdef StateClass state
|
||||
cdef int clas
|
||||
cdef int nF = self.model.get_dim("nF")
|
||||
cdef int nO = moves.n_moves
|
||||
cdef int nS = sum([len(history) for history in histories])
|
||||
cdef Pool mem = Pool()
|
||||
is_valid = <int*>mem.alloc(nO, sizeof(int))
|
||||
c_costs = <float*>mem.alloc(nO, sizeof(float))
|
||||
states = moves.init_batch([eg.x for eg in examples])
|
||||
batch = []
|
||||
for eg, s, h in zip(examples, states, histories):
|
||||
if not s.is_final():
|
||||
gold = moves.init_gold(s, eg)
|
||||
batch.append((eg, s, h, gold))
|
||||
output = []
|
||||
while batch:
|
||||
costs = numpy.zeros((len(batch), nO), dtype="f")
|
||||
for i, (eg, state, history, gold) in enumerate(batch):
|
||||
clas = history.pop(0)
|
||||
moves.set_costs(is_valid, c_costs, state.c, gold)
|
||||
action = moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
state.c.history.push_back(clas)
|
||||
for j in range(nO):
|
||||
costs[i, j] = c_costs[j]
|
||||
output.append(costs)
|
||||
batch = [(eg, s, h, g) for eg, s, h, g in batch if len(h) != 0]
|
||||
return self.model.ops.xp.vstack(output)
|
||||
|
||||
def rehearse(self, examples, sgd=None, losses=None, **cfg):
|
||||
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
|
||||
if losses is None:
|
||||
losses = {}
|
||||
for multitask in self._multitasks:
|
||||
if hasattr(multitask, 'rehearse'):
|
||||
multitask.rehearse(examples, losses=losses, sgd=sgd)
|
||||
if self._rehearsal_model is None:
|
||||
return None
|
||||
losses.setdefault(self.name, 0.)
|
||||
validate_examples(examples, "Parser.rehearse")
|
||||
docs = [eg.predicted for eg in examples]
|
||||
states = self.moves.init_batch(docs)
|
||||
# This is pretty dirty, but the NER can resize itself in init_batch,
|
||||
# if labels are missing. We therefore have to check whether we need to
|
||||
# expand our model output.
|
||||
self._resize()
|
||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
set_dropout_rate(self._rehearsal_model, 0.0)
|
||||
set_dropout_rate(self.model, 0.0)
|
||||
tutor, _ = self._rehearsal_model.begin_update(docs)
|
||||
model, backprop_tok2vec = self.model.begin_update(docs)
|
||||
n_scores = 0.
|
||||
loss = 0.
|
||||
while states:
|
||||
targets, _ = tutor.begin_update(states)
|
||||
guesses, backprop = model.begin_update(states)
|
||||
d_scores = (guesses - targets) / targets.shape[0]
|
||||
# If all weights for an output are 0 in the original model, don't
|
||||
# supervise that output. This allows us to add classes.
|
||||
loss += (d_scores**2).sum()
|
||||
backprop(d_scores)
|
||||
# Follow the predicted action
|
||||
self.transition_states(states, guesses)
|
||||
states = [state for state in states if not state.is_final()]
|
||||
n_scores += d_scores.size
|
||||
# Do the backprop
|
||||
backprop_tok2vec(docs)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
losses[self.name] += loss / n_scores
|
||||
del backprop
|
||||
del backprop_tok2vec
|
||||
model.clear_memory()
|
||||
tutor.clear_memory()
|
||||
del model
|
||||
del tutor
|
||||
return losses
|
||||
raise NotImplementedError
|
||||
|
||||
def update_beam(self, examples, *, beam_width,
|
||||
drop=0., sgd=None, losses=None, beam_density=0.0):
|
||||
states, golds, _ = self.moves.init_gold_batch(examples)
|
||||
if not states:
|
||||
return losses
|
||||
# Prepare the stepwise model, and get the callback for finishing the batch
|
||||
model, backprop_tok2vec = self.model.begin_update(
|
||||
[eg.predicted for eg in examples])
|
||||
loss = _beam_utils.update_beam(
|
||||
self.moves,
|
||||
states,
|
||||
golds,
|
||||
model,
|
||||
beam_width,
|
||||
beam_density=beam_density,
|
||||
)
|
||||
losses[self.name] += loss
|
||||
backprop_tok2vec(golds)
|
||||
if sgd is not None:
|
||||
self.finish_update(sgd)
|
||||
|
||||
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
|
||||
cdef StateClass state
|
||||
cdef Pool mem = Pool()
|
||||
cdef int i
|
||||
|
||||
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
|
||||
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
|
||||
|
||||
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
|
||||
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
|
||||
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
|
||||
dtype='f', order='C')
|
||||
c_d_scores = <float*>d_scores.data
|
||||
unseen_classes = self.model.attrs["unseen_classes"]
|
||||
for i, (state, gold) in enumerate(zip(states, golds)):
|
||||
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
|
||||
memset(costs, 0, self.moves.n_moves * sizeof(float))
|
||||
self.moves.set_costs(is_valid, costs, state.c, gold)
|
||||
for j in range(self.moves.n_moves):
|
||||
if costs[j] <= 0.0 and j in unseen_classes:
|
||||
unseen_classes.remove(j)
|
||||
cpu_log_loss(c_d_scores,
|
||||
costs, is_valid, &scores[i, 0], d_scores.shape[1])
|
||||
c_d_scores += d_scores.shape[1]
|
||||
# Note that we don't normalize this. See comment in update() for why.
|
||||
if losses is not None:
|
||||
losses.setdefault(self.name, 0.)
|
||||
losses[self.name] += (d_scores**2).sum()
|
||||
return d_scores
|
||||
raise NotImplementedError
|
||||
|
||||
def set_output(self, nO):
|
||||
self.model.attrs["resize_output"](self.model, nO)
|
||||
|
@ -565,7 +403,7 @@ cdef class Parser(TrainablePipe):
|
|||
for example in islice(get_examples(), 10):
|
||||
doc_sample.append(example.predicted)
|
||||
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
|
||||
self.model.initialize(doc_sample)
|
||||
self.model.initialize((doc_sample, self.moves))
|
||||
if nlp is not None:
|
||||
self.init_multitask_objectives(get_examples, nlp.pipeline)
|
||||
|
||||
|
@ -622,44 +460,18 @@ cdef class Parser(TrainablePipe):
|
|||
raise ValueError(Errors.E149) from None
|
||||
return self
|
||||
|
||||
def _init_gold_batch(self, examples, max_length):
|
||||
"""Make a square batch, of length equal to the shortest transition
|
||||
sequence or a cap. A long
|
||||
doc will get multiple states. Let's say we have a doc of length 2*N,
|
||||
where N is the shortest doc. We'll make two states, one representing
|
||||
long_doc[:N], and another representing long_doc[N:]."""
|
||||
cdef:
|
||||
StateClass start_state
|
||||
StateClass state
|
||||
Transition action
|
||||
all_states = self.moves.init_batch([eg.predicted for eg in examples])
|
||||
states = []
|
||||
golds = []
|
||||
to_cut = []
|
||||
for state, eg in zip(all_states, examples):
|
||||
if self.moves.has_gold(eg) and not state.is_final():
|
||||
gold = self.moves.init_gold(state, eg)
|
||||
if len(eg.x) < max_length:
|
||||
states.append(state)
|
||||
golds.append(gold)
|
||||
else:
|
||||
oracle_actions = self.moves.get_oracle_sequence_from_state(
|
||||
state.copy(), gold)
|
||||
to_cut.append((eg, state, gold, oracle_actions))
|
||||
if not to_cut:
|
||||
return states, golds, 0
|
||||
cdef int clas
|
||||
for eg, state, gold, oracle_actions in to_cut:
|
||||
for i in range(0, len(oracle_actions), max_length):
|
||||
start_state = state.copy()
|
||||
for clas in oracle_actions[i:i+max_length]:
|
||||
action = self.moves.c[clas]
|
||||
action.do(state.c, action.label)
|
||||
if state.is_final():
|
||||
break
|
||||
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
|
||||
states.append(start_state)
|
||||
golds.append(gold)
|
||||
if state.is_final():
|
||||
break
|
||||
return states, golds, max_length
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _change_attrs(model, **kwargs):
|
||||
"""Temporarily modify a thinc model's attributes."""
|
||||
unset = object()
|
||||
old_attrs = {}
|
||||
for key, value in kwargs.items():
|
||||
old_attrs[key] = model.attrs.get(key, unset)
|
||||
model.attrs[key] = value
|
||||
yield model
|
||||
for key, value in old_attrs.items():
|
||||
if value is unset:
|
||||
model.attrs.pop(key)
|
||||
else:
|
||||
model.attrs[key] = value
|
||||
|
|
|
@ -123,6 +123,7 @@ def test_ner_labels_added_implicitly_on_predict():
|
|||
assert "D" in ner.labels
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="Not yet supported")
|
||||
def test_ner_labels_added_implicitly_on_beam_parse():
|
||||
nlp = Language()
|
||||
ner = nlp.add_pipe("beam_ner")
|
||||
|
@ -134,6 +135,7 @@ def test_ner_labels_added_implicitly_on_beam_parse():
|
|||
assert "D" in ner.labels
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="greedy_parse is deprecated")
|
||||
def test_ner_labels_added_implicitly_on_greedy_parse():
|
||||
nlp = Language()
|
||||
ner = nlp.add_pipe("beam_ner")
|
||||
|
|
|
@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
|
|||
from spacy.training import Example, iob_to_biluo
|
||||
from spacy.tokens import Doc, Span
|
||||
from spacy.vocab import Vocab
|
||||
from thinc.api import fix_random_seed
|
||||
import logging
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
@ -180,6 +181,7 @@ def test_issue4267():
|
|||
assert token.ent_iob == 2
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
@pytest.mark.issue(4313)
|
||||
def test_issue4313():
|
||||
"""This should not crash or exit with some strange error code"""
|
||||
|
@ -391,7 +393,7 @@ def test_train_empty():
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
|
||||
ner = nlp.add_pipe("ner", last=True)
|
||||
ner.add_label("PERSON")
|
||||
nlp.initialize()
|
||||
nlp.initialize(get_examples=lambda: train_examples)
|
||||
for itn in range(2):
|
||||
losses = {}
|
||||
batches = util.minibatch(train_examples, size=8)
|
||||
|
@ -518,11 +520,11 @@ def test_block_ner():
|
|||
assert [token.ent_type_ for token in doc] == expected_types
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_upper", [True, False])
|
||||
def test_overfitting_IO(use_upper):
|
||||
def test_overfitting_IO():
|
||||
fix_random_seed(1)
|
||||
# Simple test to try and quickly overfit the NER component
|
||||
nlp = English()
|
||||
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
|
||||
ner = nlp.add_pipe("ner", config={"model": {}})
|
||||
train_examples = []
|
||||
for text, annotations in TRAIN_DATA:
|
||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
|
@ -533,7 +535,7 @@ def test_overfitting_IO(use_upper):
|
|||
for i in range(50):
|
||||
losses = {}
|
||||
nlp.update(train_examples, sgd=optimizer, losses=losses)
|
||||
assert losses["ner"] < 0.00001
|
||||
assert losses["ner"] < 0.001
|
||||
|
||||
# test the trained model
|
||||
test_text = "I like London."
|
||||
|
@ -554,7 +556,6 @@ def test_overfitting_IO(use_upper):
|
|||
assert ents2[0].label_ == "LOC"
|
||||
# Ensure that the predictions are still the same, even after adding a new label
|
||||
ner2 = nlp2.get_pipe("ner")
|
||||
assert ner2.model.attrs["has_upper"] == use_upper
|
||||
ner2.add_label("RANDOM_NEW_LABEL")
|
||||
doc3 = nlp2(test_text)
|
||||
ents3 = doc3.ents
|
||||
|
@ -596,6 +597,7 @@ def test_overfitting_IO(use_upper):
|
|||
assert ents[1].kb_id == 0
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
def test_beam_ner_scores():
|
||||
# Test that we can get confidence values out of the beam_ner pipe
|
||||
beam_width = 16
|
||||
|
@ -631,6 +633,7 @@ def test_beam_ner_scores():
|
|||
assert 0 - eps <= score <= 1 + eps
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
def test_beam_overfitting_IO(neg_key):
|
||||
# Simple test to try and quickly overfit the Beam NER component
|
||||
nlp = English()
|
||||
|
|
|
@ -118,6 +118,7 @@ def test_beam_advance_too_few_scores(beam, scores):
|
|||
beam.advance(scores[:-1])
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
def test_beam_parse(examples, beam_width):
|
||||
nlp = Language()
|
||||
parser = nlp.add_pipe("beam_parser")
|
||||
|
@ -128,6 +129,7 @@ def test_beam_parse(examples, beam_width):
|
|||
parser(doc)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
@hypothesis.given(hyp=hypothesis.strategies.data())
|
||||
def test_beam_density(moves, examples, beam_width, hyp):
|
||||
beam_density = float(hyp.draw(hypothesis.strategies.floats(0.0, 1.0, width=32)))
|
||||
|
|
|
@ -5,9 +5,11 @@ from thinc.api import Adam
|
|||
from spacy import registry, util
|
||||
from spacy.attrs import DEP, NORM
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc
|
||||
from spacy.training import Example
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
from spacy import util, registry
|
||||
from thinc.api import fix_random_seed
|
||||
|
||||
from ...pipeline import DependencyParser
|
||||
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
|
||||
|
@ -58,6 +60,8 @@ PARTIAL_DATA = [
|
|||
),
|
||||
]
|
||||
|
||||
PARSERS = ["parser"] # TODO: Test beam_parser when ready
|
||||
|
||||
eps = 0.1
|
||||
|
||||
|
||||
|
@ -318,7 +322,7 @@ def test_parser_constructor(en_vocab):
|
|||
DependencyParser(en_vocab, model)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
||||
@pytest.mark.parametrize("pipe_name", PARSERS)
|
||||
def test_incomplete_data(pipe_name):
|
||||
# Test that the parser works with incomplete information
|
||||
nlp = English()
|
||||
|
@ -344,8 +348,9 @@ def test_incomplete_data(pipe_name):
|
|||
assert doc[2].head.i == 1
|
||||
|
||||
|
||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
||||
@pytest.mark.parametrize("pipe_name", PARSERS)
|
||||
def test_overfitting_IO(pipe_name):
|
||||
fix_random_seed(0)
|
||||
# Simple test to try and quickly overfit the dependency parser (normal or beam)
|
||||
nlp = English()
|
||||
parser = nlp.add_pipe(pipe_name)
|
||||
|
@ -354,6 +359,7 @@ def test_overfitting_IO(pipe_name):
|
|||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||
for dep in annotations.get("deps", []):
|
||||
parser.add_label(dep)
|
||||
# train_examples = train_examples[:1]
|
||||
optimizer = nlp.initialize()
|
||||
# run overfitting
|
||||
for i in range(200):
|
||||
|
@ -395,6 +401,7 @@ def test_overfitting_IO(pipe_name):
|
|||
assert_equal(batch_deps_1, no_batch_deps)
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
def test_beam_parser_scores():
|
||||
# Test that we can get confidence values out of the beam_parser pipe
|
||||
beam_width = 16
|
||||
|
@ -433,6 +440,7 @@ def test_beam_parser_scores():
|
|||
assert 0 - eps <= head_score <= 1 + eps
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="no beam parser yet")
|
||||
def test_beam_overfitting_IO():
|
||||
# Simple test to try and quickly overfit the Beam dependency parser
|
||||
nlp = English()
|
||||
|
|
|
@ -255,7 +255,7 @@ cfg_string_multi = """
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
|
|
|
@ -122,33 +122,11 @@ width = ${components.tok2vec.model.width}
|
|||
|
||||
parser_config_string_upper = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 66
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 333
|
||||
depth = 4
|
||||
embed_size = 5555
|
||||
window_size = 1
|
||||
maxout_pieces = 7
|
||||
subword_features = false
|
||||
"""
|
||||
|
||||
|
||||
parser_config_string_no_upper = """
|
||||
[model]
|
||||
@architectures = "spacy.TransitionBasedParser.v2"
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 66
|
||||
maxout_pieces = 2
|
||||
use_upper = false
|
||||
|
||||
[model.tok2vec]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
|
@ -179,7 +157,6 @@ def my_parser():
|
|||
extra_state_tokens=True,
|
||||
hidden_width=65,
|
||||
maxout_pieces=5,
|
||||
use_upper=True,
|
||||
)
|
||||
return parser
|
||||
|
||||
|
@ -285,15 +262,14 @@ def test_serialize_custom_nlp():
|
|||
nlp.to_disk(d)
|
||||
nlp2 = spacy.load(d)
|
||||
model = nlp2.get_pipe("parser").model
|
||||
model.get_ref("tok2vec")
|
||||
# check that we have the correct settings, not the default ones
|
||||
assert model.get_ref("upper").get_dim("nI") == 65
|
||||
assert model.get_ref("lower").get_dim("nI") == 65
|
||||
assert model.get_ref("tok2vec") is not None
|
||||
assert model.has_param("lower_W")
|
||||
assert model.has_param("upper_W")
|
||||
assert model.has_param("lower_b")
|
||||
assert model.has_param("upper_b")
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
|
||||
)
|
||||
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
|
||||
def test_serialize_parser(parser_config_string):
|
||||
"""Create a non-default parser config to check nlp serializes it correctly"""
|
||||
nlp = English()
|
||||
|
@ -306,11 +282,11 @@ def test_serialize_parser(parser_config_string):
|
|||
nlp.to_disk(d)
|
||||
nlp2 = spacy.load(d)
|
||||
model = nlp2.get_pipe("parser").model
|
||||
model.get_ref("tok2vec")
|
||||
# check that we have the correct settings, not the default ones
|
||||
if model.attrs["has_upper"]:
|
||||
assert model.get_ref("upper").get_dim("nI") == 66
|
||||
assert model.get_ref("lower").get_dim("nI") == 66
|
||||
assert model.get_ref("tok2vec") is not None
|
||||
assert model.has_param("lower_W")
|
||||
assert model.has_param("upper_W")
|
||||
assert model.has_param("lower_b")
|
||||
assert model.has_param("upper_b")
|
||||
|
||||
|
||||
def test_config_nlp_roundtrip():
|
||||
|
@ -457,9 +433,7 @@ def test_config_auto_fill_extra_fields():
|
|||
load_model_from_config(nlp.config)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
|
||||
)
|
||||
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
|
||||
def test_config_validate_literal(parser_config_string):
|
||||
nlp = English()
|
||||
config = Config().from_str(parser_config_string)
|
||||
|
|
|
@ -5,10 +5,7 @@ from pathlib import Path
|
|||
from spacy.about import __version__ as spacy_version
|
||||
from spacy import util
|
||||
from spacy import prefer_gpu, require_gpu, require_cpu
|
||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||
from spacy.util import dot_to_object, SimpleFrozenList, import_file
|
||||
from spacy.util import to_ternary_int
|
||||
from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
|
||||
from thinc.api import Config, Optimizer, ConfigValidationError
|
||||
from thinc.api import set_current_ops
|
||||
from spacy.training.batchers import minibatch_by_words
|
||||
|
@ -81,32 +78,33 @@ def test_util_get_package_path(package):
|
|||
assert isinstance(path, Path)
|
||||
|
||||
|
||||
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
|
||||
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
|
||||
assert model.get_param("W").shape == (nF, nO, nP, nI)
|
||||
tensor = model.ops.alloc((10, nI))
|
||||
Y, get_dX = model.begin_update(tensor)
|
||||
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
|
||||
dY = model.ops.alloc((15, nO, nP))
|
||||
ids = model.ops.alloc((15, nF))
|
||||
ids[1, 2] = -1
|
||||
dY[1] = 1
|
||||
assert not model.has_grad("pad")
|
||||
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
|
||||
assert d_pad[0, 2, 0, 0] == 1.0
|
||||
ids.fill(0.0)
|
||||
dY.fill(0.0)
|
||||
dY[0] = 0
|
||||
ids[1, 2] = 0
|
||||
ids[1, 1] = -1
|
||||
ids[1, 0] = -1
|
||||
dY[1] = 1
|
||||
ids[2, 0] = -1
|
||||
dY[2] = 5
|
||||
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
|
||||
assert d_pad[0, 0, 0, 0] == 6
|
||||
assert d_pad[0, 1, 0, 0] == 1
|
||||
assert d_pad[0, 2, 0, 0] == 0
|
||||
# @pytest.mark.skip(reason="No precomputable affine")
|
||||
# def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
|
||||
# model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
|
||||
# assert model.get_param("W").shape == (nF, nO, nP, nI)
|
||||
# tensor = model.ops.alloc((10, nI))
|
||||
# Y, get_dX = model.begin_update(tensor)
|
||||
# assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
|
||||
# dY = model.ops.alloc((15, nO, nP))
|
||||
# ids = model.ops.alloc((15, nF))
|
||||
# ids[1, 2] = -1
|
||||
# dY[1] = 1
|
||||
# assert not model.has_grad("pad")
|
||||
# d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
|
||||
# assert d_pad[0, 2, 0, 0] == 1.0
|
||||
# ids.fill(0.0)
|
||||
# dY.fill(0.0)
|
||||
# dY[0] = 0
|
||||
# ids[1, 2] = 0
|
||||
# ids[1, 1] = -1
|
||||
# ids[1, 0] = -1
|
||||
# dY[1] = 1
|
||||
# ids[2, 0] = -1
|
||||
# dY[2] = 5
|
||||
# d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
|
||||
# assert d_pad[0, 0, 0, 0] == 6
|
||||
# assert d_pad[0, 1, 0, 0] == 1
|
||||
# assert d_pad[0, 2, 0, 0] == 0
|
||||
|
||||
|
||||
def test_prefer_gpu():
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
from collections.abc import Iterable as IterableInstance
|
||||
import warnings
|
||||
import numpy
|
||||
from murmurhash.mrmr cimport hash64
|
||||
|
||||
|
|
|
@ -552,18 +552,17 @@ for a Tok2Vec layer.
|
|||
|
||||
## Parser & NER architectures {#parser}
|
||||
|
||||
### spacy.TransitionBasedParser.v2 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
|
||||
### spacy.TransitionBasedParser.v3 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TransitionBasedParser.v2"
|
||||
> @architectures = "spacy.TransitionBasedParser.v3"
|
||||
> state_type = "ner"
|
||||
> extra_state_tokens = false
|
||||
> hidden_width = 64
|
||||
> maxout_pieces = 2
|
||||
> use_upper = true
|
||||
>
|
||||
> [model.tok2vec]
|
||||
> @architectures = "spacy.HashEmbedCNN.v2"
|
||||
|
@ -593,16 +592,15 @@ consists of either two or three subnetworks:
|
|||
state representation. If not present, the output from the lower model is used
|
||||
as action scores directly.
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
|
||||
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
|
||||
| `hidden_width` | The width of the hidden layer. ~~int~~ |
|
||||
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
|
||||
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
|
||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
|
||||
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
|
||||
| `hidden_width` | The width of the hidden layer. ~~int~~ |
|
||||
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ |
|
||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
||||
|
||||
<Accordion title="spacy.TransitionBasedParser.v1 definition" spaced>
|
||||
|
||||
|
|
|
@ -141,7 +141,7 @@ factory = "tok2vec"
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
|
@ -158,7 +158,7 @@ same. This makes them fully independent and doesn't require an upstream
|
|||
factory = "ner"
|
||||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2Vec.v2"
|
||||
|
@ -482,7 +482,7 @@ sneakily delegates to the `Transformer` pipeline component.
|
|||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
@architectures = "spacy.TransitionBasedParser.v3"
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
|
|
Loading…
Reference in New Issue
Block a user