Merge branch 'v4' into rename-islandic-and-multi-lang-code

Commit 292ccb798e
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=9.0.0.dev1,<9.1.0",
+    "thinc>=9.0.0.dev2,<9.1.0",
    "numpy>=1.15.0",
 ]
 build-backend = "setuptools.build_meta"
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.10,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=9.0.0.dev1,<9.1.0
+thinc>=9.0.0.dev2,<9.1.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.9.1,<1.2.0
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,7 +38,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=9.0.0.dev1,<9.1.0
+    thinc>=9.0.0.dev2,<9.1.0
     wasabi>=0.9.1,<1.2.0
     srsly>=2.4.3,<3.0.0
     catalogue>=2.0.6,<2.1.0
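
Aside: the same thinc pin appears in all three packaging files above and must
stay in sync. A throwaway consistency check (illustrative, not part of this
commit):

# Print every thinc pin found in the three packaging files.
import re

for path in ("pyproject.toml", "requirements.txt", "setup.cfg"):
    with open(path) as f:
        print(path, re.findall(r"thinc[><=!~][^\s\",]*", f.read()))
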
--- a/setup.py
+++ b/setup.py
@@ -33,12 +33,10 @@ MOD_NAMES = [
     "spacy.kb.candidate",
     "spacy.kb.kb",
     "spacy.kb.kb_in_memory",
-    "spacy.ml.parser_model",
+    "spacy.ml.tb_framework",
     "spacy.morphology",
-    "spacy.pipeline.dep_parser",
     "spacy.pipeline._edit_tree_internals.edit_trees",
     "spacy.pipeline.morphologizer",
-    "spacy.pipeline.ner",
     "spacy.pipeline.pipe",
     "spacy.pipeline.trainable_pipe",
     "spacy.pipeline.sentencizer",
@@ -46,6 +44,7 @@ MOD_NAMES = [
     "spacy.pipeline.tagger",
     "spacy.pipeline.transition_parser",
     "spacy.pipeline._parser_internals.arc_eager",
+    "spacy.pipeline._parser_internals.batch",
     "spacy.pipeline._parser_internals.ner",
     "spacy.pipeline._parser_internals.nonproj",
     "spacy.pipeline._parser_internals.search",
@@ -53,6 +52,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.pipeline._parser_internals._beam_utils",
+    "spacy.pipeline._parser_internals._parser_utils",
     "spacy.tokenizer",
     "spacy.training.align",
     "spacy.training.gold_io",
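
Aside: MOD_NAMES lists the modules that setup.py compiles with Cython, so
these hunks move the parser internals from spacy.ml.parser_model to the new
spacy.ml.tb_framework extension, while spacy.pipeline.dep_parser and
spacy.pipeline.ner leave the compiled list. A minimal sketch of how such a
list is typically turned into build targets; `make_extensions` is
illustrative, not spaCy's actual setup code:

from setuptools import Extension

def make_extensions(mod_names):
    # "spacy.ml.tb_framework" -> "spacy/ml/tb_framework.pyx"
    return [
        Extension(name, [name.replace(".", "/") + ".pyx"], language="c++")
        for name in mod_names
    ]
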
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.5.0"
+__version__ = "4.0.0.dev0"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -87,12 +87,11 @@ grad_factor = 1.0
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = false
 nO = null
 
 [components.parser.model.tok2vec]
@@ -108,12 +107,11 @@ grad_factor = 1.0
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = false
 nO = null
 
 [components.ner.model.tok2vec]
@@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "parser"
 
 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128
 maxout_pieces = 3
-use_upper = true
 nO = null
 
 [components.parser.model.tok2vec]
@@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
 factory = "ner"
 
 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 nO = null
 
 [components.ner.model.tok2vec]
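
Aside: taken together, the four hunks above define the config migration for
existing pipelines: switch the architecture name and delete the obsolete
setting. Consolidated before/after (values as in the parser block above):

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null

becomes

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
nO = null
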
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -208,6 +208,8 @@ class Warnings(metaclass=ErrorsWithCodes):
     W123 = ("Argument `enable` with value {enable} does not contain all values specified in the config option "
             "`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
 
+    W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
+
 
 class Errors(metaclass=ErrorsWithCodes):
     E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@@ -949,6 +951,10 @@ class Errors(metaclass=ErrorsWithCodes):
     E4000 = ("Expected a Doc as input, but got: '{type}'")
     E4001 = ("Expected input to be one of the following types: ({expected_types}), "
              "but got '{received_type}'")
+    E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
+    E4003 = ("Training examples for distillation must have the exact same tokens in the "
+             "reference and predicted docs.")
+    E4004 = ("Backprop is not supported when is_train is not set.")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
 OLD_MODEL_SHORTCUTS = {
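
Aside: W400 backs the compatibility shim added to spacy/ml/models/parser.py
later in this diff: v2 configs still load, but `use_upper` no longer has any
effect. A minimal sketch of that behaviour (names illustrative, not spaCy's
actual code):

import warnings

W400 = "`use_upper=False` is ignored, the upper layer is always enabled"

def transition_parser_v2_like(use_upper, **cfg):
    # The old flag is accepted for config compatibility but ignored.
    if not use_upper:
        warnings.warn(W400)
    return cfg  # stand-in for build_tb_parser_model(**cfg)
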
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -5,7 +5,6 @@ from .attrs cimport attr_id_t
 from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, LANG
 
 from .structs cimport LexemeC
-from .strings cimport StringStore
 from .vocab cimport Vocab
 
 
--- a/spacy/ml/_precomputable_affine.py
+++ /dev/null
@@ -1,164 +0,0 @@
-from thinc.api import Model, normal_init
-
-from ..util import registry
-
-
-@registry.layers("spacy.PrecomputableAffine.v1")
-def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
-    model = Model(
-        "precomputable_affine",
-        forward,
-        init=init,
-        dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
-        params={"W": None, "b": None, "pad": None},
-        attrs={"dropout_rate": dropout},
-    )
-    return model
-
-
-def forward(model, X, is_train):
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.get_param("W")
-    # Preallocate array for layer output, including padding.
-    Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
-    model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
-    Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
-
-    # Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
-    # change its shape to (nF, nO, nP) without breaking existing models. So
-    # we'll squeeze the first dimension here.
-    Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
-
-    def backward(dY_ids):
-        # This backprop is particularly tricky, because we get back a different
-        # thing from what we put out. We put out an array of shape:
-        # (nB, nF, nO, nP), and get back:
-        # (nB, nO, nP) and ids (nB, nF)
-        # The ids tell us the values of nF, so we would have:
-        #
-        # dYf = zeros((nB, nF, nO, nP))
-        # for b in range(nB):
-        #     for f in range(nF):
-        #         dYf[b, ids[b, f]] += dY[b]
-        #
-        # However, we avoid building that array for efficiency -- and just pass
-        # in the indices.
-        dY, ids = dY_ids
-        assert dY.ndim == 3
-        assert dY.shape[1] == nO, dY.shape
-        assert dY.shape[2] == nP, dY.shape
-        # nB = dY.shape[0]
-        model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
-        Xf = X[ids]
-        Xf = Xf.reshape((Xf.shape[0], nF * nI))
-
-        model.inc_grad("b", dY.sum(axis=0))
-        dY = dY.reshape((dY.shape[0], nO * nP))
-
-        Wopfi = W.transpose((1, 2, 0, 3))
-        Wopfi = Wopfi.reshape((nO * nP, nF * nI))
-        dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
-
-        dWopfi = model.ops.gemm(dY, Xf, trans1=True)
-        dWopfi = dWopfi.reshape((nO, nP, nF, nI))
-        # (o, p, f, i) --> (f, o, p, i)
-        dWopfi = dWopfi.transpose((2, 0, 1, 3))
-        model.inc_grad("W", dWopfi)
-        return dXf.reshape((dXf.shape[0], nF, nI))
-
-    return Yf, backward
-
-
-def _backprop_precomputable_affine_padding(model, dY, ids):
-    nB = dY.shape[0]
-    nF = model.get_dim("nF")
-    nP = model.get_dim("nP")
-    nO = model.get_dim("nO")
-    # Backprop the "padding", used as a filler for missing values.
-    # Values that are missing are set to -1, and each state vector could
-    # have multiple missing values. The padding has different values for
-    # different missing features. The gradient of the padding vector is:
-    #
-    # for b in range(nB):
-    #     for f in range(nF):
-    #         if ids[b, f] < 0:
-    #             d_pad[f] += dY[b]
-    #
-    # Which can be rewritten as:
-    #
-    # (ids < 0).T @ dY
-    mask = model.ops.asarray(ids < 0, dtype="f")
-    d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
-    return d_pad.reshape((1, nF, nO, nP))
-
-
-def init(model, X=None, Y=None):
-    """This is like the 'layer sequential unit variance', but instead
-    of taking the actual inputs, we randomly generate whitened data.
-
-    Why's this all so complicated? We have a huge number of inputs,
-    and the maxout unit makes guessing the dynamics tricky. Instead
-    we set the maxout weights to values that empirically result in
-    whitened outputs given whitened inputs.
-    """
-    if model.has_param("W") and model.get_param("W").any():
-        return
-
-    nF = model.get_dim("nF")
-    nO = model.get_dim("nO")
-    nP = model.get_dim("nP")
-    nI = model.get_dim("nI")
-    W = model.ops.alloc4f(nF, nO, nP, nI)
-    b = model.ops.alloc2f(nO, nP)
-    pad = model.ops.alloc4f(1, nF, nO, nP)
-
-    ops = model.ops
-    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
-    pad = normal_init(ops, pad.shape, mean=1.0)
-    model.set_param("W", W)
-    model.set_param("b", b)
-    model.set_param("pad", pad)
-
-    ids = ops.alloc((5000, nF), dtype="f")
-    ids += ops.xp.random.uniform(0, 1000, ids.shape)
-    ids = ops.asarray(ids, dtype="i")
-    tokvecs = ops.alloc((5000, nI), dtype="f")
-    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
-        tokvecs.shape
-    )
-
-    def predict(ids, tokvecs):
-        # nS ids. nW tokvecs. Exclude the padding array.
-        hiddens = model.predict(tokvecs[:-1])  # (nW, f, o, p)
-        vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
-        # need nS vectors
-        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
-        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
-        vectors = vectors.reshape((vectors.shape[0], nO, nP))
-        vectors += b
-        vectors = model.ops.asarray(vectors)
-        if nP >= 2:
-            return model.ops.maxout(vectors)[0]
-        else:
-            return vectors * (vectors >= 0)
-
-    tol_var = 0.01
-    tol_mean = 0.01
-    t_max = 10
-    W = model.get_param("W").copy()
-    b = model.get_param("b").copy()
-    for t_i in range(t_max):
-        acts1 = predict(ids, tokvecs)
-        var = model.ops.xp.var(acts1)
-        mean = model.ops.xp.mean(acts1)
-        if abs(var - 1.0) >= tol_var:
-            W /= model.ops.xp.sqrt(var)
-            model.set_param("W", W)
-        elif abs(mean) >= tol_mean:
-            b -= mean
-            model.set_param("b", b)
-        else:
-            break
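
Aside: the core trick of the deleted layer is to push all tokens through one
big GEMM up front, producing each token's contribution to every one of the nF
feature slots; building a state vector then reduces to a gather and a sum. A
NumPy sketch of that precomputation (shapes follow the deleted code; this is
not the spaCy implementation):

import numpy as np

nW, nI, nF, nO, nP = 10, 8, 3, 4, 2   # tokens, input, feats, output, pieces
X = np.random.randn(nW, nI).astype("f")
W = np.random.randn(nF * nO * nP, nI).astype("f")

# Precompute every token's contribution to every feature slot at once.
Yf = (X @ W.T).reshape(nW, nF, nO, nP)

# Per state: nF token ids (-1 = missing, handled by a learned padding row).
pad = np.zeros((nF, nO, nP), dtype="f")
ids = np.array([0, 2, -1])
state_vector = sum(pad[f] if ids[f] < 0 else Yf[ids[f], f] for f in range(nF))
assert state_vector.shape == (nO, nP)  # bias and maxout over nP follow
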
--- a/spacy/ml/callbacks.py
+++ b/spacy/ml/callbacks.py
@@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
     "update",
     "rehearse",
     "get_loss",
+    "get_teacher_student_loss",
     "initialize",
     "begin_update",
     "finish_update",
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -1,17 +1,20 @@
-from typing import Optional, List, cast
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
+from typing import Optional, List, Tuple, Any
 from thinc.types import Floats2d
+from thinc.api import Model
+import warnings
 
-from ...errors import Errors
+from ...errors import Errors, Warnings
 from ...compat import Literal
 from ...util import registry
-from .._precomputable_affine import PrecomputableAffine
 from ..tb_framework import TransitionModel
-from ...tokens import Doc
+from ...tokens.doc import Doc
 
+TransitionSystem = Any  # TODO
+State = Any  # TODO
+
 
-@registry.architectures("spacy.TransitionBasedParser.v2")
-def build_tb_parser_model(
+@registry.architectures.register("spacy.TransitionBasedParser.v2")
+def transition_parser_v2(
     tok2vec: Model[List[Doc], List[Floats2d]],
     state_type: Literal["parser", "ner"],
     extra_state_tokens: bool,
@@ -19,6 +22,46 @@ def build_tb_parser_model(
     maxout_pieces: int,
     use_upper: bool,
     nO: Optional[int] = None,
+) -> Model:
+    if not use_upper:
+        warnings.warn(Warnings.W400)
+
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+@registry.architectures.register("spacy.TransitionBasedParser.v3")
+def transition_parser_v3(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
+) -> Model:
+    return build_tb_parser_model(
+        tok2vec,
+        state_type,
+        extra_state_tokens,
+        hidden_width,
+        maxout_pieces,
+        nO=nO,
+    )
+
+
+def build_tb_parser_model(
+    tok2vec: Model[List[Doc], List[Floats2d]],
+    state_type: Literal["parser", "ner"],
+    extra_state_tokens: bool,
+    hidden_width: int,
+    maxout_pieces: int,
+    nO: Optional[int] = None,
 ) -> Model:
     """
     Build a transition-based parser model. Can apply to NER or dependency-parsing.
@@ -51,14 +94,7 @@ def build_tb_parser_model(
         feature sets (for the NER) or 13 (for the parser).
     hidden_width (int): The width of the hidden layer.
     maxout_pieces (int): How many pieces to use in the state prediction layer.
-        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
-        is replaced with a ReLu non-linearity if use_upper=True, and no
-        non-linearity if use_upper=False.
-    use_upper (bool): Whether to use an additional hidden layer after the state
-        vector in order to predict the action scores. It is recommended to set
-        this to False for large pretrained models such as transformers, and True
-        for smaller networks. The upper layer is computed on CPU, which becomes
-        a bottleneck on larger GPU-based models, where it's also less necessary.
+        Recommended values are 1, 2 or 3.
     nO (int or None): The number of actions the model will predict between.
         Usually inferred from data at the beginning of training, or loaded from
         disk.
@@ -69,106 +105,11 @@ def build_tb_parser_model(
         nr_feature_tokens = 6 if extra_state_tokens else 3
     else:
         raise ValueError(Errors.E917.format(value=state_type))
-    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-    tok2vec = chain(
-        tok2vec,
-        list2array(),
-        Linear(hidden_width, t2v_width),
+    return TransitionModel(
+        tok2vec=tok2vec,
+        state_tokens=nr_feature_tokens,
+        hidden_width=hidden_width,
+        maxout_pieces=maxout_pieces,
+        nO=nO,
+        unseen_classes=set(),
     )
-    tok2vec.set_dim("nO", hidden_width)
-    lower = _define_lower(
-        nO=hidden_width if use_upper else nO,
-        nF=nr_feature_tokens,
-        nI=tok2vec.get_dim("nO"),
-        nP=maxout_pieces,
-    )
-    upper = None
-    if use_upper:
-        with use_ops("cpu"):
-            # Initialize weights at zero, as it's a classification layer.
-            upper = _define_upper(nO=nO, nI=None)
-    return TransitionModel(tok2vec, lower, upper, resize_output)
-
-
-def _define_upper(nO, nI):
-    return Linear(nO=nO, nI=nI, init_W=zero_init)
-
-
-def _define_lower(nO, nF, nI, nP):
-    return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
-
-
-def resize_output(model, new_nO):
-    if model.attrs["has_upper"]:
-        return _resize_upper(model, new_nO)
-    return _resize_lower(model, new_nO)
-
-
-def _resize_upper(model, new_nO):
-    upper = model.get_ref("upper")
-    if upper.has_dim("nO") is None:
-        upper.set_dim("nO", new_nO)
-        return model
-    elif new_nO == upper.get_dim("nO"):
-        return model
-
-    smaller = upper
-    nI = smaller.maybe_get_dim("nI")
-    with use_ops("cpu"):
-        larger = _define_upper(nO=new_nO, nI=nI)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc2f(new_nO, nI)
-        larger_b = larger.ops.alloc1f(new_nO)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        # Weights are stored in (nr_out, nr_in) format, so we're basically
-        # just adding rows here.
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:old_nO] = smaller_W
-            larger_b[:old_nO] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-    model._layers[-1] = larger
-    model.set_ref("upper", larger)
-    return model
-
-
-def _resize_lower(model, new_nO):
-    lower = model.get_ref("lower")
-    if lower.has_dim("nO") is None:
-        lower.set_dim("nO", new_nO)
-        return model
-
-    smaller = lower
-    nI = smaller.maybe_get_dim("nI")
-    nF = smaller.maybe_get_dim("nF")
-    nP = smaller.maybe_get_dim("nP")
-    larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
-    # it could be that the model is not initialized yet, then skip this bit
-    if smaller.has_param("W"):
-        larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
-        larger_b = larger.ops.alloc2f(new_nO, nP)
-        larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
-        smaller_W = smaller.get_param("W")
-        smaller_b = smaller.get_param("b")
-        smaller_pad = smaller.get_param("pad")
-        # Copy the old weights and padding into the new layer
-        if smaller.has_dim("nO"):
-            old_nO = smaller.get_dim("nO")
-            larger_W[:, 0:old_nO, :, :] = smaller_W
-            larger_pad[:, :, 0:old_nO, :] = smaller_pad
-            larger_b[0:old_nO, :] = smaller_b
-            for i in range(old_nO, new_nO):
-                model.attrs["unseen_classes"].add(i)
-
-        larger.set_param("W", larger_W)
-        larger.set_param("b", larger_b)
-        larger.set_param("pad", larger_pad)
-    model._layers[1] = larger
-    model.set_ref("lower", larger)
-    return model
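
Aside: after this rewrite both architecture names resolve from the registry,
so existing v2 configs keep working. A sketch (assumes spaCy is installed;
the catalogue registries expose .get):

from spacy.util import registry

v2 = registry.architectures.get("spacy.TransitionBasedParser.v2")
v3 = registry.architectures.get("spacy.TransitionBasedParser.v3")
# v2(..., use_upper=False) warns W400 and builds the same model as v3(...).
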
--- a/spacy/ml/parser_model.pxd
+++ /dev/null
@@ -1,49 +0,0 @@
-from libc.string cimport memset, memcpy
-from thinc.backends.cblas cimport CBlas
-from ..typedefs cimport weight_t, hash_t
-from ..pipeline._parser_internals._state cimport StateC
-
-
-cdef struct SizesC:
-    int states
-    int classes
-    int hiddens
-    int pieces
-    int feats
-    int embed_width
-
-
-cdef struct WeightsC:
-    const float* feat_weights
-    const float* feat_bias
-    const float* hidden_bias
-    const float* hidden_weights
-    const float* seen_classes
-
-
-cdef struct ActivationsC:
-    int* token_ids
-    float* unmaxed
-    float* scores
-    float* hiddens
-    int* is_valid
-    int _curr_size
-    int _max_size
-
-
-cdef WeightsC get_c_weights(model) except *
-
-cdef SizesC get_c_sizes(model, int batch_size) except *
-
-cdef ActivationsC alloc_activations(SizesC n) nogil
-
-cdef void free_activations(const ActivationsC* A) nogil
-
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
-
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores, int O) nogil
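
Aside: the ActivationsC struct above implements a grow-only scratch buffer:
resize_activations (in the .pyx below) reallocates only when a batch needs
more states than any previous batch, so steady-state parsing does no
allocation. The same pattern in Python, as a sketch:

import numpy as np

class Activations:
    def __init__(self):
        self._max_size = 0
        self.scores = None

    def resize(self, n_states, n_classes):
        # Grow only; reuse the existing buffer for smaller batches.
        if n_states > self._max_size:
            self.scores = np.empty((n_states, n_classes), dtype="f")
            self._max_size = n_states
        self._curr_size = n_states
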
--- a/spacy/ml/parser_model.pyx
+++ /dev/null
@@ -1,500 +0,0 @@
-# cython: infer_types=True, cdivision=True, boundscheck=False
-cimport numpy as np
-from libc.math cimport exp
-from libc.string cimport memset, memcpy
-from libc.stdlib cimport calloc, free, realloc
-from thinc.backends.cblas cimport saxpy, sgemm
-
-import numpy
-import numpy.random
-from thinc.api import Model, CupyOps, NumpyOps, get_ops
-
-from .. import util
-from ..errors import Errors
-from ..typedefs cimport weight_t, class_t, hash_t
-from ..pipeline._parser_internals.stateclass cimport StateClass
-
-
-cdef WeightsC get_c_weights(model) except *:
-    cdef WeightsC output
-    cdef precompute_hiddens state2vec = model.state2vec
-    output.feat_weights = state2vec.get_feat_weights()
-    output.feat_bias = <const float*>state2vec.bias.data
-    cdef np.ndarray vec2scores_W
-    cdef np.ndarray vec2scores_b
-    if model.vec2scores is None:
-        output.hidden_weights = NULL
-        output.hidden_bias = NULL
-    else:
-        vec2scores_W = model.vec2scores.get_param("W")
-        vec2scores_b = model.vec2scores.get_param("b")
-        output.hidden_weights = <const float*>vec2scores_W.data
-        output.hidden_bias = <const float*>vec2scores_b.data
-    cdef np.ndarray class_mask = model._class_mask
-    output.seen_classes = <const float*>class_mask.data
-    return output
-
-
-cdef SizesC get_c_sizes(model, int batch_size) except *:
-    cdef SizesC output
-    output.states = batch_size
-    if model.vec2scores is None:
-        output.classes = model.state2vec.get_dim("nO")
-    else:
-        output.classes = model.vec2scores.get_dim("nO")
-    output.hiddens = model.state2vec.get_dim("nO")
-    output.pieces = model.state2vec.get_dim("nP")
-    output.feats = model.state2vec.get_dim("nF")
-    output.embed_width = model.tokvecs.shape[1]
-    return output
-
-
-cdef ActivationsC alloc_activations(SizesC n) nogil:
-    cdef ActivationsC A
-    memset(&A, 0, sizeof(A))
-    resize_activations(&A, n)
-    return A
-
-
-cdef void free_activations(const ActivationsC* A) nogil:
-    free(A.token_ids)
-    free(A.scores)
-    free(A.unmaxed)
-    free(A.hiddens)
-    free(A.is_valid)
-
-
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
-    if n.states <= A._max_size:
-        A._curr_size = n.states
-        return
-    if A._max_size == 0:
-        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
-        A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
-        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
-        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    else:
-        A.token_ids = <int*>realloc(A.token_ids,
-            n.states * n.feats * sizeof(A.token_ids[0]))
-        A.scores = <float*>realloc(A.scores,
-            n.states * n.classes * sizeof(A.scores[0]))
-        A.unmaxed = <float*>realloc(A.unmaxed,
-            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
-        A.hiddens = <float*>realloc(A.hiddens,
-            n.states * n.hiddens * sizeof(A.hiddens[0]))
-        A.is_valid = <int*>realloc(A.is_valid,
-            n.states * n.classes * sizeof(A.is_valid[0]))
-        A._max_size = n.states
-    A._curr_size = n.states
-
-
-cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
-        const WeightsC* W, SizesC n) nogil:
-    cdef double one = 1.0
-    resize_activations(A, n)
-    for i in range(n.states):
-        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
-    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
-    memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
-    sum_state_features(cblas, A.unmaxed,
-        W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
-    for i in range(n.states):
-        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
-        for j in range(n.hiddens):
-            index = i * n.hiddens * n.pieces + j * n.pieces
-            which = _arg_max(&A.unmaxed[index], n.pieces)
-            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
-    memset(A.scores, 0, n.states * n.classes * sizeof(float))
-    if W.hidden_weights == NULL:
-        memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
-    else:
-        # Compute hidden-to-output
-        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
-                     1.0, <const float *>A.hiddens, n.hiddens,
-                     <const float *>W.hidden_weights, n.hiddens,
-                     0.0, A.scores, n.classes)
-        # Add bias
-        for i in range(n.states):
-            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
-    # Set unseen classes to minimum value
-    i = 0
-    min_ = A.scores[0]
-    for i in range(1, n.states * n.classes):
-        if A.scores[i] < min_:
-            min_ = A.scores[i]
-    for i in range(n.states):
-        for j in range(n.classes):
-            if not W.seen_classes[j]:
-                A.scores[i*n.classes+j] = min_
-
-
-cdef void sum_state_features(CBlas cblas, float* output,
-        const float* cached, const int* token_ids, int B, int F, int O) nogil:
-    cdef int idx, b, f, i
-    cdef const float* feature
-    padding = cached
-    cached += F * O
-    cdef int id_stride = F*O
-    cdef float one = 1.
-    for b in range(B):
-        for f in range(F):
-            if token_ids[f] < 0:
-                feature = &padding[f*O]
-            else:
-                idx = token_ids[f] * id_stride + f*O
-                feature = &cached[idx]
-            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
-        token_ids += F
-
-
-cdef void cpu_log_loss(float* d_scores,
-        const float* costs, const int* is_valid, const float* scores,
-        int O) nogil:
-    """Do multi-label log loss"""
-    cdef double max_, gmax, Z, gZ
-    best = arg_max_if_gold(scores, costs, is_valid, O)
-    guess = _arg_max(scores, O)
-
-    if best == -1 or guess == -1:
-        # These shouldn't happen, but if they do, we want to make sure we don't
-        # cause an OOB access.
-        return
-    Z = 1e-10
-    gZ = 1e-10
-    max_ = scores[guess]
-    gmax = scores[best]
-    for i in range(O):
-        Z += exp(scores[i] - max_)
-        if costs[i] <= costs[best]:
-            gZ += exp(scores[i] - gmax)
-    for i in range(O):
-        if costs[i] <= costs[best]:
-            d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
-        else:
-            d_scores[i] = exp(scores[i]-max_) / Z
-
-
-cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
-        const int* is_valid, int n) nogil:
-    # Find minimum cost
-    cdef float cost = 1
-    for i in range(n):
-        if is_valid[i] and costs[i] < cost:
-            cost = costs[i]
-    # Now find best-scoring with that cost
-    cdef int best = -1
-    for i in range(n):
-        if costs[i] <= cost and is_valid[i]:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
-    cdef int best = -1
-    for i in range(n):
-        if is_valid[i] >= 1:
-            if best == -1 or scores[i] > scores[best]:
-                best = i
-    return best
-
-
-
-class ParserStepModel(Model):
-    def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
-            dropout=0.1):
-        Model.__init__(self, name="parser_step_model", forward=step_forward)
-        self.attrs["has_upper"] = has_upper
-        self.attrs["dropout_rate"] = dropout
-        self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
-        if layers[1].get_dim("nP") >= 2:
-            activation = "maxout"
-        elif has_upper:
-            activation = None
-        else:
-            activation = "relu"
-        self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
-                                            activation=activation, train=train)
-        if has_upper:
-            self.vec2scores = layers[-1]
-        else:
-            self.vec2scores = None
-        self.cuda_stream = util.get_cuda_stream(non_blocking=True)
-        self.backprops = []
-        self._class_mask = numpy.zeros((self.nO,), dtype='f')
-        self._class_mask.fill(1)
-        if unseen_classes is not None:
-            for class_ in unseen_classes:
-                self._class_mask[class_] = 0.
-
-    def clear_memory(self):
-        del self.tokvecs
-        del self.bp_tokvecs
-        del self.state2vec
-        del self.backprops
-        del self._class_mask
-
-    @property
-    def nO(self):
-        if self.attrs["has_upper"]:
-            return self.vec2scores.get_dim("nO")
-        else:
-            return self.state2vec.get_dim("nO")
-
-    def class_is_unseen(self, class_):
-        return self._class_mask[class_]
-
-    def mark_class_unseen(self, class_):
-        self._class_mask[class_] = 0
-
-    def mark_class_seen(self, class_):
-        self._class_mask[class_] = 1
-
-    def get_token_ids(self, states):
-        cdef StateClass state
-        states = [state for state in states if not state.is_final()]
-        cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
-                                          dtype='i', order='C')
-        ids.fill(-1)
-        c_ids = <int*>ids.data
-        for state in states:
-            state.c.set_context_tokens(c_ids, ids.shape[1])
-            c_ids += ids.shape[1]
-        return ids
-
-    def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
-        if isinstance(self.state2vec.ops, CupyOps) \
-        and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
-            # Move token_ids and d_vector to GPU, asynchronously
-            self.backprops.append((
-                util.get_async(self.cuda_stream, token_ids),
-                util.get_async(self.cuda_stream, d_vector),
-                get_d_tokvecs
-            ))
-        else:
-            self.backprops.append((token_ids, d_vector, get_d_tokvecs))
-
-
-    def finish_steps(self, golds):
-        # Add a padding vector to the d_tokvecs gradient, so that missing
-        # values don't affect the real gradient.
-        d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
-        # Tells CUDA to block, so our async copies complete.
-        if self.cuda_stream is not None:
-            self.cuda_stream.synchronize()
-        for ids, d_vector, bp_vector in self.backprops:
-            d_state_features = bp_vector((d_vector, ids))
-            ids = ids.flatten()
-            d_state_features = d_state_features.reshape(
-                (ids.size, d_state_features.shape[2]))
-            self.ops.scatter_add(d_tokvecs, ids,
-                d_state_features)
-        # Padded -- see update()
-        self.bp_tokvecs(d_tokvecs[:-1])
-        return d_tokvecs
-
-NUMPY_OPS = NumpyOps()
-
-def step_forward(model: ParserStepModel, states, is_train):
-    token_ids = model.get_token_ids(states)
-    vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
-    mask = None
-    if model.attrs["has_upper"]:
-        dropout_rate = model.attrs["dropout_rate"]
-        if is_train and dropout_rate > 0:
-            mask = NUMPY_OPS.get_dropout_mask(vector.shape, 0.1)
-            vector *= mask
-        scores, get_d_vector = model.vec2scores(vector, is_train)
-    else:
-        scores = NumpyOps().asarray(vector)
-        get_d_vector = lambda d_scores: d_scores
-    # If the class is unseen, make sure its score is minimum
-    scores[:, model._class_mask == 0] = numpy.nanmin(scores)
-
-    def backprop_parser_step(d_scores):
-        # Zero vectors for unseen classes
-        d_scores *= model._class_mask
-        d_vector = get_d_vector(d_scores)
-        if mask is not None:
-            d_vector *= mask
-        model.backprop_step(token_ids, d_vector, get_d_tokvecs)
-        return None
-    return scores, backprop_parser_step
-
-
-cdef class precompute_hiddens:
-    """Allow a model to be "primed" by pre-computing input features in bulk.
-
-    This is used for the parser, where we want to take a batch of documents,
-    and compute vectors for each (token, position) pair. These vectors can then
-    be reused, especially for beam-search.
-
-    Let's say we're using 12 features for each state, e.g. word at start of
-    buffer, three words on stack, their children, etc. In the normal arc-eager
-    system, a document of length N is processed in 2*N states. This means we'll
-    create 2*N*12 feature vectors --- but if we pre-compute, we only need
-    N*12 vector computations. The saving for beam-search is much better:
-    if we have a beam of k, we'll normally make 2*N*12*K computations --
-    so we can save the factor k. This also gives a nice CPU/GPU division:
-    we can do all our hard maths up front, packed into large multiplications,
-    and do the hard-to-program parsing on the CPU.
-    """
-    cdef readonly int nF, nO, nP
-    cdef bint _is_synchronized
-    cdef public object ops
-    cdef public object numpy_ops
-    cdef public object _cpu_ops
-    cdef np.ndarray _features
-    cdef np.ndarray _cached
-    cdef np.ndarray bias
-    cdef object _cuda_stream
-    cdef object _bp_hiddens
-    cdef object activation
-
-    def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
-                 activation="maxout", train=False):
-        gpu_cached, bp_features = lower_model(tokvecs, train)
-        cdef np.ndarray cached
-        if not isinstance(gpu_cached, numpy.ndarray):
-            # Note the passing of cuda_stream here: it lets
-            # cupy make the copy asynchronously.
-            # We then have to block before first use.
-            cached = gpu_cached.get(stream=cuda_stream)
-        else:
-            cached = gpu_cached
-        if not isinstance(lower_model.get_param("b"), numpy.ndarray):
-            self.bias = lower_model.get_param("b").get(stream=cuda_stream)
-        else:
-            self.bias = lower_model.get_param("b")
-        self.nF = cached.shape[1]
-        if lower_model.has_dim("nP"):
-            self.nP = lower_model.get_dim("nP")
-        else:
-            self.nP = 1
-        self.nO = cached.shape[2]
-        self.ops = lower_model.ops
-        self.numpy_ops = NumpyOps()
-        self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
-        assert activation in (None, "relu", "maxout")
-        self.activation = activation
-        self._is_synchronized = False
-        self._cuda_stream = cuda_stream
-        self._cached = cached
-        self._bp_hiddens = bp_features
-
-    cdef const float* get_feat_weights(self) except NULL:
-        if not self._is_synchronized and self._cuda_stream is not None:
-            self._cuda_stream.synchronize()
-            self._is_synchronized = True
-        return <float*>self._cached.data
-
-    def has_dim(self, name):
-        if name == "nF":
-            return self.nF if self.nF is not None else True
-        elif name == "nP":
-            return self.nP if self.nP is not None else True
-        elif name == "nO":
-            return self.nO if self.nO is not None else True
-        else:
-            return False
-
-    def get_dim(self, name):
-        if name == "nF":
-            return self.nF
-        elif name == "nP":
-            return self.nP
-        elif name == "nO":
-            return self.nO
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def set_dim(self, name, value):
-        if name == "nF":
-            self.nF = value
-        elif name == "nP":
-            self.nP = value
-        elif name == "nO":
-            self.nO = value
-        else:
-            raise ValueError(Errors.E1033.format(name=name))
-
-    def __call__(self, X, bint is_train):
-        if is_train:
-            return self.begin_update(X)
-        else:
-            return self.predict(X), lambda X: X
-
-    def predict(self, X):
-        return self.begin_update(X)[0]
-
-    def begin_update(self, token_ids):
-        cdef np.ndarray state_vector = numpy.zeros(
-            (token_ids.shape[0], self.nO, self.nP), dtype='f')
-        # This is tricky, but (assuming GPU available);
-        # - Input to forward on CPU
-        # - Output from forward on CPU
-        # - Input to backward on GPU!
-        # - Output from backward on GPU
-        bp_hiddens = self._bp_hiddens
-
-        cdef CBlas cblas = self._cpu_ops.cblas()
-
-        feat_weights = self.get_feat_weights()
-        cdef int[:, ::1] ids = token_ids
-        sum_state_features(cblas, <float*>state_vector.data,
-            feat_weights, &ids[0,0],
-            token_ids.shape[0], self.nF, self.nO*self.nP)
-        state_vector += self.bias
-        state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
-
-        def backward(d_state_vector_ids):
-            d_state_vector, token_ids = d_state_vector_ids
-            d_state_vector = bp_nonlinearity(d_state_vector)
-            d_tokens = bp_hiddens((d_state_vector, token_ids))
-            return d_tokens
-        return state_vector, backward
-
-    def _nonlinearity(self, state_vector):
-        if self.activation == "maxout":
-            return self._maxout_nonlinearity(state_vector)
-        else:
-            return self._relu_nonlinearity(state_vector)
-
-    def _maxout_nonlinearity(self, state_vector):
-        state_vector, mask = self.numpy_ops.maxout(state_vector)
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_maxout(d_best):
-            return self.ops.backprop_maxout(d_best, mask, self.nP)
-
-        return state_vector, backprop_maxout
-
-    def _relu_nonlinearity(self, state_vector):
-        state_vector = state_vector.reshape((state_vector.shape[0], -1))
-        mask = state_vector >= 0.
-        state_vector *= mask
-        # We're outputting to CPU, but we need this variable on GPU for the
-        # backward pass.
-        mask = self.ops.asarray(mask)
-
-        def backprop_relu(d_best):
-            d_best *= mask
-            return d_best.reshape((d_best.shape + (1,)))
-
-        return state_vector, backprop_relu
-
-cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
-    if n_classes == 2:
-        return 0 if scores[0] > scores[1] else 1
-    cdef int i
-    cdef int best = 0
-    cdef float mode = scores[0]
-    for i in range(1, n_classes):
-        if scores[i] > mode:
-            mode = scores[i]
-            best = i
-    return best
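
Aside: cpu_log_loss above computes the gradient of a cost-sensitive log loss:
a softmax over all action scores minus a softmax restricted to the actions
tied for minimum cost. A NumPy sketch of the same gradient (the validity
masking of the C version is omitted):

import numpy as np

def log_loss_grad(scores, costs):
    p = np.exp(scores - scores.max())
    p /= p.sum()                      # model distribution
    gold = costs <= costs.min()       # actions tied for minimum cost
    g = np.exp(scores - scores[gold].max()) * gold
    g /= g.sum()                      # "gold" distribution
    return p - g

d = log_loss_grad(np.array([2.0, 1.0, 0.5]), np.array([0.0, 1.0, 0.0]))
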
--- /dev/null
+++ b/spacy/ml/tb_framework.pxd
@@ -0,0 +1,28 @@
+from libc.stdint cimport int8_t
+
+
+cdef struct SizesC:
+    int states
+    int classes
+    int hiddens
+    int pieces
+    int feats
+    int embed_width
+    int tokens
+
+
+cdef struct WeightsC:
+    const float* feat_weights
+    const float* feat_bias
+    const float* hidden_bias
+    const float* hidden_weights
+    const int8_t* seen_mask
+
+
+cdef struct ActivationsC:
+    int* token_ids
+    float* unmaxed
+    float* hiddens
+    int* is_valid
+    int _curr_size
+    int _max_size
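
Aside: compared with the deleted parser_model.pxd, WeightsC swaps the float
seen_classes array for a compact int8 seen_mask, and ActivationsC loses its
scores buffer. One plausible reading of the mask, mirroring the unseen-class
handling in the deleted predict_states (a sketch, not the new implementation):

import numpy as np

scores = np.array([1.5, -0.2, 3.0], dtype="f")
seen_mask = np.array([1, 1, 0], dtype="i1")   # class 2 never seen in training
scores[seen_mask == 0] = scores.min()         # unseen classes can't win
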
--- a/spacy/ml/tb_framework.py
+++ /dev/null
@@ -1,50 +0,0 @@
-from thinc.api import Model, noop
-from .parser_model import ParserStepModel
-from ..util import registry
-
-
-@registry.layers("spacy.TransitionModel.v1")
-def TransitionModel(
-    tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
-):
-    """Set up a stepwise transition-based model"""
-    if upper is None:
-        has_upper = False
-        upper = noop()
-    else:
-        has_upper = True
-    # don't define nO for this object, because we can't dynamically change it
-    return Model(
-        name="parser_model",
-        forward=forward,
-        dims={"nI": tok2vec.maybe_get_dim("nI")},
-        layers=[tok2vec, lower, upper],
-        refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
-        init=init,
-        attrs={
-            "has_upper": has_upper,
-            "unseen_classes": set(unseen_classes),
-            "resize_output": resize_output,
-        },
-    )
-
-
-def forward(model, X, is_train):
-    step_model = ParserStepModel(
-        X,
-        model.layers,
-        unseen_classes=model.attrs["unseen_classes"],
-        train=is_train,
-        has_upper=model.attrs["has_upper"],
-    )
-
-    return step_model, step_model.finish_steps
-
-
-def init(model, X=None, Y=None):
-    model.get_ref("tok2vec").initialize(X=X)
-    lower = model.get_ref("lower")
-    lower.initialize()
-    if model.attrs["has_upper"]:
-        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
-        model.get_ref("upper").initialize(X=statevecs)
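
Aside: the deleted stepwise layer above is superseded by the keyword-only
"spacy.TransitionModel.v2" layer registered in the new tb_framework.pyx below.
A construction sketch (assumes spaCy is installed; the tok2vec argument is
omitted):

from spacy.util import registry

make_model = registry.layers.get("spacy.TransitionModel.v2")
# model = make_model(tok2vec=..., state_tokens=6, hidden_width=64,
#                    maxout_pieces=2)  # nO is inferred later via resize_output
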
621
spacy/ml/tb_framework.pyx
Normal file
621
spacy/ml/tb_framework.pyx
Normal file
|
@ -0,0 +1,621 @@
|
||||||
|
# cython: infer_types=True, cdivision=True, boundscheck=False
|
||||||
|
from typing import List, Tuple, Any, Optional, TypeVar, cast
|
||||||
|
from libc.string cimport memset, memcpy
|
||||||
|
from libc.stdlib cimport calloc, free, realloc
|
||||||
|
from libcpp.vector cimport vector
|
||||||
|
import numpy
|
||||||
|
cimport numpy as np
|
||||||
|
from thinc.api import Model, normal_init, chain, list2array, Linear
|
||||||
|
from thinc.api import uniform_init, glorot_uniform_init, zero_init
|
||||||
|
from thinc.api import NumpyOps
|
||||||
|
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
|
||||||
|
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
|
||||||
|
from thinc.types import Ints1d, Ints2d
|
||||||
|
|
||||||
|
from ..errors import Errors
|
||||||
|
from ..pipeline._parser_internals import _beam_utils
|
||||||
|
from ..pipeline._parser_internals.batch import GreedyBatch
|
||||||
|
from ..pipeline._parser_internals._parser_utils cimport arg_max
|
||||||
|
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
|
||||||
|
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
|
||||||
|
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
|
||||||
|
from ..tokens.doc import Doc
|
||||||
|
from ..util import registry
|
||||||
|
|
||||||
|
|
||||||
|
State = Any # TODO
|
||||||
|
|
||||||
|
|
||||||
|
@registry.layers("spacy.TransitionModel.v2")
|
||||||
|
def TransitionModel(
|
||||||
|
*,
|
||||||
|
tok2vec: Model[List[Doc], List[Floats2d]],
|
||||||
|
beam_width: int = 1,
|
||||||
|
beam_density: float = 0.0,
|
||||||
|
state_tokens: int,
|
||||||
|
hidden_width: int,
|
||||||
|
maxout_pieces: int,
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
unseen_classes=set(),
|
||||||
|
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
|
||||||
|
"""Set up a transition-based parsing model, using a maxout hidden
|
||||||
|
layer and a linear output layer.
|
||||||
|
"""
|
||||||
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
|
tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore
|
||||||
|
tok2vec_projected.set_dim("nO", hidden_width)
|
||||||
|
|
||||||
|
# FIXME: we use `output` as a container for the output layer's
|
||||||
|
# weights and biases. Thinc optimizers cannot handle resizing
|
||||||
|
# of parameters. So, when the parser model is resized, we
|
||||||
|
# construct a new `output` layer, which has a different key in
|
||||||
|
# the optimizer. Once the optimizer supports parameter resizing,
|
||||||
|
# we can replace the `output` layer by `output_W` and `output_b`
|
||||||
|
# parameters in this model.
|
||||||
|
output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
|
||||||
|
|
||||||
|
return Model(
|
||||||
|
name="parser_model",
|
||||||
|
forward=forward,
|
||||||
|
init=init,
|
||||||
|
layers=[tok2vec_projected, output],
|
||||||
|
refs={
|
||||||
|
"tok2vec": tok2vec_projected,
|
||||||
|
"output": output,
|
||||||
|
},
|
||||||
|
params={
|
||||||
|
"hidden_W": None, # Floats2d W for the hidden layer
|
||||||
|
"hidden_b": None, # Floats1d bias for the hidden layer
|
||||||
|
"hidden_pad": None, # Floats1d padding for the hidden layer
|
||||||
|
},
|
||||||
|
dims={
|
||||||
|
"nO": None, # Output size
|
||||||
|
"nP": maxout_pieces,
|
||||||
|
"nH": hidden_width,
|
||||||
|
"nI": tok2vec_projected.maybe_get_dim("nO"),
|
||||||
|
"nF": state_tokens,
|
||||||
|
},
|
||||||
|
attrs={
|
||||||
|
"beam_width": beam_width,
|
||||||
|
"beam_density": beam_density,
|
||||||
|
"unseen_classes": set(unseen_classes),
|
||||||
|
"resize_output": resize_output,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def resize_output(model: Model, new_nO: int) -> Model:
|
||||||
|
old_nO = model.maybe_get_dim("nO")
|
||||||
|
output = model.get_ref("output")
|
||||||
|
if old_nO is None:
|
||||||
|
model.set_dim("nO", new_nO)
|
||||||
|
output.set_dim("nO", new_nO)
|
||||||
|
output.initialize()
|
||||||
|
return model
|
||||||
|
elif new_nO <= old_nO:
|
||||||
|
return model
|
||||||
|
elif output.has_param("W"):
|
||||||
|
nH = model.get_dim("nH")
|
||||||
|
new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
|
||||||
|
new_output.initialize()
|
||||||
|
new_W = new_output.get_param("W")
|
||||||
|
new_b = new_output.get_param("b")
|
||||||
|
old_W = output.get_param("W")
|
||||||
|
old_b = output.get_param("b")
|
||||||
|
new_W[:old_nO] = old_W # type: ignore
|
||||||
|
new_b[:old_nO] = old_b # type: ignore
|
||||||
|
for i in range(old_nO, new_nO):
|
||||||
|
model.attrs["unseen_classes"].add(i)
|
||||||
|
model.layers[-1] = new_output
|
||||||
|
model.set_ref("output", new_output)
|
||||||
|
# TODO: Avoid this private intrusion
|
||||||
|
model._dims["nO"] = new_nO
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def init(
|
||||||
|
model,
|
||||||
|
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
|
||||||
|
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
|
||||||
|
):
|
||||||
|
if X is not None:
|
||||||
|
docs, moves = X
|
||||||
|
model.get_ref("tok2vec").initialize(X=docs)
|
||||||
|
else:
|
||||||
|
model.get_ref("tok2vec").initialize()
|
||||||
|
inferred_nO = _infer_nO(Y)
|
||||||
|
if inferred_nO is not None:
|
||||||
|
current_nO = model.maybe_get_dim("nO")
|
||||||
|
if current_nO is None or current_nO != inferred_nO:
|
||||||
|
model.attrs["resize_output"](model, inferred_nO)
|
||||||
|
nO = model.get_dim("nO")
|
||||||
|
nP = model.get_dim("nP")
|
||||||
|
nH = model.get_dim("nH")
|
||||||
|
nI = model.get_dim("nI")
|
||||||
|
nF = model.get_dim("nF")
|
||||||
|
ops = model.ops
|
||||||
|
|
||||||
|
Wl = ops.alloc2f(nH * nP, nF * nI)
|
||||||
|
bl = ops.alloc1f(nH * nP)
|
||||||
|
padl = ops.alloc1f(nI)
|
||||||
|
# Wl = zero_init(ops, Wl.shape)
|
||||||
|
Wl = glorot_uniform_init(ops, Wl.shape)
|
||||||
|
padl = uniform_init(ops, padl.shape) # type: ignore
|
||||||
|
# TODO: Experiment with whether better to initialize output_W
|
||||||
|
model.set_param("hidden_W", Wl)
|
||||||
|
model.set_param("hidden_b", bl)
|
||||||
|
model.set_param("hidden_pad", padl)
|
||||||
|
# model = _lsuv_init(model)
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
class TransitionModelInputs:
    """
    Input to transition model.
    """

    # dataclass annotation is not yet supported in Cython 0.29.x,
    # so, we'll do something close to it.

    actions: Optional[List[Ints1d]]
    docs: List[Doc]
    max_moves: int
    moves: TransitionSystem
    states: Optional[List[State]]

    __slots__ = [
        "actions",
        "docs",
        "max_moves",
        "moves",
        "states",
    ]

    def __init__(
        self,
        docs: List[Doc],
        moves: TransitionSystem,
        actions: Optional[List[Ints1d]]=None,
        max_moves: int=0,
        states: Optional[List[State]]=None):
        """
        actions (Optional[List[Ints1d]]): actions to apply for each Doc.
        docs (List[Doc]): Docs to predict transition sequences for.
        max_moves (int): the maximum number of moves to apply, values less
            than 1 will apply moves to states until they are final states.
        moves (TransitionSystem): the transition system to use when predicting
            the transition sequences.
        states (Optional[List[State]]): the initial states to predict the
            transition sequences for. When absent, the initial states are
            initialized from the provided Docs.
        """
        self.actions = actions
        self.docs = docs
        self.moves = moves
        self.max_moves = max_moves
        self.states = states

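For illustration, a minimal sketch of driving the model with these inputs for
greedy inference; the `nlp` pipeline and the "parser" component name are
assumptions, not part of this commit:

    parser = nlp.get_pipe("parser")
    docs = [nlp.make_doc("She ate the pizza")]
    inputs = TransitionModelInputs(docs=docs, moves=parser.moves)
    (states, scores), _ = parser.model(inputs, is_train=False)
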
def forward(model, inputs: TransitionModelInputs, is_train: bool):
    docs = inputs.docs
    moves = inputs.moves
    actions = inputs.actions

    beam_width = model.attrs["beam_width"]
    hidden_pad = model.get_param("hidden_pad")
    tok2vec = model.get_ref("tok2vec")

    states = moves.init_batch(docs) if inputs.states is None else inputs.states
    tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
    tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
    feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
    seen_mask = _get_seen_mask(model)

    if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
        # Note: max_moves is only used during training, so we don't need to
        # pass it to the greedy inference path.
        return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
    else:
        return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
                                 feats, backprop_feats, seen_mask, is_train, actions=actions,
                                 max_moves=inputs.max_moves)

def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
                        np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
    cdef vector[StateC*] c_states
    cdef StateClass state
    for state in states:
        if not state.is_final():
            c_states.push_back(state.c)
    weights = _get_c_weights(model, <float*>feats.data, seen_mask)
    # Precomputed features have rows for each token, plus one for padding.
    cdef int n_tokens = feats.shape[0] - 1
    sizes = _get_c_sizes(model, c_states.size(), n_tokens)
    cdef CBlas cblas = model.ops.cblas()
    scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)

    def backprop(dY):
        raise ValueError(Errors.E4004)

    return (states, scores), backprop

cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
                       WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
    cdef int i, j
    cdef vector[StateC*] unfinished
    cdef ActivationsC activations = _alloc_activations(sizes)
    cdef np.ndarray step_scores
    cdef np.ndarray step_actions

    scores = []
    while sizes.states >= 1:
        step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
        step_actions = actions[0] if actions is not None else None
        with nogil:
            _predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
            if actions is None:
                # Validate actions, argmax, take action.
                c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
                                   sizes.states)
            else:
                c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
        for i in range(sizes.states):
            if not states[i].is_final():
                unfinished.push_back(states[i])
        for i in range(unfinished.size()):
            states[i] = unfinished[i]
        sizes.states = unfinished.size()
        scores.append(step_scores)
        unfinished.clear()
        actions = actions[1:] if actions is not None else None
    _free_activations(&activations)

    return scores

def _forward_fallback(
    model: Model,
    moves: TransitionSystem,
    states: List[StateClass],
    tokvecs, backprop_tok2vec,
    feats,
    backprop_feats,
    seen_mask,
    is_train: bool,
    actions: Optional[List[Ints1d]]=None,
    max_moves: int=0):
    nF = model.get_dim("nF")
    output = model.get_ref("output")
    hidden_b = model.get_param("hidden_b")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")

    beam_width = model.attrs["beam_width"]
    beam_density = model.attrs["beam_density"]

    ops = model.ops

    all_ids = []
    all_which = []
    all_statevecs = []
    all_scores = []
    if beam_width == 1:
        batch = GreedyBatch(moves, states, None)
    else:
        batch = _beam_utils.BeamBatch(
            moves, states, None, width=beam_width, density=beam_density
        )
    arange = ops.xp.arange(nF)
    n_moves = 0
    while not batch.is_done:
        ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
        for i, state in enumerate(batch.get_unfinished_states()):
            state.set_context_tokens(ids, i, nF)
        # Sum the state features, add the bias and apply the activation (maxout)
        # to create the state vectors.
        preacts2f = feats[ids, arange].sum(axis=1)  # type: ignore
        preacts2f += hidden_b
        preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
        assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
        statevecs, which = ops.maxout(preacts)
        # We don't use output's backprop, since we want to backprop for
        # all states at once, rather than a single state.
        scores = output.predict(statevecs)
        scores[:, seen_mask] = ops.xp.nanmin(scores)
        # Transition the states, filtering out any that are finished.
        cpu_scores = ops.to_numpy(scores)
        if actions is None:
            batch.advance(cpu_scores)
        else:
            batch.advance_with_actions(actions[0])
            actions = actions[1:]
        all_scores.append(scores)
        if is_train:
            # Remember intermediate results for the backprop.
            all_ids.append(ids)
            all_statevecs.append(statevecs)
            all_which.append(which)
        if n_moves >= max_moves >= 1:
            break
        n_moves += 1

    def backprop_parser(d_states_d_scores):
        ids = ops.xp.vstack(all_ids)
        which = ops.xp.vstack(all_which)
        statevecs = ops.xp.vstack(all_statevecs)
        _, d_scores = d_states_d_scores
        if model.attrs.get("unseen_classes"):
            # If we have a negative gradient (i.e. the probability should
            # increase) on any classes we filtered out as unseen, mark
            # them as seen.
            for clas in set(model.attrs["unseen_classes"]):
                if (d_scores[:, clas] < 0).any():
                    model.attrs["unseen_classes"].remove(clas)
        d_scores *= seen_mask == False
        # Calculate the gradients for the parameters of the output layer.
        # The weight gemm is (nS, nO) @ (nS, nH).T
        output.inc_grad("b", d_scores.sum(axis=0))
        output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
        # Now calculate d_statevecs, by backproping through the output linear layer.
        # This gemm is (nS, nO) @ (nO, nH)
        output_W = output.get_param("W")
        d_statevecs = ops.gemm(d_scores, output_W)
        # Backprop through the maxout activation
        d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
        d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
        model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
        # We don't need to backprop the summation, because we pass back the IDs instead
        d_state_features = backprop_feats((d_preacts2f, ids))
        d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
        ops.scatter_add(d_tokvecs, ids, d_state_features)
        model.inc_grad("hidden_pad", d_tokvecs[-1])
        return (backprop_tok2vec(d_tokvecs[:-1]), None)

    return (list(batch), all_scores), backprop_parser

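The gemm comments above compress some shape bookkeeping; a standalone numpy
sketch of the output-layer gradients under assumed toy sizes (nS=4 states,
nO=3 classes, nH=2 hidden; illustrative only, not part of this commit):

    import numpy as np

    d_scores = np.random.rand(4, 3).astype("f")   # (nS, nO)
    statevecs = np.random.rand(4, 2).astype("f")  # (nS, nH)
    output_W = np.random.rand(3, 2).astype("f")   # (nO, nH)
    dW = d_scores.T @ statevecs        # (nO, nH), the inc_grad("W", ...) term
    db = d_scores.sum(axis=0)          # (nO,), the inc_grad("b", ...) term
    d_statevecs = d_scores @ output_W  # (nS, nH), input to backprop_maxout
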
def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
    mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
    for class_ in model.attrs.get("unseen_classes", set()):
        mask[class_] = True
    return mask

def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
    W: Floats2d = model.get_param("hidden_W")
    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    # The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH * nP, nI)
    W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
    W3f = W3f.transpose((1, 0, 2))
    W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
    assert X.shape == (X.shape[0], nI), X.shape
    Yf_ = model.ops.gemm(X, W2f, trans2=True)
    Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)

    def backward(dY_ids: Tuple[Floats3d, Ints2d]):
        # This backprop is particularly tricky, because we get back a different
        # thing from what we put out. We put out an array of shape:
        # (nB, nF, nH, nP), and get back:
        # (nB, nH, nP) and ids (nB, nF)
        # The ids tell us the values of nF, so we would have:
        #
        # dYf = zeros((nB, nF, nH, nP))
        # for b in range(nB):
        #     for f in range(nF):
        #         dYf[b, ids[b, f]] += dY[b]
        #
        # However, we avoid building that array for efficiency -- and just pass
        # in the indices.
        dY, ids = dY_ids
        dXf = model.ops.gemm(dY, W)
        Xf = X[ids].reshape((ids.shape[0], -1))
        dW = model.ops.gemm(dY, Xf, trans1=True)
        model.inc_grad("hidden_W", dW)
        return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)

    return Yf, backward

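The precomputation trick above is easier to see on toy data; a standalone
numpy sketch under assumed toy dimensions (not part of this commit):

    import numpy as np

    nT, nF, nI, nH, nP = 10, 3, 4, 2, 2       # tokens, feats, in, hidden, pieces
    tokvecs = np.random.rand(nT + 1, nI)      # +1 row of padding
    W2f = np.random.rand(nF * nH * nP, nI)
    # One big matmul up front, covering every (token, feature) pair...
    feats = (tokvecs @ W2f.T).reshape(nT + 1, nF, nH * nP)
    # ...so each parse step is only a gather and a sum, with no matmul:
    ids = np.array([[0, 3, 5]])               # one state's context tokens
    preacts = feats[ids, np.arange(nF)].sum(axis=1)  # shape (1, nH * nP)
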
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
    if Y is None:
        return None
    _, scores = Y
    if len(scores) == 0:
        return None
    assert scores[0].shape[0] >= 1
    assert len(scores[0].shape) == 2
    return scores[0].shape[1]

def _lsuv_init(model: Model):
    """This is like the 'layer sequential unit variance', but instead
    of taking the actual inputs, we randomly generate whitened data.

    Why's this all so complicated? We have a huge number of inputs,
    and the maxout unit makes guessing the dynamics tricky. Instead
    we set the maxout weights to values that empirically result in
    whitened outputs given whitened inputs.
    """
    W = model.maybe_get_param("hidden_W")
    if W is not None and W.any():
        return

    nF = model.get_dim("nF")
    nH = model.get_dim("nH")
    nP = model.get_dim("nP")
    nI = model.get_dim("nI")
    W = model.ops.alloc4f(nF, nH, nP, nI)
    b = model.ops.alloc2f(nH, nP)
    pad = model.ops.alloc4f(1, nF, nH, nP)

    ops = model.ops
    W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
    pad = normal_init(ops, pad.shape, mean=1.0)
    model.set_param("W", W)
    model.set_param("b", b)
    model.set_param("pad", pad)

    ids = ops.alloc_f((5000, nF), dtype="f")
    ids += ops.xp.random.uniform(0, 1000, ids.shape)
    ids = ops.asarray(ids, dtype="i")
    tokvecs = ops.alloc_f((5000, nI), dtype="f")
    tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
        tokvecs.shape
    )

    def predict(ids, tokvecs):
        # nS ids. nW tokvecs. Exclude the padding array.
        hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
        vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
        # need nS vectors
        hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
        model.ops.scatter_add(vectors, ids.flatten(), hiddens)
        vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
        vectors3f += b
        return model.ops.maxout(vectors3f)[0]

    tol_var = 0.01
    tol_mean = 0.01
    t_max = 10
    W = cast(Floats4d, model.get_param("hidden_W").copy())
    b = cast(Floats2d, model.get_param("hidden_b").copy())
    for t_i in range(t_max):
        acts1 = predict(ids, tokvecs)
        var = model.ops.xp.var(acts1)
        mean = model.ops.xp.mean(acts1)
        if abs(var - 1.0) >= tol_var:
            W /= model.ops.xp.sqrt(var)
            model.set_param("hidden_W", W)
        elif abs(mean) >= tol_mean:
            b -= mean
            model.set_param("hidden_b", b)
        else:
            break
    return model

cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
    output = model.get_ref("output")
    cdef np.ndarray hidden_b = model.get_param("hidden_b")
    cdef np.ndarray output_W = output.get_param("W")
    cdef np.ndarray output_b = output.get_param("b")

    cdef WeightsC weights
    weights.feat_weights = feats
    weights.feat_bias = <const float*>hidden_b.data
    weights.hidden_weights = <const float*>output_W.data
    weights.hidden_bias = <const float*>output_b.data
    weights.seen_mask = <const int8_t*>seen_mask.data

    return weights

cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
    cdef SizesC sizes
    sizes.states = batch_size
    sizes.classes = model.get_dim("nO")
    sizes.hiddens = model.get_dim("nH")
    sizes.pieces = model.get_dim("nP")
    sizes.feats = model.get_dim("nF")
    sizes.embed_width = model.get_dim("nI")
    sizes.tokens = tokens
    return sizes

cdef ActivationsC _alloc_activations(SizesC n) nogil:
    cdef ActivationsC A
    memset(&A, 0, sizeof(A))
    _resize_activations(&A, n)
    return A

cdef void _free_activations(const ActivationsC* A) nogil:
    free(A.token_ids)
    free(A.unmaxed)
    free(A.hiddens)
    free(A.is_valid)

cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
    if n.states <= A._max_size:
        A._curr_size = n.states
        return
    if A._max_size == 0:
        A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
        A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
        A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
        A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
        A._max_size = n.states
    else:
        A.token_ids = <int*>realloc(A.token_ids,
            n.states * n.feats * sizeof(A.token_ids[0]))
        A.unmaxed = <float*>realloc(A.unmaxed,
            n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
        A.hiddens = <float*>realloc(A.hiddens,
            n.states * n.hiddens * sizeof(A.hiddens[0]))
        A.is_valid = <int*>realloc(A.is_valid,
            n.states * n.classes * sizeof(A.is_valid[0]))
        A._max_size = n.states
    A._curr_size = n.states

cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
    _resize_activations(A, n)
    for i in range(n.states):
        states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
    memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
    _sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
    for i in range(n.states):
        saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
        for j in range(n.hiddens):
            index = i * n.hiddens * n.pieces + j * n.pieces
            which = arg_max(&A.unmaxed[index], n.pieces)
            A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
    if W.hidden_weights == NULL:
        memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
    else:
        # Compute hidden-to-output
        sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
                     1.0, <const float *>A.hiddens, n.hiddens,
                     <const float *>W.hidden_weights, n.hiddens,
                     0.0, scores, n.classes)
        # Add bias
        for i in range(n.states):
            saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
    # Set unseen classes to minimum value
    i = 0
    min_ = scores[0]
    for i in range(1, n.states * n.classes):
        if scores[i] < min_:
            min_ = scores[i]
    for i in range(n.states):
        for j in range(n.classes):
            if W.seen_mask[j]:
                scores[i*n.classes+j] = min_

cdef void _sum_state_features(CBlas cblas, float* output,
        const float* cached, const int* token_ids, SizesC n) nogil:
    cdef int idx, b, f, i
    cdef const float* feature
    cdef int B = n.states
    cdef int O = n.hiddens * n.pieces
    cdef int F = n.feats
    cdef int T = n.tokens
    padding = cached + (T * F * O)
    cdef int id_stride = F*O
    cdef float one = 1.
    for b in range(B):
        for f in range(F):
            if token_ids[f] < 0:
                feature = &padding[f*O]
            else:
                idx = token_ids[f] * id_stride + f*O
                feature = &cached[idx]
            saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
        token_ids += F

@@ -7,6 +7,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
 from ...typedefs cimport hash_t, class_t
 from .transition_system cimport TransitionSystem, Transition
 from ...errors import Errors
+from .batch cimport Batch
 from .search cimport Beam, MaxViolation
 from .search import MaxViolation
 from .stateclass cimport StateC, StateClass
@@ -26,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
     return state.is_final()
 
 
-cdef class BeamBatch(object):
+cdef class BeamBatch(Batch):
     cdef public TransitionSystem moves
     cdef public object states
    cdef public object docs

spacy/pipeline/_parser_internals/_parser_utils.pxd (new file)
@@ -0,0 +1,2 @@
cdef int arg_max(const float* scores, const int n_classes) nogil
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil

spacy/pipeline/_parser_internals/_parser_utils.pyx (new file)
@@ -0,0 +1,22 @@
# cython: infer_types=True

cdef inline int arg_max(const float* scores, const int n_classes) nogil:
    if n_classes == 2:
        return 0 if scores[0] > scores[1] else 1
    cdef int i
    cdef int best = 0
    cdef float mode = scores[0]
    for i in range(1, n_classes):
        if scores[i] > mode:
            mode = scores[i]
            best = i
    return best


cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
    cdef int best = -1
    for i in range(n):
        if is_valid[i] >= 1:
            if best == -1 or scores[i] > scores[best]:
                best = i
    return best

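For intuition, arg_max_if_valid is a masked argmax; a rough numpy equivalent
(illustrative only, with assumed toy values):

    import numpy as np

    scores = np.array([0.2, 0.9, 0.4], dtype="f")
    is_valid = np.array([1, 0, 1], dtype="i")
    best = int(np.where(is_valid >= 1, scores, -np.inf).argmax())  # -> 2
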
@@ -6,7 +6,6 @@ cimport libcpp
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from libcpp.set cimport set
-from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from murmurhash.mrmr cimport hash64
 
 from ...vocab cimport EMPTY_LEXEME
@@ -26,7 +25,7 @@ cdef struct ArcC:
 
 
 cdef cppclass StateC:
-    int* _heads
+    vector[int] _heads
     const TokenC* _sent
     vector[int] _stack
     vector[int] _rebuffer
@@ -34,31 +33,34 @@ cdef cppclass StateC:
     unordered_map[int, vector[ArcC]] _left_arcs
     unordered_map[int, vector[ArcC]] _right_arcs
     vector[libcpp.bool] _unshiftable
+    vector[int] history
     set[int] _sent_starts
     TokenC _empty_token
     int length
     int offset
     int _b_i
 
-    __init__(const TokenC* sent, int length) nogil:
+    __init__(const TokenC* sent, int length) nogil except +:
+        this._heads.resize(length, -1)
+        this._unshiftable.resize(length, False)
+
+        # Reserve memory ahead of time to minimize allocations during parsing.
+        # The initial capacity set here ideally reflects the expected average-case/majority usage.
+        cdef int init_capacity = 32
+        this._stack.reserve(init_capacity)
+        this._rebuffer.reserve(init_capacity)
+        this._ents.reserve(init_capacity)
+        this._left_arcs.reserve(init_capacity)
+        this._right_arcs.reserve(init_capacity)
+        this.history.reserve(init_capacity)
+
         this._sent = sent
-        this._heads = <int*>calloc(length, sizeof(int))
-        if not (this._sent and this._heads):
-            with gil:
-                PyErr_SetFromErrno(MemoryError)
-                PyErr_CheckSignals()
         this.offset = 0
        this.length = length
        this._b_i = 0
-        for i in range(length):
-            this._heads[i] = -1
-            this._unshiftable.push_back(0)
        memset(&this._empty_token, 0, sizeof(TokenC))
        this._empty_token.lex = &EMPTY_LEXEME
 
-    __dealloc__():
-        free(this._heads)
-
    void set_context_tokens(int* ids, int n) nogil:
        cdef int i, j
        if n == 1:
@@ -131,19 +133,20 @@ cdef cppclass StateC:
            ids[i] = -1
 
    int S(int i) nogil const:
-        if i >= this._stack.size():
+        cdef int stack_size = this._stack.size()
+        if i >= stack_size or i < 0:
            return -1
-        elif i < 0:
-            return -1
-        return this._stack.at(this._stack.size() - (i+1))
+        else:
+            return this._stack[stack_size - (i+1)]
 
    int B(int i) nogil const:
+        cdef int buf_size = this._rebuffer.size()
        if i < 0:
            return -1
-        elif i < this._rebuffer.size():
-            return this._rebuffer.at(this._rebuffer.size() - (i+1))
+        elif i < buf_size:
+            return this._rebuffer[buf_size - (i+1)]
        else:
-            b_i = this._b_i + (i - this._rebuffer.size())
+            b_i = this._b_i + (i - buf_size)
            if b_i >= this.length:
                return -1
            else:
@@ -242,7 +245,7 @@ cdef cppclass StateC:
            return 0
        elif this._sent[word].sent_start == 1:
            return 1
-        elif this._sent_starts.count(word) >= 1:
+        elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
            return 1
        else:
            return 0
@@ -327,7 +330,7 @@ cdef cppclass StateC:
        if item >= this._unshiftable.size():
            return 0
        else:
-            return this._unshiftable.at(item)
+            return this._unshiftable[item]
 
    void set_reshiftable(int item) nogil:
        if item < this._unshiftable.size():
@@ -347,6 +350,9 @@ cdef cppclass StateC:
        this._heads[child] = head
 
    void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
+        cdef vector[ArcC]* arcs
+        cdef ArcC* arc
+
        arcs_it = heads_arcs.find(h_i)
        if arcs_it == heads_arcs.end():
            return
@@ -355,12 +361,12 @@ cdef cppclass StateC:
        if arcs.size() == 0:
            return
 
-        arc = arcs.back()
+        arc = &arcs.back()
        if arc.head == h_i and arc.child == c_i:
            arcs.pop_back()
        else:
            for i in range(arcs.size()-1):
-                arc = arcs.at(i)
+                arc = &deref(arcs)[i]
                if arc.head == h_i and arc.child == c_i:
                    arc.head = -1
                    arc.child = -1
@@ -400,10 +406,11 @@ cdef cppclass StateC:
        this._rebuffer = src._rebuffer
        this._sent_starts = src._sent_starts
        this._unshiftable = src._unshiftable
-        memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
+        this._heads = src._heads
        this._ents = src._ents
        this._left_arcs = src._left_arcs
        this._right_arcs = src._right_arcs
        this._b_i = src._b_i
        this.offset = src.offset
        this._empty_token = src._empty_token
+        this.history = src.history

@@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
         return list(arcs)
 
     def has_gold(self, Example eg, start=0, end=None):
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.dep != 0:
                 return True
@@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
                     state.print_state()
                 )))
             action.do(state.c, action.label)
+            state.c.history.push_back(i)
             break
        else:
            failed = False

spacy/pipeline/_parser_internals/batch.pxd (new file)
@@ -0,0 +1,2 @@
cdef class Batch:
    pass

spacy/pipeline/_parser_internals/batch.pyx (new file)
@@ -0,0 +1,52 @@
from typing import Any

TransitionSystem = Any  # TODO

cdef class Batch:
    def advance(self, scores):
        raise NotImplementedError

    def get_states(self):
        raise NotImplementedError

    @property
    def is_done(self):
        raise NotImplementedError

    def get_unfinished_states(self):
        raise NotImplementedError

    def __getitem__(self, i):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError


class GreedyBatch(Batch):
    def __init__(self, moves: TransitionSystem, states, golds):
        self._moves = moves
        self._states = states
        self._next_states = [s for s in states if not s.is_final()]

    def advance(self, scores):
        self._next_states = self._moves.transition_states(self._next_states, scores)

    def advance_with_actions(self, actions):
        self._next_states = self._moves.apply_actions(self._next_states, actions)

    def get_states(self):
        return self._states

    @property
    def is_done(self):
        return all(s.is_final() for s in self._states)

    def get_unfinished_states(self):
        return [st for st in self._states if not st.is_final()]

    def __getitem__(self, i):
        return self._states[i]

    def __len__(self):
        return len(self._states)

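A minimal sketch of how the forward pass consumes this interface; the `moves`
and `states` values are assumed to come from a transition system as in
`_forward_fallback` above, and `score_states` is a hypothetical scorer:

    batch = GreedyBatch(moves, states, None)
    while not batch.is_done:
        scores = score_states(batch.get_unfinished_states())  # hypothetical
        batch.advance(scores)
    final_states = list(batch)
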
@@ -306,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
         for span in eg.y.spans.get(neg_key, []):
             if span.start >= start and span.end <= end:
                 return True
+        if end is not None and end < 0:
+            end = None
         for word in eg.y[start:end]:
             if word.ent_iob != 0:
                 return True

@@ -20,6 +20,10 @@ cdef class StateClass:
         if self._borrowed != 1:
             del self.c
 
+    @property
+    def history(self):
+        return list(self.c.history)
+
     @property
     def stack(self):
         return [self.S(i) for i in range(self.c.stack_depth())]
@@ -176,3 +180,6 @@ cdef class StateClass:
 
     def clone(self, StateClass src):
         self.c.clone(src.c)
+
+    def set_context_tokens(self, int[:, :] output, int row, int n_feats):
+        self.c.set_context_tokens(&output[row, 0], n_feats)

@@ -53,3 +53,10 @@ cdef class TransitionSystem:
 
     cdef int set_costs(self, int* is_valid, weight_t* costs,
                        const StateC* state, gold) except -1
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+                          int batch_size) nogil
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+                             int nr_class, int batch_size) nogil

@@ -1,6 +1,8 @@
 # cython: infer_types=True
 from __future__ import print_function
 from cymem.cymem cimport Pool
+from libc.stdlib cimport calloc, free
+from libcpp.vector cimport vector
 
 from collections import Counter
 import srsly
@@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
 from ...tokens.doc cimport Doc
 from ...structs cimport TokenC
 from .stateclass cimport StateClass
+from ._parser_utils cimport arg_max_if_valid
 
 from ...errors import Errors
 from ... import util
@@ -73,7 +76,18 @@ cdef class TransitionSystem:
             offset += len(doc)
         return states
 
+    def follow_history(self, doc, history):
+        cdef int clas
+        cdef StateClass state = StateClass(doc)
+        for clas in history:
+            action = self.c[clas]
+            action.do(state.c, action.label)
+            state.c.history.push_back(clas)
+        return state
+
     def get_oracle_sequence(self, Example example, _debug=False):
+        if not self.has_gold(example):
+            return []
         states, golds, _ = self.init_gold_batch([example])
         if not states:
             return []
@@ -85,6 +99,8 @@ cdef class TransitionSystem:
         return self.get_oracle_sequence_from_state(state, gold)
 
     def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
+        if state.is_final():
+            return []
         cdef Pool mem = Pool()
         # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
         assert self.n_moves > 0
@@ -110,6 +126,7 @@ cdef class TransitionSystem:
                     "S0 head?", str(state.has_head(state.S(0))),
                 )))
                 action.do(state.c, action.label)
+                state.c.history.push_back(i)
                 break
         else:
             if _debug:
@@ -137,6 +154,28 @@ cdef class TransitionSystem:
             raise ValueError(Errors.E170.format(name=name))
         action = self.lookup_transition(name)
         action.do(state.c, action.label)
+        state.c.history.push_back(action.clas)
+
+    def apply_actions(self, states, const int[::1] actions):
+        assert len(states) == actions.shape[0]
+        cdef StateClass state
+        cdef vector[StateC*] c_states
+        c_states.resize(len(states))
+        cdef int i
+        for (i, state) in enumerate(states):
+            c_states[i] = state.c
+        c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
+        return [state for state in states if not state.c.is_final()]
+
+    def transition_states(self, states, float[:, ::1] scores):
+        assert len(states) == scores.shape[0]
+        cdef StateClass state
+        cdef float* c_scores = &scores[0, 0]
+        cdef vector[StateC*] c_states
+        for state in states:
+            c_states.push_back(state.c)
+        c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
+        return [state for state in states if not state.c.is_final()]
 
     cdef Transition lookup_transition(self, object name) except *:
         raise NotImplementedError
@@ -250,3 +289,35 @@ cdef class TransitionSystem:
         self.cfg.update(msg['cfg'])
         self.initialize_actions(labels)
         return self
+
+
+cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
+                          int batch_size) nogil:
+    cdef int i
+    cdef Transition action
+    cdef StateC* state
+    for i in range(batch_size):
+        state = states[i]
+        action = moves.c[actions[i]]
+        action.do(state, action.label)
+        state.history.push_back(action.clas)
+
+
+cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
+                             int nr_class, int batch_size) nogil:
+    is_valid = <int*>calloc(moves.n_moves, sizeof(int))
+    cdef int i, guess
+    cdef Transition action
+    for i in range(batch_size):
+        moves.set_valid(is_valid, states[i])
+        guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
+        if guess == -1:
+            # This shouldn't happen, but it's hard to raise an error here,
+            # and we don't want to infinite loop. So, force to end state.
+            states[i].force_final()
+        else:
+            action = moves.c[guess]
+            action.do(states[i], action.label)
+            states[i].history.push_back(guess)
+    free(is_valid)

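A sketch of the difference between the two batched entry points added above
(the `moves`, `states`, `scores`, and `actions` values are hypothetical; the
shapes follow the signatures):

    # Free-running: pick the best valid transition per state from scores
    # of shape (n_states, n_classes).
    states = moves.transition_states(states, scores)
    # Teacher-forced: apply one externally chosen action per state, e.g.
    # replaying a recorded history during distillation.
    states = moves.apply_actions(states, actions)  # actions: int32[n_states]
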
@@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
-from ._parser_internals.arc_eager cimport ArcEager
+from .transition_parser import Parser
+from ._parser_internals.arc_eager import ArcEager
 
 from .functions import merge_subtokens
 from ..language import Language
@@ -18,12 +18,11 @@ from ..util import registry
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -123,6 +122,7 @@ def make_parser(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_parser",
     assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
 
     DOCS: https://spacy.io/api/dependencyparser#score
     """
+
     def has_sents(doc):
         return doc.has_annotation("SENT_START")
 
@@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
         dep = getattr(token, attr)
         dep = token.vocab.strings.as_string(dep).lower()
         return dep
+
     results = {}
-    results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
+    results.update(
+        Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
+    )
     kwargs.setdefault("getter", dep_getter)
     kwargs.setdefault("ignore_labels", ("p", "punct"))
     results.update(Scorer.score_deps(examples, "dep", **kwargs))
@@ -249,11 +253,12 @@ def make_parser_scorer():
     return parser_score
 
 
-cdef class DependencyParser(Parser):
+class DependencyParser(Parser):
     """Pipeline component for dependency parsing.
 
     DOCS: https://spacy.io/api/dependencyparser
     """
+
     TransitionSystem = ArcEager
 
     def __init__(
@@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
         incorrect_spans_key=None,
         scorer=parser_score,
     ):
-        """Create a DependencyParser.
-        """
+        """Create a DependencyParser."""
         super().__init__(
             vocab,
             model,

@@ -155,6 +155,25 @@ class EditTreeLemmatizer(TrainablePipe):
 
         return float(loss), d_scores
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def predict(self, docs: Iterable[Doc]) -> ActivationsT:
         n_docs = len(list(docs))
         if not any(len(doc) for doc in docs):

spacy/pipeline/entityruler.py (new, empty file)

@@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
 from thinc.api import Model, Config
 
 from ._parser_internals.transition_system import TransitionSystem
-from .transition_parser cimport Parser
-from ._parser_internals.ner cimport BiluoPushDown
+from .transition_parser import Parser
+from ._parser_internals.ner import BiluoPushDown
 from ..language import Language
 from ..scorer import get_ner_prf, PRFScore
+from ..training import validate_examples
 from ..util import registry
 from ..training import remove_bilu_prefix
 
 
 default_model_config = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 hidden_width = 64
 maxout_pieces = 2
-use_upper = true
 
 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v2"
@@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
         "incorrect_spans_key": None,
         "scorer": {"@scorers": "spacy.ner_scorer.v1"},
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_ner(
     nlp: Language,
@@ -98,6 +102,7 @@ def make_ner(
         scorer=scorer,
     )
 
+
 @Language.factory(
     "beam_ner",
     assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@@ -111,7 +116,12 @@ def make_ner(
         "incorrect_spans_key": None,
         "scorer": None,
     },
-    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
+    default_score_weights={
+        "ents_f": 1.0,
+        "ents_p": 0.0,
+        "ents_r": 0.0,
+        "ents_per_type": None,
+    },
 )
 def make_beam_ner(
     nlp: Language,
@@ -185,11 +195,12 @@ def make_ner_scorer():
     return ner_score
 
 
-cdef class EntityRecognizer(Parser):
+class EntityRecognizer(Parser):
     """Pipeline component for named entity recognition.
 
     DOCS: https://spacy.io/api/entityrecognizer
     """
+
     TransitionSystem = BiluoPushDown
 
     def __init__(
@@ -207,8 +218,7 @@ cdef class EntityRecognizer(Parser):
         incorrect_spans_key=None,
         scorer=ner_score,
     ):
-        """Create an EntityRecognizer.
-        """
+        """Create an EntityRecognizer."""
         super().__init__(
             vocab,
             model,

@@ -87,6 +87,10 @@ cdef class Pipe:
             return self.scorer(examples, **scorer_kwargs)
         return {}
 
+    @property
+    def is_distillable(self) -> bool:
+        return False
+
     @property
     def is_trainable(self) -> bool:
         return False

@@ -1,5 +1,6 @@
 # cython: infer_types=True, profile=True, binding=True
 from typing import Callable, Dict, Iterable, List, Optional, Union
+from typing import Tuple
 import numpy
 import srsly
 from thinc.api import Model, set_dropout_rate, Config
@@ -245,7 +246,6 @@ class Tagger(TrainablePipe):
 
         DOCS: https://spacy.io/api/tagger#rehearse
         """
-        loss_func = LegacySequenceCategoricalCrossentropy()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -259,12 +259,32 @@ class Tagger(TrainablePipe):
         set_dropout_rate(self.model, drop)
         tag_scores, bp_tag_scores = self.model.begin_update(docs)
         tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
-        grads, loss = loss_func(tag_scores, tutor_tag_scores)
+        loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
         bp_tag_scores(grads)
-        self.finish_update(sgd)
+        if sgd is not None:
+            self.finish_update(sgd)
         losses[self.name] += loss
         return losses
 
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
+        """
+        loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
+        d_scores, loss = loss_func(student_scores, teacher_scores)
+        if self.model.ops.xp.isnan(loss):
+            raise ValueError(Errors.E910.format(name=self.name))
+        return float(loss), d_scores
+
     def get_loss(self, examples, scores):
         """Find the loss and gradient of loss for the batch of documents and
         their predicted scores.

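A rough sketch of calling the new loss helper directly; the `teacher_tagger`
and `student_tagger` components and the `docs` batch are assumptions, not
part of this commit:

    teacher_scores = teacher_tagger.model.predict(docs)
    student_scores, backprop = student_tagger.model.begin_update(docs)
    loss, d_scores = student_tagger.get_teacher_student_loss(
        teacher_scores, student_scores
    )
    backprop(d_scores)
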
@@ -77,7 +77,7 @@ subword_features = true
     default_config={
         "threshold": 0.0,
         "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
         "save_activations": False,
     },
     default_score_weights={

@@ -74,7 +74,7 @@ subword_features = true
     default_config={
         "threshold": 0.5,
         "model": DEFAULT_MULTI_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+        "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
         "save_activations": False,
     },
     default_score_weights={

@ -6,7 +6,7 @@ import warnings
|
||||||
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
|
|
||||||
from ..training import validate_examples
|
from ..training import validate_examples, validate_distillation_examples
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
from .pipe import Pipe, deserialize_config
|
from .pipe import Pipe, deserialize_config
|
||||||
from .. import util
|
from .. import util
|
||||||
|
@@ -56,6 +56,53 @@ cdef class TrainablePipe(Pipe):
         except Exception as e:
             error_handler(self.name, self, [doc], e)

+    def distill(self,
+                teacher_pipe: Optional["TrainablePipe"],
+                examples: Iterable["Example"],
+                *,
+                drop: float=0.0,
+                sgd: Optional[Optimizer]=None,
+                losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is typically trained on the probability
+        distribution of the teacher, but details may differ per pipe.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/pipe#distill
+        """
+        # By default we require a teacher pipe, but there are downstream
+        # implementations that don't require a pipe.
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+        validate_distillation_examples(examples, "TrainablePipe.distill")
+        set_dropout_rate(self.model, drop)
+        for node in teacher_pipe.model.walk():
+            if node.name == "softmax":
+                node.attrs["softmax_normalize"] = True
+        teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
+        student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        bp_student_scores(d_scores)
+        if sgd is not None:
+            self.finish_update(sgd)
+        losses[self.name] += loss
+        return losses
+
     def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
         """Apply the pipe to a stream of documents. This usually happens under
         the hood when the nlp object is called on a text and all components are
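A usage sketch of the new method (illustration only; it mirrors the tests further down and assumes a trained `teacher` pipeline, its `optimizer`, and `TRAIN_DATA` pairs):

    import spacy
    from spacy.training import Example

    teacher_tagger = teacher.get_pipe("tagger")
    student = spacy.blank("en")
    student_tagger = student.add_pipe("tagger")
    student_tagger.initialize(
        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
    )
    # Distillation examples carry no gold annotations: the teacher's
    # predictions on eg.reference act as the target distribution.
    distill_examples = [
        Example.from_dict(teacher.make_doc(text), {}) for text, _ in TRAIN_DATA
    ]
    for _ in range(50):
        losses = {}
        student_tagger.distill(
            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
        )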
@@ -169,6 +216,19 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))

+    def get_teacher_student_loss(self, teacher_scores, student_scores):
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
+        """
+        raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
+
     def create_optimizer(self) -> Optimizer:
         """Create an optimizer for the pipeline component.

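A minimal sketch of what a concrete override can look like (assuming 2D logit arrays; the parser further down hand-rolls essentially this computation):

    def get_teacher_student_loss(self, teacher_scores, student_scores):
        # Softmax both score matrices, then use the squared-error gradient.
        xp = self.model.ops.xp
        teacher = self.model.ops.softmax(xp.vstack(teacher_scores), axis=-1)
        student = self.model.ops.softmax(xp.vstack(student_scores), axis=-1)
        d_scores = student - teacher
        loss = (d_scores ** 2).sum() / d_scores.size
        return float(loss), d_scores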
@@ -205,6 +265,14 @@ cdef class TrainablePipe(Pipe):
         """
         raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))

+    @property
+    def is_distillable(self) -> bool:
+        # Normally a pipe overrides `get_teacher_student_loss` to implement
+        # distillation. In more exceptional cases, a pipe can provide its
+        # own `distill` implementation. If neither of these methods is
+        # overridden, the pipe does not implement distillation.
+        return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
+
     @property
     def is_trainable(self) -> bool:
         return True
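The property only inspects the class dict, so it can be queried before initialization; the tests below rely on this. A quick check (sketch):

    import spacy

    nlp = spacy.blank("en")
    for name in ("tagger", "ner", "parser"):
        # Each of these overrides get_teacher_student_loss or distill.
        assert nlp.add_pipe(name).is_distillable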
@@ -1,21 +0,0 @@
-from cymem.cymem cimport Pool
-from thinc.backends.cblas cimport CBlas
-
-from ..vocab cimport Vocab
-from .trainable_pipe cimport TrainablePipe
-from ._parser_internals.transition_system cimport Transition, TransitionSystem
-from ._parser_internals._state cimport StateC
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
-
-
-cdef class Parser(TrainablePipe):
-    cdef public object _rehearsal_model
-    cdef readonly TransitionSystem moves
-    cdef public object _multitasks
-    cdef object _cpu_ops
-
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil
-
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil
@@ -1,5 +1,6 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
 from __future__ import print_function
+from typing import Dict, Iterable, List, Optional, Tuple
 from cymem.cymem cimport Pool
 cimport numpy as np
 from itertools import islice
@@ -7,25 +8,30 @@ from libcpp.vector cimport vector
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport calloc, free
 import random
+import contextlib

 import srsly
-from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
+from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
+from thinc.api import chain, softmax_activation, use_ops, get_array_module
+from thinc.legacy import LegacySequenceCategoricalCrossentropy
+from thinc.types import Floats2d, Ints1d
 import numpy.random
 import numpy
 import warnings

-from ._parser_internals.stateclass cimport StateClass
+from ..ml.tb_framework import TransitionModelInputs
+from ._parser_internals.stateclass cimport StateC, StateClass
 from ._parser_internals.search cimport Beam
-from ..ml.parser_model cimport alloc_activations, free_activations
-from ..ml.parser_model cimport predict_states, arg_max_if_valid
-from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
-from ..ml.parser_model cimport get_c_weights, get_c_sizes
 from ..tokens.doc cimport Doc
-from .trainable_pipe import TrainablePipe
+from .trainable_pipe cimport TrainablePipe
 from ._parser_internals cimport _beam_utils
 from ._parser_internals import _beam_utils
+from ..vocab cimport Vocab
+from ._parser_internals.transition_system cimport Transition, TransitionSystem
+from ..typedefs cimport weight_t

 from ..training import validate_examples, validate_get_examples
+from ..training import validate_distillation_examples
 from ..errors import Errors, Warnings
 from .. import util
@@ -33,7 +39,7 @@ from .. import util
 NUMPY_OPS = NumpyOps()


-cdef class Parser(TrainablePipe):
+class Parser(TrainablePipe):
     """
     Base class of the DependencyParser and EntityRecognizer.
     """
@@ -133,8 +139,9 @@ cdef class Parser(TrainablePipe):
     @property
     def move_names(self):
         names = []
+        cdef TransitionSystem moves = self.moves
         for i in range(self.moves.n_moves):
-            name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
+            name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
             # Explicitly removing the internal "U-" token used for blocking entities
             if name != "U-":
                 names.append(name)
@@ -203,6 +210,118 @@ cdef class Parser(TrainablePipe):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError

+    def distill(self,
+                teacher_pipe: Optional[TrainablePipe],
+                examples: Iterable["Example"],
+                *,
+                drop: float=0.0,
+                sgd: Optional[Optimizer]=None,
+                losses: Optional[Dict[str, float]]=None):
+        """Train a pipe (the student) on the predictions of another pipe
+        (the teacher). The student is trained on the transition probabilities
+        of the teacher.
+
+        teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
+            from.
+        examples (Iterable[Example]): Distillation examples. The reference
+            and predicted docs must have the same number of tokens and the
+            same orthography.
+        drop (float): dropout rate.
+        sgd (Optional[Optimizer]): An optimizer. Will be created via
+            create_optimizer if not set.
+        losses (Optional[Dict[str, float]]): Optional record of loss during
+            distillation.
+        RETURNS: The updated losses dictionary.
+
+        DOCS: https://spacy.io/api/dependencyparser#distill
+        """
+        if teacher_pipe is None:
+            raise ValueError(Errors.E4002.format(name=self.name))
+        if losses is None:
+            losses = {}
+        losses.setdefault(self.name, 0.0)
+
+        validate_distillation_examples(examples, "TransitionParser.distill")
+
+        set_dropout_rate(self.model, drop)
+
+        student_docs = [eg.predicted for eg in examples]
+
+        max_moves = self.cfg["update_with_oracle_cut_size"]
+        if max_moves >= 1:
+            # Chop sequences into lengths of this many words, to make the
+            # batch uniform length. Since we do not have a gold standard
+            # sequence, we use the teacher's predictions as the gold
+            # standard.
+            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
+            states = self._init_batch(teacher_pipe, student_docs, max_moves)
+        else:
+            states = self.moves.init_batch(student_docs)
+
+        # We distill as follows: (1) we first let the student predict transition
+        # sequences (and the corresponding transition probabilities); (2) we
+        # let the teacher follow the student's predicted transition sequences
+        # to obtain the teacher's transition probabilities; (3) we compute the
+        # gradients of the student's transition distributions relative to the
+        # teacher's distributions.
+
+        student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
+            max_moves=max_moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
+            moves=self.moves, actions=actions)
+        (_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
+
+        loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
+        backprop_scores((student_states, d_scores))
+
+        if sgd is not None:
+            self.finish_update(sgd)
+
+        losses[self.name] += loss
+
+        return losses
+
+    def get_teacher_student_loss(
+        self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
+        normalize: bool=False,
+    ) -> Tuple[float, List[Floats2d]]:
+        """Calculate the loss and its gradient for a batch of student
+        scores, relative to teacher scores.
+
+        teacher_scores: Scores representing the teacher model's predictions.
+        student_scores: Scores representing the student model's predictions.
+
+        RETURNS (Tuple[float, float]): The loss and the gradient.
+
+        DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
+        """
+
+        # We can't easily hook up a softmax layer in the parsing model, since
+        # the get_loss does additional masking. So, we could apply softmax
+        # manually here and use Thinc's cross-entropy loss. But it's a bit
+        # suboptimal, since we can have a lot of states that would result in
+        # many kernel launches. Furthermore, the parsing model's backprop
+        # expects an XP array, so we'd have to concat the softmaxes anyway.
+        # So, like the get_loss implementation, we'll compute the loss and
+        # gradients ourselves.
+
+        teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
+            axis=-1, inplace=True)
+        student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
+            axis=-1, inplace=True)
+
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = student_scores - teacher_scores
+        if normalize:
+            d_scores /= d_scores.shape[0]
+        loss = (d_scores**2).sum() / d_scores.size
+
+        return float(loss), d_scores
+
     def init_multitask_objectives(self, get_examples, pipeline, **cfg):
         """Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
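To make the gradient concrete, a numpy illustration (not from the commit): d_scores is softmax(student) - softmax(teacher), the derivative of the squared error between the two distributions.

    import numpy

    def softmax(x):
        e = numpy.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    teacher_logits = numpy.array([[2.0, 0.0]])  # teacher prefers transition 0
    student_logits = numpy.array([[0.0, 0.0]])  # student is undecided

    d_scores = softmax(student_logits) - softmax(teacher_logits)
    loss = (d_scores ** 2).sum() / d_scores.size
    print(d_scores)               # approx. [[-0.38  0.38]]: push toward transition 0
    print(round(float(loss), 3))  # approx. 0.145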
@@ -223,9 +342,6 @@ cdef class Parser(TrainablePipe):

         stream: The sequence of documents to process.
         batch_size (int): Number of documents to accumulate into a working set.
-        error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
-            deals with a failing batch of documents. The default function just reraises
-            the exception.

         YIELDS (Doc): Documents, in order.
         """
@@ -247,78 +363,29 @@ cdef class Parser(TrainablePipe):
     def predict(self, docs):
         if isinstance(docs, Doc):
             docs = [docs]
+        self._ensure_labels_are_added(docs)
         if not any(len(doc) for doc in docs):
             result = self.moves.init_batch(docs)
             return result
-        if self.cfg["beam_width"] == 1:
-            return self.greedy_parse(docs, drop=0.0)
-        else:
-            return self.beam_parse(
-                docs,
-                drop=0.0,
-                beam_width=self.cfg["beam_width"],
-                beam_density=self.cfg["beam_density"]
-            )
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states_or_beams, _ = self.model.predict(inputs)
+        return states_or_beams

     def greedy_parse(self, docs, drop=0.):
-        cdef vector[StateC*] states
-        cdef StateClass state
-        cdef CBlas cblas = self._cpu_ops.cblas()
+        self._resize()
         self._ensure_labels_are_added(docs)
-        set_dropout_rate(self.model, drop)
-        batch = self.moves.init_batch(docs)
-        model = self.model.predict(docs)
-        weights = get_c_weights(model)
-        for state in batch:
-            if not state.is_final():
-                states.push_back(state.c)
-        sizes = get_c_sizes(model, states.size())
-        with nogil:
-            self._parseC(cblas, &states[0], weights, sizes)
-        model.clear_memory()
-        del model
-        return batch
+        with _change_attrs(self.model, beam_width=1):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            states, _ = self.model.predict(inputs)
+        return states

     def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
-        cdef Beam beam
-        cdef Doc doc
         self._ensure_labels_are_added(docs)
-        batch = _beam_utils.BeamBatch(
-            self.moves,
-            self.moves.init_batch(docs),
-            None,
-            beam_width,
-            density=beam_density
-        )
-        model = self.model.predict(docs)
-        while not batch.is_done:
-            states = batch.get_unfinished_states()
-            if not states:
-                break
-            scores = model.predict(states)
-            batch.advance(scores)
-        model.clear_memory()
-        del model
-        return list(batch)
-
-    cdef void _parseC(self, CBlas cblas, StateC** states,
-            WeightsC weights, SizesC sizes) nogil:
-        cdef int i, j
-        cdef vector[StateC*] unfinished
-        cdef ActivationsC activations = alloc_activations(sizes)
-        while sizes.states >= 1:
-            predict_states(cblas, &activations, states, &weights, sizes)
-            # Validate actions, argmax, take action.
-            self.c_transition_batch(states,
-                activations.scores, sizes.classes, sizes.states)
-            for i in range(sizes.states):
-                if not states[i].is_final():
-                    unfinished.push_back(states[i])
-            for i in range(unfinished.size()):
-                states[i] = unfinished[i]
-            sizes.states = unfinished.size()
-            unfinished.clear()
-        free_activations(&activations)
+        with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
+            inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+            beams, _ = self.model.predict(inputs)
+        return beams

     def set_annotations(self, docs, states_or_beams):
         cdef StateClass state
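An illustration (not part of the diff) of the `_change_attrs` pattern, defined at the bottom of this file: attributes are swapped in for the duration of the call and restored afterwards, so one model serves both greedy and beam decoding.

    # Sketch with a stand-in model; thinc models expose a dict-like `attrs`.
    from thinc.api import Model

    model = Model("dummy", lambda model, X, is_train: (X, lambda dY: dY))
    model.attrs["beam_width"] = 4

    with _change_attrs(model, beam_width=1, beam_density=0.0):
        assert model.attrs["beam_width"] == 1  # greedy decoding inside
    assert model.attrs["beam_width"] == 4      # restored afterwards
    assert "beam_density" not in model.attrs   # unset keys are removed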
@@ -330,35 +397,6 @@ cdef class Parser(TrainablePipe):
             for hook in self.postprocesses:
                 hook(doc)

-    def transition_states(self, states, float[:, ::1] scores):
-        cdef StateClass state
-        cdef float* c_scores = &scores[0, 0]
-        cdef vector[StateC*] c_states
-        for state in states:
-            c_states.push_back(state.c)
-        self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
-        return [state for state in states if not state.c.is_final()]
-
-    cdef void c_transition_batch(self, StateC** states, const float* scores,
-            int nr_class, int batch_size) nogil:
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        with gil:
-            assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-        is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
-        cdef int i, guess
-        cdef Transition action
-        for i in range(batch_size):
-            self.moves.set_valid(is_valid, states[i])
-            guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
-            if guess == -1:
-                # This shouldn't happen, but it's hard to raise an error here,
-                # and we don't want to infinite loop. So, force to end state.
-                states[i].force_final()
-            else:
-                action = self.moves.c[guess]
-                action.do(states[i], action.label)
-        free(is_valid)
-
     def update(self, examples, *, drop=0., sgd=None, losses=None):
         cdef StateClass state
         if losses is None:
@@ -370,67 +408,99 @@ cdef class Parser(TrainablePipe):
             )
         for multitask in self._multitasks:
             multitask.update(examples, drop=drop, sgd=sgd)
+        # We need to take care to act on the whole batch, because we might be
+        # getting vectors via a listener.
         n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
         if n_examples == 0:
             return losses
         set_dropout_rate(self.model, drop)
-        # The probability we use beam update, instead of falling back to
-        # a greedy update
-        beam_update_prob = self.cfg["beam_update_prob"]
-        if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
-            return self.update_beam(
-                examples,
-                beam_width=self.cfg["beam_width"],
-                sgd=sgd,
-                losses=losses,
-                beam_density=self.cfg["beam_density"]
-            )
+        docs = [eg.x for eg in examples if len(eg.x)]
         max_moves = self.cfg["update_with_oracle_cut_size"]
         if max_moves >= 1:
             # Chop sequences into lengths of this many words, to make the
             # batch uniform length.
-            max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
-            states, golds, _ = self._init_gold_batch(
+            max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
+            init_states, gold_states, _ = self._init_gold_batch(
                 examples,
                 max_length=max_moves
             )
         else:
-            states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
-        all_states = list(states)
-        states_golds = list(zip(states, golds))
-        n_moves = 0
-        while states_golds:
-            states, golds = zip(*states_golds)
-            scores, backprop = model.begin_update(states)
-            d_scores = self.get_batch_loss(states, golds, scores, losses)
-            # Note that the gradient isn't normalized by the batch size
-            # here, because our "samples" are really the states...But we
-            # can't normalize by the number of states either, as then we'd
-            # be getting smaller gradients for states in long sequences.
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, scores)
-            states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
-            if max_moves >= 1 and n_moves >= max_moves:
-                break
-            n_moves += 1
-
-        backprop_tok2vec(golds)
+            init_states, gold_states, _ = self.moves.init_gold_batch(examples)
+
+        inputs = TransitionModelInputs(docs=docs, moves=self.moves,
+            max_moves=max_moves, states=[state.copy() for state in init_states])
+        (pred_states, scores), backprop_scores = self.model.begin_update(inputs)
+        if sum(s.shape[0] for s in scores) == 0:
+            return losses
+        d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
+            examples, max_moves)
+        backprop_scores((pred_states, d_scores))
         if sgd not in (None, False):
             self.finish_update(sgd)
+        losses[self.name] += float((d_scores**2).sum())
         # Ugh, this is annoying. If we're working on GPU, we want to free the
         # memory ASAP. It seems that Python doesn't necessarily get around to
         # removing these in time if we don't explicitly delete? It's confusing.
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        del model
+        del backprop_scores
         return losses

+    def get_loss(self, states_scores, examples, max_moves):
+        gold_states, init_states, pred_states, scores = states_scores
+        scores = self.model.ops.xp.vstack(scores)
+        costs = self._get_costs_from_histories(
+            examples,
+            gold_states,
+            init_states,
+            [list(state.history) for state in pred_states],
+            max_moves
+        )
+        xp = get_array_module(scores)
+        best_costs = costs.min(axis=1, keepdims=True)
+        gscores = scores.copy()
+        min_score = scores.min() - 1000
+        assert costs.shape == scores.shape, (costs.shape, scores.shape)
+        gscores[costs > best_costs] = min_score
+        max_ = scores.max(axis=1, keepdims=True)
+        gmax = gscores.max(axis=1, keepdims=True)
+        exp_scores = xp.exp(scores - max_)
+        exp_gscores = xp.exp(gscores - gmax)
+        Z = exp_scores.sum(axis=1, keepdims=True)
+        gZ = exp_gscores.sum(axis=1, keepdims=True)
+        d_scores = exp_scores / Z
+        d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
+        return d_scores
+
+    def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
+        cdef TransitionSystem moves = self.moves
+        cdef StateClass state
+        cdef int clas
+        cdef int nF = self.model.get_dim("nF")
+        cdef int nO = moves.n_moves
+        cdef int nS = sum([len(history) for history in histories])
+        cdef Pool mem = Pool()
+        cdef np.ndarray costs_i
+        is_valid = <int*>mem.alloc(nO, sizeof(int))
+        batch = list(zip(init_states, histories, gold_states))
+        n_moves = 0
+        output = []
+        while batch:
+            costs = numpy.zeros((len(batch), nO), dtype="f")
+            for i, (state, history, gold) in enumerate(batch):
+                costs_i = costs[i]
+                clas = history.pop(0)
+                moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
+                action = moves.c[clas]
+                action.do(state.c, action.label)
+                state.c.history.push_back(clas)
+            output.append(costs)
+            batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
+            if n_moves >= max_moves >= 1:
+                break
+            n_moves += 1
+
+        return self.model.ops.xp.vstack(output)
+
     def rehearse(self, examples, sgd=None, losses=None, **cfg):
         """Perform a "rehearsal" update, to prevent catastrophic forgetting."""
         if losses is None:
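The gradient in get_loss is a softmax-margin: a regular softmax over all scores, minus a softmax restricted to the zero-cost ("gold") transitions. A numpy illustration (not part of the commit):

    import numpy

    scores = numpy.array([[2.0, 1.0, 0.5]])  # model logits for 3 transitions
    costs = numpy.array([[0.0, 0.0, 3.0]])   # transitions 0 and 1 are zero-cost

    best_costs = costs.min(axis=1, keepdims=True)
    gscores = scores.copy()
    gscores[costs > best_costs] = scores.min() - 1000  # mask non-gold transitions

    exp_scores = numpy.exp(scores - scores.max(axis=1, keepdims=True))
    exp_gscores = numpy.exp(gscores - gscores.max(axis=1, keepdims=True))
    d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)
    d_scores -= (costs <= best_costs) * (exp_gscores / exp_gscores.sum(axis=1, keepdims=True))
    print(d_scores)  # negative for gold transitions, positive for the rest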
@@ -440,10 +510,9 @@ cdef class Parser(TrainablePipe):
             multitask.rehearse(examples, losses=losses, sgd=sgd)
         if self._rehearsal_model is None:
             return None
-        losses.setdefault(self.name, 0.)
+        losses.setdefault(self.name, 0.0)
         validate_examples(examples, "Parser.rehearse")
         docs = [eg.predicted for eg in examples]
-        states = self.moves.init_batch(docs)
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
@@ -451,85 +520,33 @@ cdef class Parser(TrainablePipe):
         # Prepare the stepwise model, and get the callback for finishing the batch
         set_dropout_rate(self._rehearsal_model, 0.0)
         set_dropout_rate(self.model, 0.0)
-        tutor, _ = self._rehearsal_model.begin_update(docs)
-        model, backprop_tok2vec = self.model.begin_update(docs)
-        n_scores = 0.
-        loss = 0.
-        while states:
-            targets, _ = tutor.begin_update(states)
-            guesses, backprop = model.begin_update(states)
-            d_scores = (guesses - targets) / targets.shape[0]
+        student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
+        (student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
+        actions = states2actions(student_states)
+        teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
+        _, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
+
+        teacher_scores = self.model.ops.xp.vstack(teacher_scores)
+        student_scores = self.model.ops.xp.vstack(student_scores)
+        assert teacher_scores.shape == student_scores.shape
+
+        d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
         # If all weights for an output are 0 in the original model, don't
         # supervise that output. This allows us to add classes.
-            loss += (d_scores**2).sum()
-            backprop(d_scores)
-            # Follow the predicted action
-            self.transition_states(states, guesses)
-            states = [state for state in states if not state.is_final()]
-            n_scores += d_scores.size
-        # Do the backprop
-        backprop_tok2vec(docs)
+        loss = (d_scores**2).sum() / d_scores.size
+        backprop_scores((student_states, d_scores))
         if sgd is not None:
             self.finish_update(sgd)
-        losses[self.name] += loss / n_scores
-        del backprop
-        del backprop_tok2vec
-        model.clear_memory()
-        tutor.clear_memory()
-        del model
-        del tutor
+        losses[self.name] += loss
         return losses

     def update_beam(self, examples, *, beam_width,
             drop=0., sgd=None, losses=None, beam_density=0.0):
-        states, golds, _ = self.moves.init_gold_batch(examples)
-        if not states:
-            return losses
-        # Prepare the stepwise model, and get the callback for finishing the batch
-        model, backprop_tok2vec = self.model.begin_update(
-            [eg.predicted for eg in examples])
-        loss = _beam_utils.update_beam(
-            self.moves,
-            states,
-            golds,
-            model,
-            beam_width,
-            beam_density=beam_density,
-        )
-        losses[self.name] += loss
-        backprop_tok2vec(golds)
-        if sgd is not None:
-            self.finish_update(sgd)
+        raise NotImplementedError

-    def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
-        cdef StateClass state
-        cdef Pool mem = Pool()
-        cdef int i
-
-        # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
-        assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
-
-        is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
-        costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
-        cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
-                                        dtype='f', order='C')
-        c_d_scores = <float*>d_scores.data
-        unseen_classes = self.model.attrs["unseen_classes"]
-        for i, (state, gold) in enumerate(zip(states, golds)):
-            memset(is_valid, 0, self.moves.n_moves * sizeof(int))
-            memset(costs, 0, self.moves.n_moves * sizeof(float))
-            self.moves.set_costs(is_valid, costs, state.c, gold)
-            for j in range(self.moves.n_moves):
-                if costs[j] <= 0.0 and j in unseen_classes:
-                    unseen_classes.remove(j)
-            cpu_log_loss(c_d_scores,
-                costs, is_valid, &scores[i, 0], d_scores.shape[1])
-            c_d_scores += d_scores.shape[1]
-        # Note that we don't normalize this. See comment in update() for why.
-        if losses is not None:
-            losses.setdefault(self.name, 0.)
-            losses[self.name] += (d_scores**2).sum()
-        return d_scores
-
     def set_output(self, nO):
         self.model.attrs["resize_output"](self.model, nO)
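Rehearsal now reuses the distillation machinery: the frozen pre-update model replays the student's actions, and the gradient pulls the student's raw scores back toward it. A numpy illustration (not from the commit):

    import numpy

    student_scores = numpy.array([[1.0, 0.0], [0.5, 0.5]])  # current model
    teacher_scores = numpy.array([[2.0, 0.0], [0.0, 1.0]])  # frozen pre-update model

    d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
    loss = (d_scores ** 2).sum() / d_scores.size
    # The gradient points back toward the old model's scores, resisting
    # catastrophic forgetting while new updates pull the weights elsewhere.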
@@ -568,7 +585,7 @@ cdef class Parser(TrainablePipe):
         for example in islice(get_examples(), 10):
             doc_sample.append(example.predicted)
         assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
-        self.model.initialize(doc_sample)
+        self.model.initialize((doc_sample, self.moves))
         if nlp is not None:
             self.init_multitask_objectives(get_examples, nlp.pipeline)

@@ -625,28 +642,63 @@ cdef class Parser(TrainablePipe):
             raise ValueError(Errors.E149) from None
         return self

-    def _init_gold_batch(self, examples, max_length):
-        """Make a square batch, of length equal to the shortest transition
+    def _init_batch(self, teacher_step_model, docs, max_length):
+        """Make a square batch of length equal to the shortest transition
         sequence or a cap. A long
         doc will get multiple states. Let's say we have a doc of length 2*N,
         where N is the shortest doc. We'll make two states, one representing
-        long_doc[:N], and another representing long_doc[N:]."""
+        long_doc[:N], and another representing long_doc[N:]. In contrast to
+        _init_gold_batch, this version uses a teacher model to generate the
+        cut sequences."""
         cdef:
             StateClass start_state
             StateClass state
             Transition action
-        all_states = self.moves.init_batch([eg.predicted for eg in examples])
+        all_states = self.moves.init_batch(docs)
+        states = []
+        to_cut = []
+        for state, doc in zip(all_states, docs):
+            if not state.is_final():
+                if len(doc) < max_length:
+                    states.append(state)
+                else:
+                    to_cut.append(state)
+        while to_cut:
+            states.extend(state.copy() for state in to_cut)
+            # Move states forward max_length actions.
+            length = 0
+            while to_cut and length < max_length:
+                teacher_scores = teacher_step_model.predict(to_cut)
+                self.transition_states(to_cut, teacher_scores)
+                # States that are completed do not need further cutting.
+                to_cut = [state for state in to_cut if not state.is_final()]
+                length += 1
+        return states
+
+    def _init_gold_batch(self, examples, max_length):
+        """Make a square batch, of length equal to the shortest transition
+        sequence or a cap. A long doc will get multiple states. Let's say we
+        have a doc of length 2*N, where N is the shortest doc. We'll make
+        two states, one representing long_doc[:N], and another representing
+        long_doc[N:]."""
+        cdef:
+            StateClass start_state
+            StateClass state
+            Transition action
+            TransitionSystem moves = self.moves
+        all_states = moves.init_batch([eg.predicted for eg in examples])
         states = []
         golds = []
         to_cut = []
         for state, eg in zip(all_states, examples):
-            if self.moves.has_gold(eg) and not state.is_final():
-                gold = self.moves.init_gold(state, eg)
+            if moves.has_gold(eg) and not state.is_final():
+                gold = moves.init_gold(state, eg)
                 if len(eg.x) < max_length:
                     states.append(state)
                     golds.append(gold)
                 else:
-                    oracle_actions = self.moves.get_oracle_sequence_from_state(
+                    oracle_actions = moves.get_oracle_sequence_from_state(
                         state.copy(), gold)
                     to_cut.append((eg, state, gold, oracle_actions))
         if not to_cut:
@@ -656,13 +708,52 @@ cdef class Parser(TrainablePipe):
         for i in range(0, len(oracle_actions), max_length):
             start_state = state.copy()
             for clas in oracle_actions[i:i+max_length]:
-                action = self.moves.c[clas]
+                action = moves.c[clas]
                 action.do(state.c, action.label)
                 if state.is_final():
                     break
-            if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
+            if moves.has_gold(eg, start_state.B(0), state.B(0)):
                 states.append(start_state)
                 golds.append(gold)
             if state.is_final():
                 break
         return states, golds, max_length
+
+
+@contextlib.contextmanager
+def _change_attrs(model, **kwargs):
+    """Temporarily modify a thinc model's attributes."""
+    unset = object()
+    old_attrs = {}
+    for key, value in kwargs.items():
+        old_attrs[key] = model.attrs.get(key, unset)
+        model.attrs[key] = value
+    yield model
+    for key, value in old_attrs.items():
+        if value is unset:
+            model.attrs.pop(key)
+        else:
+            model.attrs[key] = value
+
+
+def states2actions(states: List[StateClass]) -> List[Ints1d]:
+    cdef int step
+    cdef StateClass state
+    cdef StateC* c_state
+    actions = []
+    while True:
+        step = len(actions)
+
+        step_actions = []
+        for state in states:
+            c_state = state.c
+            if step < c_state.history.size():
+                step_actions.append(c_state.history[step])
+
+        # We are done if we have exhausted all histories.
+        if len(step_actions) == 0:
+            break
+
+        actions.append(numpy.array(step_actions, dtype="i"))
+
+    return actions
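states2actions transposes per-state action histories into per-step action arrays, which is how the teacher is made to replay the student's transitions. A pure-Python illustration (not from the commit):

    import numpy

    # Two parser states: the first took 4 actions, the second only 2.
    histories = [[0, 2, 3, 1], [0, 1]]

    actions = []
    step = 0
    while True:
        step_actions = [h[step] for h in histories if step < len(h)]
        if not step_actions:
            break
        actions.append(numpy.array(step_actions, dtype="i"))
        step += 1

    # Arrays shrink as shorter histories are exhausted:
    # [array([0, 0]), array([2, 1]), array([3]), array([1])]
    print(actions)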
@@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
 from spacy.training import Example, iob_to_biluo, split_bilu_label
 from spacy.tokens import Doc, Span
 from spacy.vocab import Vocab
+from thinc.api import fix_random_seed
 import logging

 from ..util import make_tempdir
@@ -412,7 +413,7 @@ def test_train_empty():
         train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
     ner = nlp.add_pipe("ner", last=True)
     ner.add_label("PERSON")
-    nlp.initialize()
+    nlp.initialize(get_examples=lambda: train_examples)
     for itn in range(2):
         losses = {}
         batches = util.minibatch(train_examples, size=8)
@@ -539,11 +540,11 @@ def test_block_ner():
     assert [token.ent_type_ for token in doc] == expected_types


-@pytest.mark.parametrize("use_upper", [True, False])
-def test_overfitting_IO(use_upper):
+def test_overfitting_IO():
+    fix_random_seed(1)
     # Simple test to try and quickly overfit the NER component
     nlp = English()
-    ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
+    ner = nlp.add_pipe("ner", config={"model": {}})
     train_examples = []
     for text, annotations in TRAIN_DATA:
         train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
     assert ents2[0].label_ == "LOC"
     # Ensure that the predictions are still the same, even after adding a new label
     ner2 = nlp2.get_pipe("ner")
-    assert ner2.model.attrs["has_upper"] == use_upper
     ner2.add_label("RANDOM_NEW_LABEL")
     doc3 = nlp2(test_text)
     ents3 = doc3.ents
@@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
     assert ents[1].kb_id == 0


+def test_is_distillable():
+    nlp = English()
+    ner = nlp.add_pipe("ner")
+    assert ner.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_ner = teacher.add_pipe("ner")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for ent in annotations.get("entities"):
+            teacher_ner.add_label(ent[2])
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.00001
+
+    student = English()
+    student_ner = student.add_pipe("ner")
+    student_ner.initialize(
+        get_examples=lambda: train_examples, labels=teacher_ner.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(100):
+        losses = {}
+        student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
+    assert losses["ner"] < 0.0001
+
+    # test the trained model
+    test_text = "I like London."
+    doc = student(test_text)
+    ents = doc.ents
+    assert len(ents) == 1
+    assert ents[0].text == "London"
+    assert ents[0].label_ == "LOC"
+
+
 def test_beam_ner_scores():
     # Test that we can get confidence values out of the beam_ner pipe
     beam_width = 16
@@ -1,13 +1,17 @@
+import itertools
 import pytest
+import numpy
 from numpy.testing import assert_equal
 from thinc.api import Adam

 from spacy import registry, util
 from spacy.attrs import DEP, NORM
 from spacy.lang.en import English
-from spacy.tokens import Doc
 from spacy.training import Example
+from spacy.tokens import Doc
 from spacy.vocab import Vocab
+from spacy import util, registry
+from thinc.api import fix_random_seed

 from ...pipeline import DependencyParser
 from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
@@ -59,6 +63,8 @@ PARTIAL_DATA = [
     ),
 ]

+PARSERS = ["parser"]  # TODO: Test beam_parser when ready
+
 eps = 0.1


|
||||||
assert doc[0].dep != 0
|
assert doc[0].dep != 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_parser_apply_actions(en_vocab, en_parser):
|
||||||
|
words = ["I", "ate", "pizza"]
|
||||||
|
words2 = ["Eat", "more", "pizza", "!"]
|
||||||
|
doc1 = Doc(en_vocab, words=words)
|
||||||
|
doc2 = Doc(en_vocab, words=words2)
|
||||||
|
docs = [doc1, doc2]
|
||||||
|
|
||||||
|
moves = en_parser.moves
|
||||||
|
moves.add_action(0, "")
|
||||||
|
moves.add_action(1, "")
|
||||||
|
moves.add_action(2, "nsubj")
|
||||||
|
moves.add_action(3, "obj")
|
||||||
|
moves.add_action(2, "amod")
|
||||||
|
|
||||||
|
actions = [
|
||||||
|
numpy.array([0, 0], dtype="i"),
|
||||||
|
numpy.array([2, 0], dtype="i"),
|
||||||
|
numpy.array([0, 4], dtype="i"),
|
||||||
|
numpy.array([3, 3], dtype="i"),
|
||||||
|
numpy.array([1, 1], dtype="i"),
|
||||||
|
numpy.array([1, 1], dtype="i"),
|
||||||
|
numpy.array([0], dtype="i"),
|
||||||
|
numpy.array([1], dtype="i"),
|
||||||
|
]
|
||||||
|
|
||||||
|
states = moves.init_batch(docs)
|
||||||
|
active_states = states
|
||||||
|
|
||||||
|
for step_actions in actions:
|
||||||
|
active_states = moves.apply_actions(active_states, step_actions)
|
||||||
|
|
||||||
|
assert len(active_states) == 0
|
||||||
|
|
||||||
|
for (state, doc) in zip(states, docs):
|
||||||
|
moves.set_annotations(state, doc)
|
||||||
|
|
||||||
|
assert docs[0][0].head.i == 1
|
||||||
|
assert docs[0][0].dep_ == "nsubj"
|
||||||
|
assert docs[0][1].head.i == 1
|
||||||
|
assert docs[0][1].dep_ == "ROOT"
|
||||||
|
assert docs[0][2].head.i == 1
|
||||||
|
assert docs[0][2].dep_ == "obj"
|
||||||
|
|
||||||
|
assert docs[1][0].head.i == 0
|
||||||
|
assert docs[1][0].dep_ == "ROOT"
|
||||||
|
assert docs[1][1].head.i == 2
|
||||||
|
assert docs[1][1].dep_ == "amod"
|
||||||
|
assert docs[1][2].head.i == 0
|
||||||
|
assert docs[1][2].dep_ == "obj"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skip(
|
@pytest.mark.skip(
|
||||||
reason="The step_through API was removed (but should be brought back)"
|
reason="The step_through API was removed (but should be brought back)"
|
||||||
)
|
)
|
||||||
|
@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
|
||||||
DependencyParser(en_vocab, model)
|
DependencyParser(en_vocab, model)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
@pytest.mark.parametrize("pipe_name", PARSERS)
|
||||||
def test_incomplete_data(pipe_name):
|
def test_incomplete_data(pipe_name):
|
||||||
# Test that the parser works with incomplete information
|
# Test that the parser works with incomplete information
|
||||||
nlp = English()
|
nlp = English()
|
||||||
|
@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
|
||||||
assert doc[2].head.i == 1
|
assert doc[2].head.i == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
|
@pytest.mark.parametrize(
|
||||||
def test_overfitting_IO(pipe_name):
|
"pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
|
||||||
|
)
|
||||||
|
def test_overfitting_IO(pipe_name, max_moves):
|
||||||
|
fix_random_seed(0)
|
||||||
# Simple test to try and quickly overfit the dependency parser (normal or beam)
|
# Simple test to try and quickly overfit the dependency parser (normal or beam)
|
||||||
nlp = English()
|
nlp = English()
|
||||||
parser = nlp.add_pipe(pipe_name)
|
parser = nlp.add_pipe(pipe_name)
|
||||||
|
parser.cfg["update_with_oracle_cut_size"] = max_moves
|
||||||
train_examples = []
|
train_examples = []
|
||||||
for text, annotations in TRAIN_DATA:
|
for text, annotations in TRAIN_DATA:
|
||||||
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
@@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
     assert_equal(batch_deps_1, no_batch_deps)


+def test_is_distillable():
+    nlp = English()
+    parser = nlp.add_pipe("parser")
+    assert parser.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_parser = teacher.add_pipe("parser")
+    train_examples = []
+    for text, annotations in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
+        for dep in annotations.get("deps", []):
+            teacher_parser.add_label(dep)
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(200):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["parser"] < 0.0001
+
+    student = English()
+    student_parser = student.add_pipe("parser")
+    student_parser.initialize(
+        get_examples=lambda: train_examples, labels=teacher_parser.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(200):
+        losses = {}
+        student_parser.distill(
+            teacher_parser, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["parser"] < 0.0001
+
+    test_text = "I like securities."
+    doc = student(test_text)
+    assert doc[0].dep_ == "nsubj"
+    assert doc[2].dep_ == "dobj"
+    assert doc[3].dep_ == "punct"
+    assert doc[0].head.i == 1
+    assert doc[2].head.i == 1
+    assert doc[3].head.i == 1
+
+
 # fmt: off
 @pytest.mark.slow
 @pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
 @pytest.mark.parametrize(
     "parser_config",
     [
-        # TransitionBasedParser V1
-        ({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
-        # TransitionBasedParser V2
+        # TODO: re-enable after we have a spacy-legacy release for v4. See
+        # https://github.com/explosion/spacy-legacy/pull/36
+        #({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
         ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
+        ({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
+        ({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
     ],
 )
 # fmt: on
@@ -195,6 +195,53 @@ def test_overfitting_IO():
     assert doc4[3].lemma_ == "egg"


+def test_is_distillable():
+    nlp = English()
+    lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+    assert lemmatizer.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
+    teacher_lemmatizer.min_tree_freq = 1
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    student = English()
+    student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+    student_lemmatizer.min_tree_freq = 1
+    student_lemmatizer.initialize(
+        get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_lemmatizer.distill(
+            teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["trainable_lemmatizer"] < 0.00001
+
+    test_text = "She likes blue eggs"
+    doc = student(test_text)
+    assert doc[0].lemma_ == "she"
+    assert doc[1].lemma_ == "like"
+    assert doc[2].lemma_ == "blue"
+    assert doc[3].lemma_ == "egg"
+
+
 def test_lemmatizer_requires_labels():
     nlp = English()
     nlp.add_pipe("trainable_lemmatizer")
@@ -353,6 +353,39 @@ def test_entity_ruler_overlapping_spans(nlp):
     assert doc.ents[0].label_ == "FOOBAR"


+def test_entity_ruler_fuzzy_pipe(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
+    ruler.add_patterns(patterns)
+    doc = nlp("helloo")
+    assert len(doc.ents) == 1
+    assert doc.ents[0].label_ == "HELLO"
+
+
+def test_entity_ruler_fuzzy(nlp):
+    ruler = nlp.add_pipe("entity_ruler")
+    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
+    ruler.add_patterns(patterns)
+    doc = nlp("helloo")
+    assert len(doc.ents) == 1
+    assert doc.ents[0].label_ == "HELLO"
+
+
+def test_entity_ruler_fuzzy_disabled(nlp):
+    @registry.misc("test_fuzzy_compare_disabled")
+    def make_test_fuzzy_compare_disabled():
+        return lambda x, y, z: False
+
+    ruler = nlp.add_pipe(
+        "entity_ruler",
+        config={"matcher_fuzzy_compare": {"@misc": "test_fuzzy_compare_disabled"}},
+    )
+    patterns = [{"label": "HELLO", "pattern": [{"LOWER": {"FUZZY": "hello"}}]}]
+    ruler.add_patterns(patterns)
+    doc = nlp("helloo")
+    assert len(doc.ents) == 0
+
+
 @pytest.mark.parametrize("n_process", [1, 2])
 def test_entity_ruler_multiprocessing(nlp, n_process):
     if isinstance(get_current_ops, NumpyOps) or n_process < 2:

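The `test_entity_ruler_fuzzy_disabled` case above shows that `matcher_fuzzy_compare` accepts any registered three-argument callable returning a bool. A hedged sketch of swapping in a different comparator follows; the argument names and the registry name are assumptions, since the tests only exercise the arity and the boolean return value:

```python
from spacy.util import registry


@registry.misc("prefix_fuzzy_compare.v1")  # hypothetical registry name
def make_prefix_fuzzy_compare():
    def compare(matched_text, pattern_text, fuzzy):
        # Treat any token that starts with the pattern text as a match,
        # ignoring the requested edit distance entirely.
        return matched_text.lower().startswith(pattern_text.lower())

    return compare
```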
@@ -50,6 +50,12 @@ def test_implicit_label():
     nlp.initialize(get_examples=lambda: train_examples)


+def test_is_distillable():
+    nlp = English()
+    morphologizer = nlp.add_pipe("morphologizer")
+    assert morphologizer.is_distillable
+
+
 def test_no_resize():
     nlp = Language()
     morphologizer = nlp.add_pipe("morphologizer")

@@ -11,6 +11,12 @@ from spacy.pipeline import TrainablePipe
 from spacy.tests.util import make_tempdir


+def test_is_distillable():
+    nlp = English()
+    senter = nlp.add_pipe("senter")
+    assert senter.is_distillable
+
+
 def test_label_types():
     nlp = Language()
     senter = nlp.add_pipe("senter")

@@ -24,7 +24,9 @@ def test_issue4348():
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)

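The `.to_generator()` change recurring in these test hunks reflects that `minibatch` now consumes plain iterables of sizes rather than thinc `Schedule` objects. A minimal sketch of the new call-site pattern, assuming thinc's `compounding` schedule (values illustrative):

```python
from thinc.api import compounding
from spacy import util

# A Schedule is converted to an infinite generator of sizes up front;
# minibatch() then just pulls the next size for each batch.
sizes = compounding(4.0, 32.0, 1.001).to_generator()
batches = list(util.minibatch(range(100), size=sizes))
assert len(batches[0]) == 4  # first size yielded by the schedule
```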
@@ -213,6 +215,52 @@ def test_overfitting_IO():
     assert doc3[0].tag_ != "N"


+def test_is_distillable():
+    nlp = English()
+    tagger = nlp.add_pipe("tagger")
+    assert tagger.is_distillable
+
+
+def test_distill():
+    teacher = English()
+    teacher_tagger = teacher.add_pipe("tagger")
+    train_examples = []
+    for t in TRAIN_DATA:
+        train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
+
+    optimizer = teacher.initialize(get_examples=lambda: train_examples)
+
+    for i in range(50):
+        losses = {}
+        teacher.update(train_examples, sgd=optimizer, losses=losses)
+    assert losses["tagger"] < 0.00001
+
+    student = English()
+    student_tagger = student.add_pipe("tagger")
+    student_tagger.min_tree_freq = 1
+    student_tagger.initialize(
+        get_examples=lambda: train_examples, labels=teacher_tagger.label_data
+    )
+
+    distill_examples = [
+        Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
+    ]
+
+    for i in range(50):
+        losses = {}
+        student_tagger.distill(
+            teacher_tagger, distill_examples, sgd=optimizer, losses=losses
+        )
+    assert losses["tagger"] < 0.00001
+
+    test_text = "I like blue eggs"
+    doc = student(test_text)
+    assert doc[0].tag_ == "N"
+    assert doc[1].tag_ == "V"
+    assert doc[2].tag_ == "J"
+    assert doc[3].tag_ == "N"
+
+
 def test_save_activations():
     # Test if activations are correctly added to Doc when requested.
     nlp = English()

@@ -91,7 +91,9 @@ def test_issue3611():
     optimizer = nlp.initialize()
     for i in range(3):
         losses = {}
-        batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )

         for batch in batches:
             nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)

@@ -128,7 +130,9 @@ def test_issue4030():
     optimizer = nlp.initialize()
     for i in range(3):
         losses = {}
-        batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+        batches = util.minibatch(
+            train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )

         for batch in batches:
             nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)

@@ -565,6 +569,12 @@ def test_initialize_examples(name, get_examples, train_data):
     nlp.initialize(get_examples=get_examples())


+def test_is_distillable():
+    nlp = English()
+    textcat = nlp.add_pipe("textcat")
+    assert not textcat.is_distillable
+
+
 def test_overfitting_IO():
     # Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
     fix_random_seed(0)

@@ -934,3 +944,26 @@ def test_save_activations_multi():
     doc = nlp("This is a test.")
     assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
     assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
+
+
+@pytest.mark.parametrize(
+    "component_name,scorer",
+    [
+        ("textcat", "spacy.textcat_scorer.v1"),
+        ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
+    ],
+)
+def test_textcat_legacy_scorers(component_name, scorer):
+    """Check that legacy scorers are registered and produce the expected score
+    keys."""
+    nlp = English()
+    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+
+    train_examples = []
+    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+    nlp.initialize(get_examples=lambda: train_examples)
+
+    # score the model (it's not actually trained but that doesn't matter)
+    scores = nlp.evaluate(train_examples)
+    assert 0 <= scores["cats_score"] <= 1

@@ -382,7 +382,7 @@ cfg_string_multi = """
 factory = "ner"

 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"

 [components.ner.model.tok2vec]
 @architectures = "spacy.Tok2VecListener.v1"

@@ -122,33 +122,11 @@ width = ${components.tok2vec.model.width}

 parser_config_string_upper = """
 [model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 66
 maxout_pieces = 2
-use_upper = true
-
-[model.tok2vec]
-@architectures = "spacy.HashEmbedCNN.v1"
-pretrained_vectors = null
-width = 333
-depth = 4
-embed_size = 5555
-window_size = 1
-maxout_pieces = 7
-subword_features = false
-"""
-
-
-parser_config_string_no_upper = """
-[model]
-@architectures = "spacy.TransitionBasedParser.v2"
-state_type = "parser"
-extra_state_tokens = false
-hidden_width = 66
-maxout_pieces = 2
-use_upper = false

 [model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"

@@ -179,7 +157,6 @@ def my_parser():
         extra_state_tokens=True,
         hidden_width=65,
         maxout_pieces=5,
-        use_upper=True,
     )
     return parser

@@ -285,15 +262,16 @@ def test_serialize_custom_nlp():
     nlp.to_disk(d)
     nlp2 = spacy.load(d)
     model = nlp2.get_pipe("parser").model
-    model.get_ref("tok2vec")
-    # check that we have the correct settings, not the default ones
-    assert model.get_ref("upper").get_dim("nI") == 65
-    assert model.get_ref("lower").get_dim("nI") == 65
+    assert model.get_ref("tok2vec") is not None
+    assert model.has_param("hidden_W")
+    assert model.has_param("hidden_b")
+    output = model.get_ref("output")
+    assert output is not None
+    assert output.has_param("W")
+    assert output.has_param("b")


-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_serialize_parser(parser_config_string):
     """Create a non-default parser config to check nlp serializes it correctly"""
     nlp = English()

@@ -306,11 +284,13 @@ def test_serialize_parser(parser_config_string):
     nlp.to_disk(d)
     nlp2 = spacy.load(d)
     model = nlp2.get_pipe("parser").model
-    model.get_ref("tok2vec")
-    # check that we have the correct settings, not the default ones
-    if model.attrs["has_upper"]:
-        assert model.get_ref("upper").get_dim("nI") == 66
-        assert model.get_ref("lower").get_dim("nI") == 66
+    assert model.get_ref("tok2vec") is not None
+    assert model.has_param("hidden_W")
+    assert model.has_param("hidden_b")
+    output = model.get_ref("output")
+    assert output is not None
+    assert output.has_param("b")
+    assert output.has_param("W")


 def test_config_nlp_roundtrip():

@@ -457,9 +437,7 @@ def test_config_auto_fill_extra_fields():
     load_model_from_config(nlp.config)


-@pytest.mark.parametrize(
-    "parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
-)
+@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
 def test_config_validate_literal(parser_config_string):
     nlp = English()
     config = Config().from_str(parser_config_string)

@@ -5,10 +5,8 @@ from pathlib import Path
 from spacy.about import __version__ as spacy_version
 from spacy import util
 from spacy import prefer_gpu, require_gpu, require_cpu
-from spacy.ml._precomputable_affine import PrecomputableAffine
-from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
-from spacy.util import dot_to_object, SimpleFrozenList, import_file
-from spacy.util import to_ternary_int
+from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
+from spacy.util import find_available_port
 from thinc.api import Config, Optimizer, ConfigValidationError
 from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
 from thinc.compat import has_cupy_gpu, has_torch_mps_gpu

@@ -81,34 +79,6 @@ def test_util_get_package_path(package):
     assert isinstance(path, Path)


-def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
-    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
-    assert model.get_param("W").shape == (nF, nO, nP, nI)
-    tensor = model.ops.alloc((10, nI))
-    Y, get_dX = model.begin_update(tensor)
-    assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
-    dY = model.ops.alloc((15, nO, nP))
-    ids = model.ops.alloc((15, nF))
-    ids[1, 2] = -1
-    dY[1] = 1
-    assert not model.has_grad("pad")
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 2, 0, 0] == 1.0
-    ids.fill(0.0)
-    dY.fill(0.0)
-    dY[0] = 0
-    ids[1, 2] = 0
-    ids[1, 1] = -1
-    ids[1, 0] = -1
-    dY[1] = 1
-    ids[2, 0] = -1
-    dY[2] = 5
-    d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
-    assert d_pad[0, 0, 0, 0] == 6
-    assert d_pad[0, 1, 0, 0] == 1
-    assert d_pad[0, 2, 0, 0] == 0
-
-
 def test_prefer_gpu():
     current_ops = get_current_ops()
     if has_cupy_gpu:

@@ -8,7 +8,7 @@ from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
 from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
-from spacy.training import offsets_to_biluo_tags
+from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
 from spacy.training.alignment_array import AlignmentArray
 from spacy.training.align import get_alignments
 from spacy.training.converters import json_to_docs

@@ -365,6 +365,19 @@ def test_example_from_dict_some_ner(en_vocab):
     assert ner_tags == ["U-LOC", None, None, None]


+def test_validate_distillation_examples(en_vocab):
+    words = ["a", "b", "c", "d"]
+    spaces = [True, True, False, True]
+    predicted = Doc(en_vocab, words=words, spaces=spaces)
+
+    example = Example.from_dict(predicted, {})
+    validate_distillation_examples([example], "test_validate_distillation_examples")
+
+    example = Example.from_dict(predicted, {"words": words + ["e"]})
+    with pytest.raises(ValueError, match=r"distillation"):
+        validate_distillation_examples([example], "test_validate_distillation_examples")
+
+
 @pytest.mark.filterwarnings("ignore::UserWarning")
 def test_json_to_docs_no_ner(en_vocab):
     data = [

@@ -905,7 +918,9 @@ def _train_tuples(train_data):
     optimizer = nlp.initialize()
     for i in range(5):
         losses = {}
-        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
+        batches = minibatch(
+            train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
+        )
         for batch in batches:
             nlp.update(batch, sgd=optimizer, losses=losses)

@@ -4,7 +4,6 @@ from cymem.cymem cimport Pool

 from .typedefs cimport hash_t
 from .structs cimport LexemeC, SpanC, TokenC
-from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, LexemesOrTokens, _Cached
 from .matcher.phrasematcher cimport PhraseMatcher

@@ -1,5 +1,6 @@
 from .corpus import Corpus, JsonlCorpus  # noqa: F401
 from .example import Example, validate_examples, validate_get_examples  # noqa: F401
+from .example import validate_distillation_examples  # noqa: F401
 from .alignment import Alignment  # noqa: F401
 from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401

@@ -2,12 +2,13 @@ from typing import Union, Iterable, Sequence, TypeVar, List, Callable, Iterator
 from typing import Optional, Any
 from functools import partial
 import itertools
-from thinc.schedules import Schedule, constant as constant_schedule
+from thinc.schedules import Schedule

 from ..util import registry, minibatch


-Sizing = Union[Sequence[int], int, Schedule[int]]
+SizingSchedule = Union[Iterable[int], int, Schedule]
+Sizing = Union[Iterable[int], int]
 ItemT = TypeVar("ItemT")
 BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]

@@ -15,7 +16,7 @@ BatcherT = Callable[[Iterable[ItemT]], Iterable[List[ItemT]]]
 @registry.batchers("spacy.batch_by_padded.v1")
 def configure_minibatch_by_padded_size(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     buffer: int,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None

@@ -25,8 +26,8 @@ def configure_minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.

-    size (int or Sequence[int]): The largest padded size to batch sequences into.
-        Can be a single integer, or a sequence, allowing for variable batch sizes.
+    size (int, Iterable[int] or Schedule): The largest padded size to batch sequences
+        into. Can be a single integer, or a sequence, allowing for variable batch sizes.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result

@@ -40,7 +41,7 @@ def configure_minibatch_by_padded_size(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_padded_size,
-        size=size,
+        size=_schedule_to_sizing(size),
         buffer=buffer,
         discard_oversize=discard_oversize,
         **optionals

@@ -50,14 +51,14 @@ def configure_minibatch_by_padded_size(
 @registry.batchers("spacy.batch_by_words.v1")
 def configure_minibatch_by_words(
     *,
-    size: Sizing,
+    size: SizingSchedule,
     tolerance: float,
     discard_oversize: bool,
     get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that uses the "minibatch by words" strategy.

-    size (int or Sequence[int]): The target number of words per batch.
+    size (int, Iterable[int] or Schedule): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves

@@ -68,7 +69,7 @@ def configure_minibatch_by_words(
     optionals = {"get_length": get_length} if get_length is not None else {}
     return partial(
         minibatch_by_words,
-        size=size,
+        size=_schedule_to_sizing(size),
        tolerance=tolerance,
        discard_oversize=discard_oversize,
        **optionals

@@ -77,15 +78,15 @@ def configure_minibatch_by_words(

 @registry.batchers("spacy.batch_by_sequence.v1")
 def configure_minibatch(
-    size: Sizing, get_length: Optional[Callable[[ItemT], int]] = None
+    size: SizingSchedule, get_length: Optional[Callable[[ItemT], int]] = None
 ) -> BatcherT:
     """Create a batcher that creates batches of the specified size.

-    size (int or Sequence[int]): The target number of items per batch.
+    size (int, Iterable[int] or Schedule): The target number of items per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     """
     optionals = {"get_length": get_length} if get_length is not None else {}
-    return partial(minibatch, size=size, **optionals)
+    return partial(minibatch, size=_schedule_to_sizing(size), **optionals)


 def minibatch_by_padded_size(

@@ -101,7 +102,7 @@ def minibatch_by_padded_size(
     The padded size is defined as the maximum length of sequences within the
     batch multiplied by the number of sequences in the batch.

-    size (int or Sequence[int]): The largest padded size to batch sequences into.
+    size (int or Iterable[int]): The largest padded size to batch sequences into.
     buffer (int): The number of sequences to accumulate before sorting by length.
         A larger buffer will result in more even sizing, but if the buffer is
         very large, the iteration order will be less random, which can result

@@ -112,13 +113,12 @@ def minibatch_by_padded_size(
         The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    for step, outer_batch in enumerate(minibatch(seqs, size=buffer)):
+        size_ = iter(size)
+    for outer_batch in minibatch(seqs, size=buffer):
         outer_batch = list(outer_batch)
-        target_size = size_(step)
+        target_size = next(size_)
         for indices in _batch_by_length(outer_batch, target_size, get_length):
             subbatch = [outer_batch[i] for i in indices]
             padded_size = max(len(seq) for seq in subbatch) * len(subbatch)

@@ -140,7 +140,7 @@ def minibatch_by_words(
     themselves, or be discarded if discard_oversize=True.

     seqs (Iterable[Sequence]): The sequences to minibatch.
-    size (int or Sequence[int]): The target number of words per batch.
+    size (int or Iterable[int]): The target number of words per batch.
         Can be a single integer, or a sequence, allowing for variable batch sizes.
     tolerance (float): What percentage of the size to allow batches to exceed.
     discard_oversize (bool): Whether to discard sequences that by themselves

@@ -149,12 +149,10 @@ def minibatch_by_words(
         item. The `len` function is used by default.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_: Iterator[int] = itertools.repeat(size)
     else:
-        assert isinstance(size, Schedule)
-        size_ = size
-    step = 0
-    target_size = size_(step)
+        size_ = iter(size)
+    target_size = next(size_)
     tol_size = target_size * tolerance
     batch = []
     overflow = []

@@ -179,8 +177,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = overflow
             batch_size = overflow_size

@@ -198,8 +195,7 @@ def minibatch_by_words(
         else:
             if batch:
                 yield batch
-            step += 1
-            target_size = size_(step)
+            target_size = next(size_)
             tol_size = target_size * tolerance
             batch = [seq]
             batch_size = n_words

@@ -236,3 +232,9 @@ def _batch_by_length(
     batches = [list(sorted(batch)) for batch in batches]
     batches.reverse()
     return batches
+
+
+def _schedule_to_sizing(size: SizingSchedule) -> Sizing:
+    if isinstance(size, Schedule):
+        return size.to_generator()
+    return size

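With `_schedule_to_sizing` in place, the registered batchers accept three size forms: a fixed int, any iterable of ints, or a thinc `Schedule`. A hedged usage sketch (names and values illustrative, assuming thinc's `compounding` schedule):

```python
import itertools
from thinc.api import compounding
from spacy.training.batchers import configure_minibatch

batcher_sched = configure_minibatch(size=compounding(4.0, 32.0, 1.001))  # Schedule
batcher_fixed = configure_minibatch(size=8)                              # int
batcher_cycle = configure_minibatch(size=itertools.cycle([16, 32]))      # Iterable[int]

first = next(iter(batcher_sched(range(100))))
assert len(first) == 4  # the schedule's first value
```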
@@ -1,5 +1,4 @@
 from collections.abc import Iterable as IterableInstance
-import warnings
 import numpy
 from murmurhash.mrmr cimport hash64

@@ -47,6 +46,13 @@ def validate_examples(examples, method):
         raise TypeError(err)


+def validate_distillation_examples(examples, method):
+    validate_examples(examples, method)
+    for eg in examples:
+        if [token.text for token in eg.reference] != [token.text for token in eg.predicted]:
+            raise ValueError(Errors.E4003)
+
+
 def validate_get_examples(get_examples, method):
     """Check that a generator of a batch of examples received during processing is valid:
     the callable produces a non-empty list of Example objects.

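A quick sketch of what the new validator accepts and rejects, mirroring the test added in the training tests above; the pipe name passed as `method` is only used in the error message:

```python
from spacy.tokens import Doc
from spacy.training import Example, validate_distillation_examples
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["a", "b", "c"])

ok = Example.from_dict(doc, {})
validate_distillation_examples([ok], "distill")  # same tokens: passes

bad = Example.from_dict(doc, {"words": ["a", "b", "c", "d"]})
# Reference and predicted tokens differ, so this raises ValueError (E4003):
# validate_distillation_examples([bad], "distill")
```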
@@ -100,7 +100,7 @@ def train(
         stdout.write(
             msg.info(f"Set annotations on update for: {annotating_components}") + "\n"
         )
-    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
+    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate(step=0)}") + "\n")
     with nlp.select_pipes(disable=frozen_components):
         log_step, finalize_logger = train_logger(nlp, stdout, stderr)
         try:

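The call-style change reflects thinc v9, where optimizer hyperparameters such as the learning rate are step-dependent schedules rather than plain floats. A hedged sketch of reading the value:

```python
from thinc.api import Adam

optimizer = Adam(learn_rate=0.001)
# Reading the hyperparameter now requires a training step; at step 0 this
# returns the configured initial value (assuming a constant schedule).
initial_lr = optimizer.learn_rate(step=0)
```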
@@ -1579,12 +1579,12 @@ def minibatch(items, size):
     so that batch-size can vary on each step.
     """
     if isinstance(size, int):
-        size_ = constant_schedule(size)
+        size_ = itertools.repeat(size)
     else:
-        size_ = size
+        size_ = iter(size)
     items = iter(items)
-    for step in itertools.count():
-        batch_size = size_(step)
+    while True:
+        batch_size = next(size_)
         batch = list(itertools.islice(items, int(batch_size)))
         if len(batch) == 0:
             break

@@ -553,18 +553,17 @@ for a Tok2Vec layer.

 ## Parser & NER architectures {#parser}

-### spacy.TransitionBasedParser.v2 {#TransitionBasedParser source="spacy/ml/models/parser.py"}
+### spacy.TransitionBasedParser.v3 {id="TransitionBasedParser",source="spacy/ml/models/parser.py"}

 > #### Example Config
 >
 > ```ini
 > [model]
-> @architectures = "spacy.TransitionBasedParser.v2"
+> @architectures = "spacy.TransitionBasedParser.v3"
 > state_type = "ner"
 > extra_state_tokens = false
 > hidden_width = 64
 > maxout_pieces = 2
-> use_upper = true
 >
 > [model.tok2vec]
 > @architectures = "spacy.HashEmbedCNN.v2"

@@ -595,13 +594,12 @@ consists of either two or three subnetworks:
   as action scores directly.

 | Name                 | Description |
-| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
 | `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
 | `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
 | `hidden_width`       | The width of the hidden layer. ~~int~~ |
-| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
-| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. ~~int~~ |
 | `nO`                 | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
 | **CREATES**          | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |

@@ -610,7 +608,7 @@ consists of either two or three subnetworks:
 [TransitionBasedParser.v1](/api/legacy#TransitionBasedParser_v1) had the exact
 same signature, but the `use_upper` argument was `True` by default.

 </Accordion>

 ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}

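For configs written against the v2 architecture, migration is mechanical: bump the architecture name and delete `use_upper`. A hedged sketch using `Config.from_str` (as in the serialization tests above); the setting values are illustrative:

```python
from thinc.api import Config

# The same model block as before, minus `use_upper`, with the
# architecture name bumped to v3.
cfg = Config().from_str("""
[model]
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
""")
```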
@@ -2,7 +2,7 @@
 title: AttributeRuler
 tag: class
 source: spacy/pipeline/attribute_ruler.py
-new: 3
+version: 3
 teaser: 'Pipeline component for rule-based token attribute assignment'
 api_string_name: attribute_ruler
 api_trainable: false

@@ -361,7 +361,7 @@ Module spacy.language
 File /path/to/spacy/language.py (line 64)
 ℹ [components.ner.model]
 Registry                 @architectures
-Name                     spacy.TransitionBasedParser.v1
+Name                     spacy.TransitionBasedParser.v3
 Module                   spacy.ml.models.parser
 File                     /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.ner.model.tok2vec]

@@ -371,7 +371,7 @@ Module spacy.ml.models.tok2vec
 File /path/to/spacy/ml/models/tok2vec.py (line 16)
 ℹ [components.parser.model]
 Registry                 @architectures
-Name                     spacy.TransitionBasedParser.v1
+Name                     spacy.TransitionBasedParser.v3
 Module                   spacy.ml.models.parser
 File                     /path/to/spacy/ml/models/parser.py (line 11)
 ℹ [components.parser.model.tok2vec]

@@ -696,7 +696,7 @@ scorer = {"@scorers":"spacy.ner_scorer.v1"}
 update_with_oracle_cut_size = 100

 [components.ner.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "ner"
 extra_state_tokens = false
 - hidden_width = 64

@@ -719,7 +719,7 @@ scorer = {"@scorers":"spacy.parser_scorer.v1"}
 update_with_oracle_cut_size = 100

 [components.parser.model]
-@architectures = "spacy.TransitionBasedParser.v2"
+@architectures = "spacy.TransitionBasedParser.v3"
 state_type = "parser"
 extra_state_tokens = false
 hidden_width = 128

@@ -131,7 +131,40 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |

-## DependencyParser.pipe {#pipe tag="method"}
+## DependencyParser.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need to have gold
+annotations, the teacher could add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("parser")
+> student_pipe = student.add_pipe("parser")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop`         | Dropout rate. ~~float~~ |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## DependencyParser.pipe {id="pipe",tag="method"}

 Apply the pipe to a stream of documents. This usually happens under the hood
 when the `nlp` object is called on a text and all pipeline components are

@@ -268,7 +301,28 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

-## DependencyParser.create_optimizer {#create_optimizer tag="method"}
+## DependencyParser.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_parser = teacher.get_pipe("parser")
+> student_parser = student.add_pipe("parser")
+> student_scores = student_parser.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_parser.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_parser.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+## DependencyParser.create_optimizer {id="create_optimizer",tag="method"}

 Create an [`Optimizer`](https://thinc.ai/docs/api-optimizers) for the pipeline
 component.

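Taken together, `distill` and `get_teacher_student_loss` support a small teacher–student loop. The sketch below is adapted from the new tagger and lemmatizer tests earlier in this diff; the pipe name, training data, and iteration counts are illustrative only:

```python
from spacy.lang.en import English
from spacy.training import Example

TRAIN_DATA = [("I like green eggs", {"tags": ["N", "V", "J", "N"]})]

# Train a teacher pipe first.
teacher = English()
teacher_tagger = teacher.add_pipe("tagger")
examples = [Example.from_dict(teacher.make_doc(t), ann) for t, ann in TRAIN_DATA]
optimizer = teacher.initialize(get_examples=lambda: examples)
for _ in range(50):
    teacher.update(examples, sgd=optimizer, losses={})

# Distill into a student; the student shares the teacher's label set.
student = English()
student_tagger = student.add_pipe("tagger")
student_tagger.initialize(
    get_examples=lambda: examples, labels=teacher_tagger.label_data
)
# Distillation examples need matching tokens but no gold annotations.
distill_examples = [Example.from_dict(teacher.make_doc(t), {}) for t, _ in TRAIN_DATA]
for _ in range(50):
    student_tagger.distill(teacher_tagger, distill_examples, sgd=optimizer, losses={})
```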
@@ -115,7 +115,40 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |

-## EditTreeLemmatizer.pipe {#pipe tag="method"}
+## EditTreeLemmatizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need to have gold
+annotations, the teacher could add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("trainable_lemmatizer")
+> student_pipe = student.add_pipe("trainable_lemmatizer")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop`         | Dropout rate. ~~float~~ |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## EditTreeLemmatizer.pipe {id="pipe",tag="method"}

 Apply the pipe to a stream of documents. This usually happens under the hood
 when the `nlp` object is called on a text and all pipeline components are

@@ -269,7 +302,28 @@ Create an optimizer for the pipeline component.
 | ----------- | ---------------------------- |
 | **RETURNS** | The optimizer. ~~Optimizer~~ |

-## EditTreeLemmatizer.use_params {#use_params tag="method, contextmanager"}
+## EditTreeLemmatizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_lemmatizer = teacher.get_pipe("trainable_lemmatizer")
+> student_lemmatizer = student.add_pipe("trainable_lemmatizer")
+> student_scores = student_lemmatizer.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_lemmatizer.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_lemmatizer.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+## EditTreeLemmatizer.use_params {id="use_params",tag="method, contextmanager"}

 Modify the pipe's model, to use the given parameter values. At the end of the
 context, the original parameters are restored.

@@ -127,7 +127,40 @@ and all pipeline components are applied to the `Doc` in order. Both
 | `doc`       | The document to process. ~~Doc~~ |
 | **RETURNS** | The processed document. ~~Doc~~  |

-## EntityRecognizer.pipe {#pipe tag="method"}
+## EntityRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+Train a pipe (the student) on the predictions of another pipe (the teacher). The
+student is typically trained on the probability distribution of the teacher, but
+details may differ per pipe. The goal of distillation is to transfer knowledge
+from the teacher to the student.
+
+The distillation is performed on ~~Example~~ objects. The `Example.reference`
+and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+same orthography. Even though the reference does not need to have gold
+annotations, the teacher could add its own annotations when necessary.
+
+This feature is experimental.
+
+> #### Example
+>
+> ```python
+> teacher_pipe = teacher.add_pipe("ner")
+> student_pipe = student.add_pipe("ner")
+> optimizer = nlp.resume_training()
+> losses = student.distill(teacher_pipe, examples, sgd=optimizer)
+> ```
+
+| Name           | Description |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+| `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+| _keyword-only_ | |
+| `drop`         | Dropout rate. ~~float~~ |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+| `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+## EntityRecognizer.pipe {id="pipe",tag="method"}

 Apply the pipe to a stream of documents. This usually happens under the hood
 when the `nlp` object is called on a text and all pipeline components are

@@ -264,7 +297,28 @@ predicted scores.
 | `scores`    | Scores representing the model's predictions. ~~StateClass~~                 |
 | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

-## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
+## EntityRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+Calculate the loss and its gradient for the batch of student scores relative to
+the teacher scores.
+
+> #### Example
+>
+> ```python
+> teacher_ner = teacher.get_pipe("ner")
+> student_ner = student.add_pipe("ner")
+> student_scores = student_ner.predict([eg.predicted for eg in examples])
+> teacher_scores = teacher_ner.predict([eg.predicted for eg in examples])
+> loss, d_loss = student_ner.get_teacher_student_loss(teacher_scores, student_scores)
+> ```
+
+| Name             | Description                                                                 |
+| ---------------- | --------------------------------------------------------------------------- |
+| `teacher_scores` | Scores representing the teacher model's predictions.                        |
+| `student_scores` | Scores representing the student model's predictions.                        |
+| **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+## EntityRecognizer.create_optimizer {id="create_optimizer",tag="method"}

 Create an optimizer for the pipeline component.

125
website/docs/api/entityruler.mdx
Normal file
125
website/docs/api/entityruler.mdx
Normal file
|
@ -0,0 +1,125 @@
|
||||||
|
---
|
||||||
|
title: EntityRuler
|
||||||
|
version: 2.1
|
||||||
|
teaser: 'Pipeline component for rule-based named entity recognition'
|
||||||
|
api_string_name: entity_ruler
|
||||||
|
api_trainable: false
|
||||||
|
---
|
||||||
|
|
||||||
|
<Infobox title="New in v4" variant="warning">
|
||||||
|
|
||||||
|
As of spaCy v4, there is no separate `EntityRuler` class. The entity ruler is
|
||||||
|
implemented as a special case of the `SpanRuler` component.
|
||||||
|
|
||||||
|
See the [migration guide](#migrating) below for differences between the v3
|
||||||
|
`EntityRuler` and v4 `SpanRuler` implementations of the `entity_ruler`
|
||||||
|
component.
|
||||||
|
|
||||||
|
See the [`SpanRuler`](/api/spanruler) API docs for the full API.
|
||||||
|
|
||||||
|
</Infobox>
|
||||||
|
|
||||||
|
The entity ruler lets you add spans to the [`Doc.ents`](/api/doc#ents) using
|
||||||
|
token-based rules or exact phrase matches. It can be combined with the
|
||||||
|
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
|
||||||
|
used on its own to implement a purely rule-based entity recognition system. For
|
||||||
|
usage examples, see the docs on
|
||||||
|
[rule-based entity recognition](/usage/rule-based-matching#entityruler).

## Assigned Attributes {id="assigned-attributes"}

This component assigns predictions basically the same way as the
[`EntityRecognizer`](/api/entityrecognizer).

Predictions can be accessed under `Doc.ents` as a tuple. Each label will also be
reflected in each underlying token, where it is saved in the `Token.ent_type`
and `Token.ent_iob` fields. Note that by definition each token can only have one
label.

When setting `Doc.ents` to create training data, all the spans must be valid and
non-overlapping, or an error will be thrown.

| Location          | Value                                                              |
| ----------------- | ------------------------------------------------------------------ |
| `Doc.ents`        | The annotated spans. ~~Tuple[Span]~~                               |
| `Token.ent_iob`   | An enum encoding of the IOB part of the named entity tag. ~~int~~  |
| `Token.ent_iob_`  | The IOB part of the named entity tag. ~~str~~                      |
| `Token.ent_type`  | The label part of the named entity tag (hash). ~~int~~             |
| `Token.ent_type_` | The label part of the named entity tag. ~~str~~                    |

## Config and implementation {id="config"}

The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config).

> #### Example
>
> ```python
> config = {
>    "phrase_matcher_attr": None,
>    "validate": True,
>    "overwrite_ents": False,
>    "ent_id_sep": "||",
> }
> nlp.add_pipe("entity_ruler", config=config)
> ```

| Setting                                              | Description |
| ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr`                                | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `matcher_fuzzy_compare` <Tag variant="new">3.5</Tag> | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ |
| `validate`                                           | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents`                                     | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep`                                         | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ |
| `scorer`                                             | The scoring method. Defaults to [`spacy.scorer.get_ner_prf`](/api/scorer#get_ner_prf). ~~Optional[Callable]~~ |

## Migrating from v3 {id="migrating"}

### Loading patterns

Unlike the v3 `EntityRuler`, the `SpanRuler` cannot load patterns on
initialization with `SpanRuler(patterns=patterns)` or directly from a JSONL file
path with `SpanRuler.from_disk(jsonl_path)`. Patterns should be loaded from the
JSONL file separately and then added through
[`SpanRuler.initialize`](/api/spanruler#initialize) or
[`SpanRuler.add_patterns`](/api/spanruler#add_patterns).

```diff
ruler = nlp.get_pipe("entity_ruler")
- ruler.from_disk("patterns.jsonl")
+ import srsly
+ patterns = srsly.read_jsonl("patterns.jsonl")
+ ruler.add_patterns(patterns)
```

### Saving patterns

`SpanRuler.to_disk` always saves the full component data to a directory and does
not include an option to save the patterns to a single JSONL file.

```diff
ruler = nlp.get_pipe("entity_ruler")
- ruler.to_disk("patterns.jsonl")
+ import srsly
+ srsly.write_jsonl("patterns.jsonl", ruler.patterns)
```

### Accessing token and phrase patterns

The separate token patterns and phrase patterns are no longer accessible under
`ruler.token_patterns` or `ruler.phrase_patterns`. You can access the combined
patterns in their original format using the property
[`SpanRuler.patterns`](/api/spanruler#patterns).

### Removing patterns by ID

[`SpanRuler.remove`](/api/spanruler#remove) removes by label rather than ID. To
remove by ID, use [`SpanRuler.remove_by_id`](/api/spanruler#remove_by_id):

```diff
ruler = nlp.get_pipe("entity_ruler")
- ruler.remove("id")
+ ruler.remove_by_id("id")
```
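
As a concrete sketch of the new workflow (an editorial illustration, not part
of the diff): patterns can carry an `id` key, which is what `remove_by_id` keys
on, and `ruler.patterns` exposes the combined patterns. The exact round-tripped
format of `ruler.patterns` may vary slightly.

```python
ruler = nlp.get_pipe("entity_ruler")
ruler.add_patterns([{"label": "ORG", "pattern": "Apple", "id": "apple"}])
print(ruler.patterns)  # the combined patterns in their original format
ruler.remove_by_id("apple")  # keyed on the pattern "id", not the label
```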

@@ -225,7 +225,7 @@ the others, but may not be as accurate, especially if texts are short.

### spacy.TransitionBasedParser.v1 {#TransitionBasedParser_v1}

Identical to
- [`spacy.TransitionBasedParser.v2`](/api/architectures#TransitionBasedParser)
+ [`spacy.TransitionBasedParser.v3`](/api/architectures#TransitionBasedParser)
except that `use_upper` was set to `True` by default.

## Layers {#layers}
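
If a project needs to keep the legacy behavior, the v1 architecture can still
be pinned explicitly in the training config via `spacy-legacy`. A hypothetical
sketch (the parameter values shown are illustrative only):

```ini
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
```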
@@ -121,7 +121,40 @@ delegate to the [`predict`](/api/morphologizer#predict) and

| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

- ## Morphologizer.pipe {#pipe tag="method"}
+ ## Morphologizer.distill {id="distill", tag="method,experimental", version="4"}
+
+ Train a pipe (the student) on the predictions of another pipe (the teacher). The
+ student is typically trained on the probability distribution of the teacher, but
+ details may differ per pipe. The goal of distillation is to transfer knowledge
+ from the teacher to the student.
+
+ The distillation is performed on ~~Example~~ objects. The `Example.reference`
+ and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+ same orthography. Even though the reference does not need to have gold
+ annotations, the teacher can add its own annotations when necessary.
+
+ This feature is experimental.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_pipe = teacher.get_pipe("morphologizer")
+ > student_pipe = student.add_pipe("morphologizer")
+ > optimizer = nlp.resume_training()
+ > losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
+ > ```
+
+ | Name           | Description |
+ | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+ | `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+ | _keyword-only_ |  |
+ | `drop`         | Dropout rate. ~~float~~ |
+ | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+ | `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+ | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+ ## Morphologizer.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@@ -259,7 +292,28 @@ predicted scores.

| `scores`    | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

- ## Morphologizer.create_optimizer {#create_optimizer tag="method"}
+ ## Morphologizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+ Calculate the loss and its gradient for the batch of student scores relative to
+ the teacher scores.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_morphologizer = teacher.get_pipe("morphologizer")
+ > student_morphologizer = student.add_pipe("morphologizer")
+ > student_scores = student_morphologizer.predict([eg.predicted for eg in examples])
+ > teacher_scores = teacher_morphologizer.predict([eg.predicted for eg in examples])
+ > loss, d_loss = student_morphologizer.get_teacher_student_loss(teacher_scores, student_scores)
+ > ```
+
+ | Name             | Description |
+ | ---------------- | --------------------------------------------------------------------------- |
+ | `teacher_scores` | Scores representing the teacher model's predictions. |
+ | `student_scores` | Scores representing the student model's predictions. |
+ | **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+ ## Morphologizer.create_optimizer {id="create_optimizer",tag="method"}

Create an optimizer for the pipeline component.
@@ -234,7 +234,40 @@ predictions and gold-standard annotations, and update the component's model.

| `losses`    | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |

- ## TrainablePipe.rehearse {#rehearse tag="method,experimental" new="3"}
+ ## TrainablePipe.distill {id="distill", tag="method,experimental", version="4"}
+
+ Train a pipe (the student) on the predictions of another pipe (the teacher). The
+ student is typically trained on the probability distribution of the teacher, but
+ details may differ per pipe. The goal of distillation is to transfer knowledge
+ from the teacher to the student.
+
+ The distillation is performed on ~~Example~~ objects. The `Example.reference`
+ and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+ same orthography. Even though the reference does not need to have gold
+ annotations, the teacher can add its own annotations when necessary.
+
+ This feature is experimental.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_pipe = teacher.get_pipe("your_custom_pipe")
+ > student_pipe = student.add_pipe("your_custom_pipe")
+ > optimizer = nlp.resume_training()
+ > losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
+ > ```
+
+ | Name           | Description |
+ | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+ | `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+ | _keyword-only_ |  |
+ | `drop`         | Dropout rate. ~~float~~ |
+ | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+ | `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+ | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+ ## TrainablePipe.rehearse {id="rehearse",tag="method,experimental",version="3"}

Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
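
As an editorial sketch of how distillation examples are typically constructed
(names such as `teacher`, `student`, `student_pipe`, `teacher_pipe` and
`optimizer` are assumed from the example above; it also assumes both pipelines
tokenize identically): the reference side does not need gold annotations, so
examples can be built from raw text.

```python
from spacy.training import Example

raw_texts = ["It's a nice day outside.", "The weather is terrible."]
# Example(predicted, reference): the student updates on the predicted doc,
# the teacher annotates the reference doc.
examples = [Example(student.make_doc(t), teacher.make_doc(t)) for t in raw_texts]
losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
```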
@@ -281,7 +314,35 @@ This method needs to be overwritten with your own custom `get_loss` method.

| `scores`    | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

- ## TrainablePipe.score {#score tag="method" new="3"}
+ ## TrainablePipe.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+ Calculate the loss and its gradient for the batch of student scores relative to
+ the teacher scores.
+
+ <Infobox variant="danger">
+
+ This method needs to be overwritten with your own custom
+ `get_teacher_student_loss` method.
+
+ </Infobox>
+
+ > #### Example
+ >
+ > ```python
+ > teacher_pipe = teacher.get_pipe("your_custom_pipe")
+ > student_pipe = student.add_pipe("your_custom_pipe")
+ > student_scores = student_pipe.predict([eg.predicted for eg in examples])
+ > teacher_scores = teacher_pipe.predict([eg.predicted for eg in examples])
+ > loss, d_loss = student_pipe.get_teacher_student_loss(teacher_scores, student_scores)
+ > ```
+
+ | Name             | Description |
+ | ---------------- | --------------------------------------------------------------------------- |
+ | `teacher_scores` | Scores representing the teacher model's predictions. |
+ | `student_scores` | Scores representing the student model's predictions. |
+ | **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+ ## TrainablePipe.score {id="score",tag="method",version="3"}

Score a batch of examples.
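
Since the base class leaves this method unimplemented, a custom pipe has to
provide its own. A minimal sketch, assuming the scores are per-doc numpy arrays
and using a simple squared-error criterion (the class name and loss choice are
illustrative, not spaCy's built-in implementation):

```python
from spacy.pipeline import TrainablePipe

class MyCustomPipe(TrainablePipe):
    def get_teacher_student_loss(self, teacher_scores, student_scores):
        # Gradient of a squared-error loss with respect to the student scores.
        d_scores = [s - t for t, s in zip(teacher_scores, student_scores)]
        loss = sum(float((d ** 2).sum()) for d in d_scores)
        return loss, d_scores
```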
@@ -106,7 +106,40 @@ and all pipeline components are applied to the `Doc` in order. Both

| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

- ## SentenceRecognizer.pipe {#pipe tag="method"}
+ ## SentenceRecognizer.distill {id="distill", tag="method,experimental", version="4"}
+
+ Train a pipe (the student) on the predictions of another pipe (the teacher). The
+ student is typically trained on the probability distribution of the teacher, but
+ details may differ per pipe. The goal of distillation is to transfer knowledge
+ from the teacher to the student.
+
+ The distillation is performed on ~~Example~~ objects. The `Example.reference`
+ and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+ same orthography. Even though the reference does not need to have gold
+ annotations, the teacher can add its own annotations when necessary.
+
+ This feature is experimental.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_pipe = teacher.get_pipe("senter")
+ > student_pipe = student.add_pipe("senter")
+ > optimizer = nlp.resume_training()
+ > losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
+ > ```
+
+ | Name           | Description |
+ | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+ | `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+ | _keyword-only_ |  |
+ | `drop`         | Dropout rate. ~~float~~ |
+ | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+ | `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+ | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+ ## SentenceRecognizer.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@@ -254,7 +287,28 @@ predicted scores.

| `scores`    | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

- ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
+ ## SentenceRecognizer.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+ Calculate the loss and its gradient for the batch of student scores relative to
+ the teacher scores.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_senter = teacher.get_pipe("senter")
+ > student_senter = student.add_pipe("senter")
+ > student_scores = student_senter.predict([eg.predicted for eg in examples])
+ > teacher_scores = teacher_senter.predict([eg.predicted for eg in examples])
+ > loss, d_loss = student_senter.get_teacher_student_loss(teacher_scores, student_scores)
+ > ```
+
+ | Name             | Description |
+ | ---------------- | --------------------------------------------------------------------------- |
+ | `teacher_scores` | Scores representing the teacher model's predictions. |
+ | `student_scores` | Scores representing the student model's predictions. |
+ | **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+ ## SentenceRecognizer.create_optimizer {id="create_optimizer",tag="method"}

Create an optimizer for the pipeline component.
@@ -24,7 +24,7 @@ component.

</Infobox>

- ## Assigned Attributes {#assigned-attributes}
+ ## Assigned Attributes {id="assigned-attributes"}

Matches will be saved to `Doc.spans[spans_key]` as a
[`SpanGroup`](/api/spangroup) and/or to `Doc.ents`, where the annotation is
@@ -90,7 +90,7 @@ Iterate over the stored strings in insertion order.

| ----------- | ------------------------------ |
| **RETURNS** | A string in the store. ~~str~~ |

- ## StringStore.items {#iter tag="method" new="4"}
+ ## StringStore.items {id="items", tag="method", version="4"}

Iterate over the stored string-hash pairs in insertion order.

@@ -106,7 +106,7 @@ Iterate over the stored string-hash pairs in insertion order.

| ----------- | ------------------------------------------------------ |
| **RETURNS** | A list of string-hash pairs. ~~List[Tuple[str, int]]~~ |

- ## StringStore.keys {#iter tag="method" new="4"}
+ ## StringStore.keys {id="keys", tag="method", version="4"}

Iterate over the stored strings in insertion order.

@@ -122,7 +122,7 @@ Iterate over the stored strings in insertion order.

| ----------- | -------------------------------- |
| **RETURNS** | A list of strings. ~~List[str]~~ |

- ## StringStore.values {#iter tag="method" new="4"}
+ ## StringStore.values {id="values", tag="method", version="4"}

Iterate over the stored string hashes in insertion order.

@@ -138,7 +138,7 @@ Iterate over the stored string hashes in insertion order.

| ----------- | -------------------------------------- |
| **RETURNS** | A list of string hashes. ~~List[int]~~ |

- ## StringStore.add {#add tag="method"}
+ ## StringStore.add {id="add", tag="method"}

Add a string to the `StringStore`.

@@ -158,7 +158,7 @@ Add a string to the `StringStore`.

| `string`    | The string to add. ~~str~~ |
| **RETURNS** | The string's hash value. ~~int~~ |

- ## StringStore.to_disk {#to_disk tag="method"}
+ ## StringStore.to_disk {id="to_disk",tag="method"}

Save the current state to a directory.

@@ -172,7 +172,7 @@ Save the current state to a directory.

| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |

- ## StringStore.from_disk {#from_disk tag="method" new="2"}
+ ## StringStore.from_disk {id="from_disk",tag="method"}

Loads state from a directory. Modifies the object in place and returns it.
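
A small usage sketch of the iteration methods documented above (an editorial
addition; it assumes the list return types stated in the tables):

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
orange_hash = stringstore.add("orange")  # already stored; returns its hash
assert stringstore.keys() == ["apple", "orange"]
# keys/values/items all iterate in insertion order, so they line up pairwise.
assert stringstore.items() == list(zip(stringstore.keys(), stringstore.values()))
```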
@@ -105,7 +105,40 @@ and all pipeline components are applied to the `Doc` in order. Both

| `doc`       | The document to process. ~~Doc~~ |
| **RETURNS** | The processed document. ~~Doc~~  |

- ## Tagger.pipe {#pipe tag="method"}
+ ## Tagger.distill {id="distill", tag="method,experimental", version="4"}
+
+ Train a pipe (the student) on the predictions of another pipe (the teacher). The
+ student is typically trained on the probability distribution of the teacher, but
+ details may differ per pipe. The goal of distillation is to transfer knowledge
+ from the teacher to the student.
+
+ The distillation is performed on ~~Example~~ objects. The `Example.reference`
+ and `Example.predicted` ~~Doc~~s must have the same number of tokens and the
+ same orthography. Even though the reference does not need to have gold
+ annotations, the teacher can add its own annotations when necessary.
+
+ This feature is experimental.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_pipe = teacher.get_pipe("tagger")
+ > student_pipe = student.add_pipe("tagger")
+ > optimizer = nlp.resume_training()
+ > losses = student_pipe.distill(teacher_pipe, examples, sgd=optimizer)
+ > ```
+
+ | Name           | Description |
+ | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `teacher_pipe` | The teacher pipe to learn from. ~~Optional[TrainablePipe]~~ |
+ | `examples`     | Distillation examples. The reference and predicted docs must have the same number of tokens and the same orthography. ~~Iterable[Example]~~ |
+ | _keyword-only_ |  |
+ | `drop`         | Dropout rate. ~~float~~ |
+ | `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
+ | `losses`       | Optional record of the loss during distillation. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
+ | **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~ |
+
+ ## Tagger.pipe {id="pipe",tag="method"}

Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
@@ -265,7 +298,28 @@ predicted scores.

| `scores`    | Scores representing the model's predictions. |
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |

- ## Tagger.create_optimizer {#create_optimizer tag="method"}
+ ## Tagger.get_teacher_student_loss {id="get_teacher_student_loss", tag="method", version="4"}
+
+ Calculate the loss and its gradient for the batch of student scores relative to
+ the teacher scores.
+
+ > #### Example
+ >
+ > ```python
+ > teacher_tagger = teacher.get_pipe("tagger")
+ > student_tagger = student.add_pipe("tagger")
+ > student_scores = student_tagger.predict([eg.predicted for eg in examples])
+ > teacher_scores = teacher_tagger.predict([eg.predicted for eg in examples])
+ > loss, d_loss = student_tagger.get_teacher_student_loss(teacher_scores, student_scores)
+ > ```
+
+ | Name             | Description |
+ | ---------------- | --------------------------------------------------------------------------- |
+ | `teacher_scores` | Scores representing the teacher model's predictions. |
+ | `student_scores` | Scores representing the student model's predictions. |
+ | **RETURNS**      | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
+
+ ## Tagger.create_optimizer {id="create_optimizer",tag="method"}

Create an optimizer for the pipeline component.
@@ -730,9 +730,9 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument

> ```

| Name | Description |
- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `seqs`             | The sequences to minibatch. ~~Iterable[Any]~~ |
- | `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
+ | `size`             | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
| `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~ |
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
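
For orientation, a sketch of how this batcher is typically wired into the
training config (an editorial addition; the values shown are illustrative, not
prescribed defaults):

```ini
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
tolerance = 0.2
discard_oversize = false
get_length = null
```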
@@ -752,8 +752,8 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument

Create a batcher that creates batches of the specified size.

| Name | Description |
- | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
- | `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
+ | `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
| **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~ |
@@ -777,7 +777,7 @@ sequences in the batch.

| Name | Description |
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
- | `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
+ | `size`             | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Iterable[int], Schedule]~~ |
| `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
| `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
@@ -899,7 +899,8 @@ backprop passes.

Recursively wrap both the models and methods of each pipe using
[NVTX](https://nvidia.github.io/NVTX/) range markers. By default, the following
methods are wrapped: `pipe`, `predict`, `set_annotations`, `update`, `rehearse`,
- `get_loss`, `initialize`, `begin_update`, `finish_update`, `update`.
+ `get_loss`, `get_teacher_student_loss`, `initialize`, `begin_update`,
+ `finish_update`, `update`.

| Name | Description |
| --------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
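
For reference, a config sketch showing one way this callback is typically
attached (an editorial addition; it assumes the callback is registered under
the name shown):

```ini
[nlp]
after_pipeline_creation = {"@callbacks": "spacy.models_and_pipes_with_nvtx_range.v1"}
```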
@@ -1330,7 +1331,7 @@ vary on each step.

| Name       | Description |
| ---------- | ------------------------------------------------ |
| `items`    | The items to batch up. ~~Iterable[Any]~~ |
- | `size`     | The batch size(s). ~~Union[int, Sequence[int]]~~ |
+ | `size`     | The batch size(s). ~~Union[int, Iterable[int]]~~ |
| **YIELDS** | The batches. |

### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
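
A short sketch of passing a schedule as `size` (an editorial addition; it
assumes Thinc's `compounding` schedule and a `train_examples` iterable):

```python
from spacy.util import minibatch
from thinc.api import compounding

# Batch size starts at 4 and compounds toward 32, multiplying by 1.001
# after each batch, so early updates use small batches.
for batch in minibatch(train_examples, size=compounding(4.0, 32.0, 1.001)):
    ...
```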
@@ -141,7 +141,7 @@ factory = "tok2vec"

factory = "ner"

[components.ner.model]
- @architectures = "spacy.TransitionBasedParser.v1"
+ @architectures = "spacy.TransitionBasedParser.v3"

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"

@@ -158,7 +158,7 @@ same. This makes them fully independent and doesn't require an upstream

factory = "ner"

[components.ner.model]
- @architectures = "spacy.TransitionBasedParser.v1"
+ @architectures = "spacy.TransitionBasedParser.v3"

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

@@ -482,7 +482,7 @@ sneakily delegates to the `Transformer` pipeline component.

factory = "ner"

[nlp.pipeline.ner.model]
- @architectures = "spacy.TransitionBasedParser.v1"
+ @architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 128
@@ -342,7 +342,7 @@ The easiest way to download a trained pipeline is via spaCy's

[`download`](/api/cli#download) command. It takes care of finding the
best-matching package compatible with your spaCy installation.

- ```cli
+ ```bash
# Download best-matching version of a package for your spaCy installation
$ python -m spacy download en_core_web_sm
@@ -1370,10 +1370,12 @@ customize how the model is updated from examples, how it's initialized, how the

loss is calculated and to add evaluation scores to the training output.

| Name | Description |
- | ------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | ---------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| [`update`](/api/pipe#update) | Learn from a batch of [`Example`](/api/example) objects containing the predictions and gold-standard annotations, and update the component's model. |
+ | [`distill`](/api/pipe#distill) | Learn from a teacher pipeline using a batch of [`Doc`](/api/doc) objects and update the component's model. |
| [`initialize`](/api/pipe#initialize) | Initialize the model. Typically calls into [`Model.initialize`](https://thinc.ai/docs/api-model#initialize) and can be passed custom arguments via the [`[initialize]`](/api/data-formats#config-initialize) config block that are only loaded during training or when you call [`nlp.initialize`](/api/language#initialize), not at runtime. |
| [`get_loss`](/api/pipe#get_loss) | Return a tuple of the loss and the gradient for a batch of [`Example`](/api/example) objects. |
+ | [`get_teacher_student_loss`](/api/pipe#get_teacher_student_loss) | Return a tuple of the loss and the gradient for the student scores relative to the teacher scores. |
| [`score`](/api/pipe#score) | Score a batch of [`Example`](/api/example) objects and return a dictionary of scores. The [`@Language.factory`](/api/language#factory) decorator can define the `default_score_weights` of the component to decide which keys of the scores to display during training and how they count towards the final score. |

<Infobox title="Custom trainable components and models" emoji="📖">
@@ -1342,7 +1342,7 @@ doc = nlp("MyCorp Inc. is a company in the U.S.")

print([(ent.text, ent.label_) for ent in doc.ents])
```

- #### Validating and debugging entity ruler patterns {#entityruler-pattern-validation new="2.1.8"}
+ #### Validating and debugging entity ruler patterns {id="entityruler-pattern-validation",version="2.1.8"}

The entity ruler can validate patterns against a JSON schema with the config
setting `"validate"`. See details under
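
As a closing illustration (an editorial addition, not part of the diff),
enabling validation is a one-line config override; invalid patterns then raise
an error as soon as they are added:

```python
ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])  # validated on add
```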