mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
0ec9a696e6
* Enable Cython<->Python bindings for `Pipe` and `TrainablePipe` methods * `pipes_with_nvtx_range`: Skip hooking methods whose signature cannot be ascertained When loading pipelines from a config file, the arguments passed to individual pipeline components is validated by `pydantic` during init. For this, the validation model attempts to parse the function signature of the component's c'tor/entry point so that it can check if all mandatory parameters are present in the config file. When using the `models_and_pipes_with_nvtx_range` as a `after_pipeline_creation` callback, the methods of all pipeline components get replaced by a NVTX range wrapper **before** the above-mentioned validation takes place. This can be problematic for components that are implemented as Cython extension types - if the extension type is not compiled with Python bindings for its methods, they will have no signatures at runtime. This resulted in `pydantic` matching the *wrapper's* parameters with the those in the config and raising errors. To avoid this, we now skip applying the wrapper to any (Cython) methods that do not have signatures.
345 lines
14 KiB
Cython
345 lines
14 KiB
Cython
# cython: infer_types=True, profile=True, binding=True
|
|
from typing import Iterable, Iterator, Optional, Dict, Tuple, Callable
|
|
import srsly
|
|
from thinc.api import set_dropout_rate, Model, Optimizer
|
|
|
|
from ..tokens.doc cimport Doc
|
|
|
|
from ..training import validate_examples
|
|
from ..errors import Errors
|
|
from .pipe import Pipe, deserialize_config
|
|
from .. import util
|
|
from ..vocab import Vocab
|
|
from ..language import Language
|
|
from ..training import Example
|
|
|
|
|
|
cdef class TrainablePipe(Pipe):
|
|
"""This class is a base class and not instantiated directly. Trainable
|
|
pipeline components like the EntityRecognizer or TextCategorizer inherit
|
|
from it and it defines the interface that components should follow to
|
|
function as trainable components in a spaCy pipeline.
|
|
|
|
DOCS: https://spacy.io/api/pipe
|
|
"""
|
|
def __init__(self, vocab: Vocab, model: Model, name: str, **cfg):
|
|
"""Initialize a pipeline component.
|
|
|
|
vocab (Vocab): The shared vocabulary.
|
|
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
|
name (str): The component instance name.
|
|
**cfg: Additional settings and config parameters.
|
|
|
|
DOCS: https://spacy.io/api/pipe#init
|
|
"""
|
|
self.vocab = vocab
|
|
self.model = model
|
|
self.name = name
|
|
self.cfg = dict(cfg)
|
|
|
|
def __call__(self, Doc doc) -> Doc:
|
|
"""Apply the pipe to one document. The document is modified in place,
|
|
and returned. This usually happens under the hood when the nlp object
|
|
is called on a text and all components are applied to the Doc.
|
|
|
|
docs (Doc): The Doc to process.
|
|
RETURNS (Doc): The processed Doc.
|
|
|
|
DOCS: https://spacy.io/api/pipe#call
|
|
"""
|
|
error_handler = self.get_error_handler()
|
|
try:
|
|
scores = self.predict([doc])
|
|
self.set_annotations([doc], scores)
|
|
return doc
|
|
except Exception as e:
|
|
error_handler(self.name, self, [doc], e)
|
|
|
|
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
|
|
"""Apply the pipe to a stream of documents. This usually happens under
|
|
the hood when the nlp object is called on a text and all components are
|
|
applied to the Doc.
|
|
|
|
stream (Iterable[Doc]): A stream of documents.
|
|
batch_size (int): The number of documents to buffer.
|
|
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
|
|
deals with a failing batch of documents. The default function just reraises
|
|
the exception.
|
|
YIELDS (Doc): Processed documents in order.
|
|
|
|
DOCS: https://spacy.io/api/pipe#pipe
|
|
"""
|
|
error_handler = self.get_error_handler()
|
|
for docs in util.minibatch(stream, size=batch_size):
|
|
try:
|
|
scores = self.predict(docs)
|
|
self.set_annotations(docs, scores)
|
|
yield from docs
|
|
except Exception as e:
|
|
error_handler(self.name, self, docs, e)
|
|
|
|
def predict(self, docs: Iterable[Doc]):
|
|
"""Apply the pipeline's model to a batch of docs, without modifying them.
|
|
Returns a single tensor for a batch of documents.
|
|
|
|
docs (Iterable[Doc]): The documents to predict.
|
|
RETURNS: Vector representations of the predictions.
|
|
|
|
DOCS: https://spacy.io/api/pipe#predict
|
|
"""
|
|
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="predict", name=self.name))
|
|
|
|
def set_annotations(self, docs: Iterable[Doc], scores):
|
|
"""Modify a batch of documents, using pre-computed scores.
|
|
|
|
docs (Iterable[Doc]): The documents to modify.
|
|
scores: The scores to assign.
|
|
|
|
DOCS: https://spacy.io/api/pipe#set_annotations
|
|
"""
|
|
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="set_annotations", name=self.name))
|
|
|
|
def update(self,
|
|
examples: Iterable["Example"],
|
|
*,
|
|
drop: float=0.0,
|
|
sgd: Optimizer=None,
|
|
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
|
|
"""Learn from a batch of documents and gold-standard information,
|
|
updating the pipe's model. Delegates to predict and get_loss.
|
|
|
|
examples (Iterable[Example]): A batch of Example objects.
|
|
drop (float): The dropout rate.
|
|
sgd (thinc.api.Optimizer): The optimizer.
|
|
losses (Dict[str, float]): Optional record of the loss during training.
|
|
Updated using the component name as the key.
|
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
|
|
|
DOCS: https://spacy.io/api/pipe#update
|
|
"""
|
|
if losses is None:
|
|
losses = {}
|
|
if not hasattr(self, "model") or self.model in (None, True, False):
|
|
return losses
|
|
losses.setdefault(self.name, 0.0)
|
|
validate_examples(examples, "TrainablePipe.update")
|
|
if not any(len(eg.predicted) if eg.predicted else 0 for eg in examples):
|
|
# Handle cases where there are no tokens in any docs.
|
|
return losses
|
|
set_dropout_rate(self.model, drop)
|
|
scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples])
|
|
loss, d_scores = self.get_loss(examples, scores)
|
|
bp_scores(d_scores)
|
|
if sgd not in (None, False):
|
|
self.finish_update(sgd)
|
|
losses[self.name] += loss
|
|
return losses
|
|
|
|
def rehearse(self,
|
|
examples: Iterable[Example],
|
|
*,
|
|
sgd: Optimizer=None,
|
|
losses: Dict[str, float]=None,
|
|
**config) -> Dict[str, float]:
|
|
"""Perform a "rehearsal" update from a batch of data. Rehearsal updates
|
|
teach the current model to make predictions similar to an initial model,
|
|
to try to address the "catastrophic forgetting" problem. This feature is
|
|
experimental.
|
|
|
|
examples (Iterable[Example]): A batch of Example objects.
|
|
sgd (thinc.api.Optimizer): The optimizer.
|
|
losses (Dict[str, float]): Optional record of the loss during training.
|
|
Updated using the component name as the key.
|
|
RETURNS (Dict[str, float]): The updated losses dictionary.
|
|
|
|
DOCS: https://spacy.io/api/pipe#rehearse
|
|
"""
|
|
pass
|
|
|
|
def get_loss(self, examples: Iterable[Example], scores) -> Tuple[float, float]:
|
|
"""Find the loss and gradient of loss for the batch of documents and
|
|
their predicted scores.
|
|
|
|
examples (Iterable[Examples]): The batch of examples.
|
|
scores: Scores representing the model's predictions.
|
|
RETURNS (Tuple[float, float]): The loss and the gradient.
|
|
|
|
DOCS: https://spacy.io/api/pipe#get_loss
|
|
"""
|
|
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
|
|
|
|
def create_optimizer(self) -> Optimizer:
|
|
"""Create an optimizer for the pipeline component.
|
|
|
|
RETURNS (thinc.api.Optimizer): The optimizer.
|
|
|
|
DOCS: https://spacy.io/api/pipe#create_optimizer
|
|
"""
|
|
return util.create_default_optimizer()
|
|
|
|
def initialize(self, get_examples: Callable[[], Iterable[Example]], *, nlp: Language=None):
|
|
"""Initialize the pipe for training, using data examples if available.
|
|
This method needs to be implemented by each TrainablePipe component,
|
|
ensuring the internal model (if available) is initialized properly
|
|
using the provided sample of Example objects.
|
|
|
|
get_examples (Callable[[], Iterable[Example]]): Function that
|
|
returns a representative sample of gold-standard Example objects.
|
|
nlp (Language): The current nlp object the component is part of.
|
|
|
|
DOCS: https://spacy.io/api/pipe#initialize
|
|
"""
|
|
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="initialize", name=self.name))
|
|
|
|
def add_label(self, label: str) -> int:
|
|
"""Add an output label.
|
|
For TrainablePipe components, it is possible to
|
|
extend pretrained models with new labels, but care should be taken to
|
|
avoid the "catastrophic forgetting" problem.
|
|
|
|
label (str): The label to add.
|
|
RETURNS (int): 0 if label is already present, otherwise 1.
|
|
|
|
DOCS: https://spacy.io/api/pipe#add_label
|
|
"""
|
|
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
|
|
|
|
@property
|
|
def is_trainable(self) -> bool:
|
|
return True
|
|
|
|
@property
|
|
def is_resizable(self) -> bool:
|
|
return getattr(self, "model", None) and "resize_output" in self.model.attrs
|
|
|
|
def _allow_extra_label(self) -> None:
|
|
"""Raise an error if the component can not add any more labels."""
|
|
nO = None
|
|
if self.model.has_dim("nO"):
|
|
nO = self.model.get_dim("nO")
|
|
elif self.model.has_ref("output_layer") and self.model.get_ref("output_layer").has_dim("nO"):
|
|
nO = self.model.get_ref("output_layer").get_dim("nO")
|
|
if nO is not None and nO == len(self.labels):
|
|
if not self.is_resizable:
|
|
raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
|
|
|
|
def set_output(self, nO: int) -> None:
|
|
if self.is_resizable:
|
|
self.model.attrs["resize_output"](self.model, nO)
|
|
else:
|
|
raise NotImplementedError(Errors.E921)
|
|
|
|
def use_params(self, params: dict):
|
|
"""Modify the pipe's model, to use the given parameter values. At the
|
|
end of the context, the original parameters are restored.
|
|
|
|
params (dict): The parameter values to use in the model.
|
|
|
|
DOCS: https://spacy.io/api/pipe#use_params
|
|
"""
|
|
with self.model.use_params(params):
|
|
yield
|
|
|
|
def finish_update(self, sgd: Optimizer) -> None:
|
|
"""Update parameters using the current parameter gradients.
|
|
The Optimizer instance contains the functionality to perform
|
|
the stochastic gradient descent.
|
|
|
|
sgd (thinc.api.Optimizer): The optimizer.
|
|
|
|
DOCS: https://spacy.io/api/pipe#finish_update
|
|
"""
|
|
self.model.finish_update(sgd)
|
|
|
|
def _validate_serialization_attrs(self):
|
|
"""Check that the pipe implements the required attributes. If a subclass
|
|
implements a custom __init__ method but doesn't set these attributes,
|
|
they currently default to None, so we need to perform additonal checks.
|
|
"""
|
|
if not hasattr(self, "vocab") or self.vocab is None:
|
|
raise ValueError(Errors.E899.format(name=util.get_object_name(self)))
|
|
if not hasattr(self, "model") or self.model is None:
|
|
raise ValueError(Errors.E898.format(name=util.get_object_name(self)))
|
|
|
|
def to_bytes(self, *, exclude=tuple()):
|
|
"""Serialize the pipe to a bytestring.
|
|
|
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
|
RETURNS (bytes): The serialized object.
|
|
|
|
DOCS: https://spacy.io/api/pipe#to_bytes
|
|
"""
|
|
self._validate_serialization_attrs()
|
|
serialize = {}
|
|
if hasattr(self, "cfg") and self.cfg is not None:
|
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
|
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
|
|
serialize["model"] = self.model.to_bytes
|
|
return util.to_bytes(serialize, exclude)
|
|
|
|
def from_bytes(self, bytes_data, *, exclude=tuple()):
|
|
"""Load the pipe from a bytestring.
|
|
|
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
|
RETURNS (TrainablePipe): The loaded object.
|
|
|
|
DOCS: https://spacy.io/api/pipe#from_bytes
|
|
"""
|
|
self._validate_serialization_attrs()
|
|
|
|
def load_model(b):
|
|
try:
|
|
self.model.from_bytes(b)
|
|
except AttributeError:
|
|
raise ValueError(Errors.E149) from None
|
|
|
|
deserialize = {}
|
|
if hasattr(self, "cfg") and self.cfg is not None:
|
|
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
|
|
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
|
|
deserialize["model"] = load_model
|
|
util.from_bytes(bytes_data, deserialize, exclude)
|
|
return self
|
|
|
|
def to_disk(self, path, *, exclude=tuple()):
|
|
"""Serialize the pipe to disk.
|
|
|
|
path (str / Path): Path to a directory.
|
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
|
|
|
DOCS: https://spacy.io/api/pipe#to_disk
|
|
"""
|
|
self._validate_serialization_attrs()
|
|
serialize = {}
|
|
if hasattr(self, "cfg") and self.cfg is not None:
|
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
|
|
serialize["model"] = lambda p: self.model.to_disk(p)
|
|
util.to_disk(path, serialize, exclude)
|
|
|
|
def from_disk(self, path, *, exclude=tuple()):
|
|
"""Load the pipe from disk.
|
|
|
|
path (str / Path): Path to a directory.
|
|
exclude (Iterable[str]): String names of serialization fields to exclude.
|
|
RETURNS (TrainablePipe): The loaded object.
|
|
|
|
DOCS: https://spacy.io/api/pipe#from_disk
|
|
"""
|
|
self._validate_serialization_attrs()
|
|
|
|
def load_model(p):
|
|
try:
|
|
with open(p, "rb") as mfile:
|
|
self.model.from_bytes(mfile.read())
|
|
except AttributeError:
|
|
raise ValueError(Errors.E149) from None
|
|
|
|
deserialize = {}
|
|
if hasattr(self, "cfg") and self.cfg is not None:
|
|
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
|
|
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
|
|
deserialize["model"] = load_model
|
|
util.from_disk(path, deserialize, exclude)
|
|
return self
|