mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
43b960c01b
* Update with WIP * Update with WIP * Update with pipeline serialization * Update types and pipe factories * Add deep merge, tidy up and add tests * Fix pipe creation from config * Don't validate default configs on load * Update spacy/language.py Co-authored-by: Ines Montani <ines@ines.io> * Adjust factory/component meta error * Clean up factory args and remove defaults * Add test for failing empty dict defaults * Update pipeline handling and methods * provide KB as registry function instead of as object * small change in test to make functionality more clear * update example script for EL configuration * Fix typo * Simplify test * Simplify test * splitting pipes.pyx into separate files * moving default configs to each component file * fix batch_size type * removing default values from component constructors where possible (TODO: test 4725) * skip instead of xfail * Add test for config -> nlp with multiple instances * pipeline.pipes -> pipeline.pipe * Tidy up, document, remove kwargs * small cleanup/generalization for Tok2VecListener * use DEFAULT_UPSTREAM field * revert to avoid circular imports * Fix tests * Replace deprecated arg * Make model dirs require config * fix pickling of keyword-only arguments in constructor * WIP: clean up and integrate full config * Add helper to handle function args more reliably Now also includes keyword-only args * Fix config composition and serialization * Improve config debugging and add visual diff * Remove unused defaults and fix type * Remove pipeline and factories from meta * Update spacy/default_config.cfg Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Update spacy/default_config.cfg * small UX edits * avoid printing stack trace for debug CLI commands * Add support for language-specific factories * specify the section of the config which holds the model to debug * WIP: add Language.from_config * Update with language data refactor WIP * Auto-format * Add backwards-compat handling for Language.factories * Update morphologizer.pyx * Fix morphologizer * Update and simplify lemmatizers * Fix Japanese tests * Port over tagger changes * Fix Chinese and tests * Update to latest Thinc * WIP: xfail first Russian lemmatizer test * Fix component-specific overrides * fix nO for output layers in debug_model * Fix default value * Fix tests and don't pass objects in config * Fix deep merging * Fix lemma lookup data registry Only load the lookups if an entry is available in the registry (and if spacy-lookups-data is installed) * Add types * Add Vocab.from_config * Fix typo * Fix tests * Make config copying more elegant * Fix pipe analysis * Fix lemmatizers and is_base_form * WIP: move language defaults to config * Fix morphology type * Fix vocab * Remove comment * Update to latest Thinc * Add morph rules to config * Tidy up * Remove set_morphology option from tagger factory * Hack use_gpu * Move [pipeline] to top-level block and make [nlp.pipeline] list Allows separating component blocks from component order – otherwise, ordering the config would mean a changed component order, which is bad. Also allows initial config to define more components and not use all of them * Fix use_gpu and resume in CLI * Auto-format * Remove resume from config * Fix formatting and error * [pipeline] -> [components] * Fix types * Fix tagger test: requires set_morphology? Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: svlandeg <sofie.vanlandeghem@gmail.com> Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
212 lines
7.5 KiB
Python
212 lines
7.5 KiB
Python
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
|
|
from thinc.api import Model, set_dropout_rate, Optimizer, Config
|
|
|
|
from .pipe import Pipe
|
|
from ..gold import Example
|
|
from ..tokens import Doc
|
|
from ..vocab import Vocab
|
|
from ..language import Language
|
|
from ..util import link_vectors_to_models, minibatch
|
|
|
|
|
|
default_model_config = """
|
|
[model]
|
|
@architectures = "spacy.HashEmbedCNN.v1"
|
|
pretrained_vectors = null
|
|
width = 96
|
|
depth = 4
|
|
embed_size = 2000
|
|
window_size = 1
|
|
maxout_pieces = 3
|
|
subword_features = true
|
|
dropout = null
|
|
"""
|
|
DEFAULT_TOK2VEC_MODEL = Config().from_str(default_model_config)["model"]
|
|
|
|
|
|
@Language.factory(
|
|
"tok2vec", assigns=["doc.tensor"], default_config={"model": DEFAULT_TOK2VEC_MODEL}
|
|
)
|
|
def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
|
|
return Tok2Vec(nlp.vocab, model, name)
|
|
|
|
|
|
class Tok2Vec(Pipe):
|
|
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
|
|
"""Construct a new statistical model. Weights are not allocated on
|
|
initialisation.
|
|
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
|
instance with the `Doc` objects it will process.
|
|
"""
|
|
self.vocab = vocab
|
|
self.model = model
|
|
self.name = name
|
|
self.listeners = []
|
|
self.cfg = {}
|
|
|
|
def add_listener(self, listener: "Tok2VecListener") -> None:
|
|
self.listeners.append(listener)
|
|
|
|
def find_listeners(self, model: Model) -> None:
|
|
for node in model.walk():
|
|
if isinstance(node, Tok2VecListener) and node.upstream_name in (
|
|
"*",
|
|
self.name,
|
|
):
|
|
self.add_listener(node)
|
|
|
|
def __call__(self, doc: Doc) -> Doc:
|
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
|
model. Vectors are set to the `Doc.tensor` attribute.
|
|
docs (Doc or iterable): One or more documents to add vectors to.
|
|
RETURNS (dict or None): Intermediate computations.
|
|
"""
|
|
tokvecses = self.predict([doc])
|
|
self.set_annotations([doc], tokvecses)
|
|
return doc
|
|
|
|
def pipe(self, stream: Iterator[Doc], batch_size: int = 128) -> Iterator[Doc]:
|
|
"""Process `Doc` objects as a stream.
|
|
stream (iterator): A sequence of `Doc` objects to process.
|
|
batch_size (int): Number of `Doc` objects to group.
|
|
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
|
"""
|
|
for docs in minibatch(stream, batch_size):
|
|
docs = list(docs)
|
|
tokvecses = self.predict(docs)
|
|
self.set_annotations(docs, tokvecses)
|
|
yield from docs
|
|
|
|
def predict(self, docs: Sequence[Doc]):
|
|
"""Return a single tensor for a batch of documents.
|
|
docs (iterable): A sequence of `Doc` objects.
|
|
RETURNS (object): Vector representations for each token in the documents.
|
|
"""
|
|
tokvecs = self.model.predict(docs)
|
|
batch_id = Tok2VecListener.get_batch_id(docs)
|
|
for listener in self.listeners:
|
|
listener.receive(batch_id, tokvecs, None)
|
|
return tokvecs
|
|
|
|
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
|
|
"""Set the tensor attribute for a batch of documents.
|
|
docs (iterable): A sequence of `Doc` objects.
|
|
tokvecs (object): Vector representation for each token in the documents.
|
|
"""
|
|
for doc, tokvecs in zip(docs, tokvecses):
|
|
assert tokvecs.shape[0] == len(doc)
|
|
doc.tensor = tokvecs
|
|
|
|
def update(
|
|
self,
|
|
examples: Iterable[Example],
|
|
*,
|
|
drop: float = 0.0,
|
|
sgd: Optional[Optimizer] = None,
|
|
losses: Optional[Dict[str, float]] = None,
|
|
set_annotations: bool = False,
|
|
):
|
|
"""Update the model.
|
|
examples (Iterable[Example]): A batch of examples
|
|
drop (float): The droput rate.
|
|
sgd (Optimizer): An optimizer.
|
|
losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
|
|
set_annotations (bool): whether or not to update the examples with the predictions
|
|
RETURNS (Dict[str, float]): The updated losses dictionary
|
|
"""
|
|
if losses is None:
|
|
losses = {}
|
|
docs = [eg.predicted for eg in examples]
|
|
if isinstance(docs, Doc):
|
|
docs = [docs]
|
|
set_dropout_rate(self.model, drop)
|
|
tokvecs, bp_tokvecs = self.model.begin_update(docs)
|
|
|
|
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
|
|
losses.setdefault(self.name, 0.0)
|
|
|
|
def accumulate_gradient(one_d_tokvecs):
|
|
"""Accumulate tok2vec loss and gradient. This is passed as a callback
|
|
to all but the last listener. Only the last one does the backprop.
|
|
"""
|
|
nonlocal d_tokvecs
|
|
for i in range(len(one_d_tokvecs)):
|
|
d_tokvecs[i] += one_d_tokvecs[i]
|
|
losses[self.name] += float((one_d_tokvecs[i] ** 2).sum())
|
|
|
|
def backprop(one_d_tokvecs):
|
|
"""Callback to actually do the backprop. Passed to last listener."""
|
|
accumulate_gradient(one_d_tokvecs)
|
|
d_docs = bp_tokvecs(d_tokvecs)
|
|
if sgd is not None:
|
|
self.model.finish_update(sgd)
|
|
return d_docs
|
|
|
|
batch_id = Tok2VecListener.get_batch_id(docs)
|
|
for listener in self.listeners[:-1]:
|
|
listener.receive(batch_id, tokvecs, accumulate_gradient)
|
|
self.listeners[-1].receive(batch_id, tokvecs, backprop)
|
|
if set_annotations:
|
|
self.set_annotations(docs, tokvecs)
|
|
return losses
|
|
|
|
def get_loss(self, examples, scores):
|
|
pass
|
|
|
|
def begin_training(
|
|
self,
|
|
get_examples: Callable = lambda: [],
|
|
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
|
|
sgd: Optional[Optimizer] = None,
|
|
):
|
|
"""Allocate models and pre-process training data
|
|
|
|
get_examples (function): Function returning example training data.
|
|
pipeline (list): The pipeline the model is part of.
|
|
"""
|
|
docs = [Doc(Vocab(), words=["hello"])]
|
|
self.model.initialize(X=docs)
|
|
link_vectors_to_models(self.vocab)
|
|
|
|
|
|
class Tok2VecListener(Model):
|
|
"""A layer that gets fed its answers from an upstream connection,
|
|
for instance from a component earlier in the pipeline.
|
|
"""
|
|
|
|
name = "tok2vec-listener"
|
|
|
|
def __init__(self, upstream_name: str, width: int) -> None:
|
|
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
|
|
self.upstream_name = upstream_name
|
|
self._batch_id = None
|
|
self._outputs = None
|
|
self._backprop = None
|
|
|
|
@classmethod
|
|
def get_batch_id(cls, inputs):
|
|
return sum(sum(token.orth for token in doc) for doc in inputs)
|
|
|
|
def receive(self, batch_id, outputs, backprop):
|
|
self._batch_id = batch_id
|
|
self._outputs = outputs
|
|
self._backprop = backprop
|
|
|
|
def verify_inputs(self, inputs):
|
|
if self._batch_id is None and self._outputs is None:
|
|
raise ValueError("The Tok2Vec listener did not receive valid input.")
|
|
else:
|
|
batch_id = self.get_batch_id(inputs)
|
|
if batch_id != self._batch_id:
|
|
raise ValueError(f"Mismatched IDs! {batch_id} vs {self._batch_id}")
|
|
else:
|
|
return True
|
|
|
|
|
|
def forward(model: Tok2VecListener, inputs, is_train):
|
|
if is_train:
|
|
model.verify_inputs(inputs)
|
|
return model._outputs, model._backprop
|
|
else:
|
|
return [doc.tensor for doc in inputs], lambda dX: []
|