mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-03 10:55:52 +03:00
Add docstrings for Tok2Vec component
This commit is contained in:
parent
46bc513a4e
commit
39a3d64c01
|
@ -32,11 +32,27 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
|
||||||
|
|
||||||
|
|
||||||
class Tok2Vec(Pipe):
|
class Tok2Vec(Pipe):
|
||||||
|
"""Apply a "token-to-vector" model and set its outputs in the doc.tensor
|
||||||
|
attribute. This is mostly useful to share a single subnetwork between multiple
|
||||||
|
components, e.g. to have one embedding and CNN network shared between a
|
||||||
|
parser, tagger and NER.
|
||||||
|
|
||||||
|
In order to use the `Tok2Vec` predictions, subsequent components should use
|
||||||
|
the `Tok2VecListener` layer as the tok2vec subnetwork of their model. This
|
||||||
|
layer will read data from the `doc.tensor` attribute during prediction.
|
||||||
|
During training, the `Tok2Vec` component will save its prediction and backprop
|
||||||
|
callback for each batch, so that the subsequent components can backpropagate
|
||||||
|
to the shared weights. This implementation is used because it allows us to
|
||||||
|
avoid relying on object identity within the models to achieve the parameter
|
||||||
|
sharing.
|
||||||
|
"""
|
||||||
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
|
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
|
||||||
"""Initialize a tok2vec component.
|
"""Initialize a tok2vec component.
|
||||||
|
|
||||||
vocab (Vocab): The shared vocabulary.
|
vocab (Vocab): The shared vocabulary.
|
||||||
model (thinc.api.Model): The Thinc Model powering the pipeline component.
|
model (thinc.api.Model[List[Doc], List[Floats2d]]):
|
||||||
|
The Thinc Model powering the pipeline component. It should take
|
||||||
|
a list of Doc objects as input, and output a list of 2d float arrays.
|
||||||
name (str): The component instance name.
|
name (str): The component instance name.
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/tok2vec#init
|
DOCS: https://spacy.io/api/tok2vec#init
|
||||||
|
@ -48,9 +64,18 @@ class Tok2Vec(Pipe):
|
||||||
self.cfg = {}
|
self.cfg = {}
|
||||||
|
|
||||||
def add_listener(self, listener: "Tok2VecListener") -> None:
    """Register a downstream listener with this component. Mostly internal."""
    self.listeners.append(listener)
|
||||||
|
|
||||||
def find_listeners(self, model: Model) -> None:
|
def find_listeners(self, model: Model) -> None:
|
||||||
|
"""Walk over a model, looking for layers that are Tok2vecListener
|
||||||
|
subclasses that have an upstream_name that matches this component.
|
||||||
|
Listeners can also set their upstream_name attribute to the wildcard
|
||||||
|
string '*' to match any `Tok2Vec`.
|
||||||
|
|
||||||
|
You're unlikely to ever need multiple `Tok2Vec` components, so it's
|
||||||
|
fine to leave your listener's upstream_name on '*'.
|
||||||
|
"""
|
||||||
for node in model.walk():
|
for node in model.walk():
|
||||||
if isinstance(node, Tok2VecListener) and node.upstream_name in (
|
if isinstance(node, Tok2VecListener) and node.upstream_name in (
|
||||||
"*",
|
"*",
|
||||||
|
@ -59,7 +84,8 @@ class Tok2Vec(Pipe):
|
||||||
self.add_listener(node)
|
self.add_listener(node)
|
||||||
|
|
||||||
def __call__(self, doc: Doc) -> Doc:
|
def __call__(self, doc: Doc) -> Doc:
|
||||||
"""Add context-sensitive embeddings to the Doc.tensor attribute.
|
"""Add context-sensitive embeddings to the Doc.tensor attribute, allowing
|
||||||
|
them to be used as features by downstream components.
|
||||||
|
|
||||||
doc (Doc): The Doc to process.
|
doc (Doc): The Doc to process.
|
||||||
RETURNS (Doc): The processed Doc.
|
RETURNS (Doc): The processed Doc.
|
||||||
|
@ -205,11 +231,26 @@ class Tok2Vec(Pipe):
|
||||||
class Tok2VecListener(Model):
|
class Tok2VecListener(Model):
|
||||||
"""A layer that gets fed its answers from an upstream connection,
|
"""A layer that gets fed its answers from an upstream connection,
|
||||||
for instance from a component earlier in the pipeline.
|
for instance from a component earlier in the pipeline.
|
||||||
"""
|
|
||||||
|
|
||||||
|
The Tok2VecListener layer is used as a sublayer within a component such
|
||||||
|
as a parser, NER or text categorizer. Usually you'll have multiple listeners
|
||||||
|
connecting to a single upstream Tok2Vec component, that's earlier in the
|
||||||
|
pipeline. The Tok2VecListener layers act as proxies, passing the predictions
|
||||||
|
from the Tok2Vec component into downstream components, and communicating
|
||||||
|
gradients back upstream.
|
||||||
|
"""
|
||||||
name = "tok2vec-listener"
|
name = "tok2vec-listener"
|
||||||
|
|
||||||
def __init__(self, upstream_name: str, width: int) -> None:
|
def __init__(self, upstream_name: str, width: int) -> None:
|
||||||
|
"""
|
||||||
|
upstream_name (str): A string to identify the 'upstream' Tok2Vec component
|
||||||
|
to communicate with. The upstream name should either be the wildcard
|
||||||
|
string '*', or the name of the `Tok2Vec` component. You'll almost
|
||||||
|
never have multiple upstream Tok2Vec components, so the wildcard
|
||||||
|
string will almost always be fine.
|
||||||
|
width (int):
|
||||||
|
The width of the vectors produced by the upstream tok2vec component.
|
||||||
|
"""
|
||||||
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
|
Model.__init__(self, name=self.name, forward=forward, dims={"nO": width})
|
||||||
self.upstream_name = upstream_name
|
self.upstream_name = upstream_name
|
||||||
self._batch_id = None
|
self._batch_id = None
|
||||||
|
@ -217,15 +258,25 @@ class Tok2VecListener(Model):
|
||||||
self._backprop = None
|
self._backprop = None
|
||||||
|
|
||||||
@classmethod
def get_batch_id(cls, inputs: List[Doc]) -> int:
    """Compute a content-sensitive hash over a batch of documents, so that an
    unexpected batch of documents can be detected.
    """
    total = 0
    for doc in inputs:
        for token in doc:
            total += token.orth
    return total
|
||||||
|
|
||||||
def receive(self, batch_id: int, outputs, backprop) -> None:
    """Store a batch of training predictions together with the backprop
    callback produced by the upstream Tok2Vec component. Both are consumed
    later, when the listener's component runs its own model.
    """
    self._backprop = backprop
    self._outputs = outputs
    self._batch_id = batch_id
|
||||||
|
|
||||||
def verify_inputs(self, inputs) -> bool:
|
def verify_inputs(self, inputs) -> bool:
|
||||||
|
"""Check that the batch of Doc objects matches the ones we have a
|
||||||
|
prediction for.
|
||||||
|
"""
|
||||||
if self._batch_id is None and self._outputs is None:
|
if self._batch_id is None and self._outputs is None:
|
||||||
raise ValueError(Errors.E954)
|
raise ValueError(Errors.E954)
|
||||||
else:
|
else:
|
||||||
|
@ -237,6 +288,7 @@ class Tok2VecListener(Model):
|
||||||
|
|
||||||
|
|
||||||
def forward(model: Tok2VecListener, inputs, is_train: bool):
|
def forward(model: Tok2VecListener, inputs, is_train: bool):
|
||||||
|
"""Supply the outputs from the upstream Tok2Vec component."""
|
||||||
if is_train:
|
if is_train:
|
||||||
model.verify_inputs(inputs)
|
model.verify_inputs(inputs)
|
||||||
return model._outputs, model._backprop
|
return model._outputs, model._backprop
|
||||||
|
|
Loading…
Reference in New Issue
Block a user