diff --git a/spacy/language.py b/spacy/language.py
index a0b65fd9e..cade90b24 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1100,13 +1100,12 @@ class Language:
return scorer.score(examples)
@contextmanager
- def use_params(self, params: dict, **cfg):
+ def use_params(self, params: dict):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
- **cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 6c2276dcd..4165dab83 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -128,7 +128,8 @@ class EntityLinker(Pipe):
def begin_training(
self,
- get_examples: Callable = lambda: [],
+ get_examples: Callable[[], Iterable[Example]] = lambda: [],
+ *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
@@ -273,7 +274,7 @@ class EntityLinker(Pipe):
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
- YIELDS (Doc): PRocessed documents in order.
+ YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/entitylinker#pipe
"""
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index 7f57d41c0..a6be129ba 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -97,7 +97,7 @@ class Morphologizer(Tagger):
"""Add a new label to the pipe.
label (str): The label to add.
- RETURNS (int): 1
+ RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/morphologizer#add_label
"""
diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx
index 0b0aeec62..78863be86 100644
--- a/spacy/pipeline/pipe.pyx
+++ b/spacy/pipeline/pipe.pyx
@@ -8,41 +8,51 @@ from ..errors import Errors
from .. import util
-def deserialize_config(path):
- if path.exists():
- return srsly.read_json(path)
- else:
- return {}
-
-
class Pipe:
- """This class is not instantiated directly. Components inherit from it, and
- it defines the interface that components should follow to function as
- components in a spaCy analysis pipeline.
+ """This class is a base class and not instantiated directly. Trainable
+ pipeline components like the EntityRecognizer or TextCategorizer inherit
+ from it and it defines the interface that components should follow to
+ function as trainable components in a spaCy pipeline.
+
+ DOCS: https://spacy.io/api/pipe
"""
name = None
def __init__(self, vocab, model, name, **cfg):
- """Create a new pipe instance."""
+ """Initialize a pipeline component.
+
+ vocab (Vocab): The shared vocabulary.
+ model (thinc.api.Model): The Thinc Model powering the pipeline component.
+ name (str): The component instance name.
+ **cfg: Additonal settings and config parameters.
+
+ DOCS: https://spacy.io/api/pipe#init
+ """
raise NotImplementedError
def __call__(self, Doc doc):
- """Apply the pipe to one document. The document is
- modified in-place, and returned.
+ """Add context-sensitive embeddings to the Doc.tensor attribute.
- Both __call__ and pipe should delegate to the `predict()`
- and `set_annotations()` methods.
+ docs (Doc): The Doc to preocess.
+ RETURNS (Doc): The processed Doc.
+
+ DOCS: https://spacy.io/api/pipe#call
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
return doc
- def pipe(self, stream, batch_size=128):
- """Apply the pipe to a stream of documents.
+ def pipe(self, stream, *, batch_size=128):
+ """Apply the pipe to a stream of documents. This usually happens under
+ the hood when the nlp object is called on a text and all components are
+ applied to the Doc.
- Both __call__ and pipe should delegate to the `predict()`
- and `set_annotations()` methods.
+ stream (Iterable[Doc]): A stream of documents.
+ batch_size (int): The number of documents to buffer.
+ YIELDS (Doc): Processed documents in order.
+
+ DOCS: https://spacy.io/api/pipe#pipe
"""
for docs in util.minibatch(stream, size=batch_size):
scores = self.predict(docs)
@@ -50,38 +60,90 @@ class Pipe:
yield from docs
def predict(self, docs):
- """Apply the pipeline's model to a batch of docs, without
- modifying them.
+ """Apply the pipeline's model to a batch of docs, without modifying them.
+ Returns a single tensor for a batch of documents.
+
+ docs (Iterable[Doc]): The documents to predict.
+ RETURNS: Vector representations for each token in the documents.
+
+ DOCS: https://spacy.io/api/pipe#predict
"""
raise NotImplementedError
def set_annotations(self, docs, scores):
- """Modify a batch of documents, using pre-computed scores."""
+ """Modify a batch of documents, using pre-computed scores.
+
+ docs (Iterable[Doc]): The documents to modify.
+ tokvecses: The tensors to set, produced by Pipe.predict.
+
+ DOCS: https://spacy.io/api/pipe#predict
+ """
raise NotImplementedError
- def rehearse(self, examples, sgd=None, losses=None, **config):
+ def rehearse(self, examples, *, sgd=None, losses=None, **config):
+ """Perform a "rehearsal" update from a batch of data. Rehearsal updates
+ teach the current model to make predictions similar to an initial model,
+ to try to address the "catastrophic forgetting" problem. This feature is
+ experimental.
+
+ examples (Iterable[Example]): A batch of Example objects.
+ drop (float): The dropout rate.
+ sgd (thinc.api.Optimizer): The optimizer.
+ losses (Dict[str, float]): Optional record of the loss during training.
+ Updated using the component name as the key.
+ RETURNS (Dict[str, float]): The updated losses dictionary.
+
+ DOCS: https://spacy.io/api/pipe#rehearse
+ """
pass
def get_loss(self, examples, scores):
- """Find the loss and gradient of loss for the batch of
- examples (with embedded docs) and their predicted scores."""
+ """Find the loss and gradient of loss for the batch of documents and
+ their predicted scores.
+
+ examples (Iterable[Examples]): The batch of examples.
+ scores: Scores representing the model's predictions.
+ RETUTNRS (Tuple[float, float]): The loss and the gradient.
+
+ DOCS: https://spacy.io/api/pipe#get_loss
+ """
raise NotImplementedError
def add_label(self, label):
- """Add an output label, to be predicted by the model.
+ """Add an output label, to be predicted by the model. It's possible to
+ extend pretrained models with new labels, but care should be taken to
+ avoid the "catastrophic forgetting" problem.
- It's possible to extend pretrained models with new labels,
- but care should be taken to avoid the "catastrophic forgetting"
- problem.
+ label (str): The label to add.
+ RETURNS (int): 0 if label is already present, otherwise 1.
+
+ DOCS: https://spacy.io/api/pipe#add_label
"""
raise NotImplementedError
def create_optimizer(self):
+ """Create an optimizer for the pipeline component.
+
+ RETURNS (thinc.api.Optimizer): The optimizer.
+
+ DOCS: https://spacy.io/api/pipe#create_optimizer
+ """
return create_default_optimizer()
def begin_training(self, get_examples=lambda: [], *, pipeline=None, sgd=None):
- """Initialize the pipe for training, using data exampes if available.
- If no model has been initialized yet, the model is added."""
+ """Initialize the pipe for training, using data examples if available.
+
+ get_examples (Callable[[], Iterable[Example]]): Optional function that
+ returns gold-standard Example objects.
+ pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
+ components that this component is part of. Corresponds to
+ nlp.pipeline.
+ sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
+ create_optimizer if it doesn't exist.
+ RETURNS (thinc.api.Optimizer): The optimizer.
+
+ DOCS: https://spacy.io/api/pipe#begin_training
+ """
self.model.initialize()
if hasattr(self, "vocab"):
link_vectors_to_models(self.vocab)
@@ -90,6 +152,7 @@ class Pipe:
return sgd
def set_output(self, nO):
+ # TODO: document this across components?
if self.model.has_dim("nO") is not False:
self.model.set_dim("nO", nO)
if self.model.has_ref("output_layer"):
@@ -99,6 +162,7 @@ class Pipe:
"""Get non-zero gradients of the model's parameters, as a dictionary
keyed by the parameter ID. The values are (weights, gradients) tuples.
"""
+ # TODO: How is this used?
gradients = {}
queue = [self.model]
seen = set()
@@ -113,18 +177,33 @@ class Pipe:
return gradients
def use_params(self, params):
- """Modify the pipe's model, to use the given parameter values."""
+ """Modify the pipe's model, to use the given parameter values. At the
+ end of the context, the original parameters are restored.
+
+ params (dict): The parameter values to use in the model.
+
+ DOCS: https://spacy.io/api/pipe#use_params
+ """
with self.model.use_params(params):
yield
def score(self, examples, **kwargs):
+ """Score a batch of examples.
+
+ examples (Iterable[Example]): The examples to score.
+ RETURNS (Dict[str, Any]): The scores.
+
+ DOCS: https://spacy.io/api/pipe#score
+ """
return {}
def to_bytes(self, exclude=tuple()):
"""Serialize the pipe to a bytestring.
- exclude (list): String names of serialization fields to exclude.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
+
+ DOCS: https://spacy.io/api/pipe#to_bytes
"""
serialize = {}
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
@@ -134,7 +213,13 @@ class Pipe:
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, exclude=tuple()):
- """Load the pipe from a bytestring."""
+ """Load the pipe from a bytestring.
+
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+ RETURNS (Pipe): The loaded object.
+
+ DOCS: https://spacy.io/api/pipe#from_bytes
+ """
def load_model(b):
try:
@@ -151,7 +236,13 @@ class Pipe:
return self
def to_disk(self, path, exclude=tuple()):
- """Serialize the pipe to disk."""
+ """Serialize the pipe to disk.
+
+ path (str / Path): Path to a directory.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+
+ DOCS: https://spacy.io/api/pipe#to_disk
+ """
serialize = {}
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
@@ -159,7 +250,14 @@ class Pipe:
util.to_disk(path, serialize, exclude)
def from_disk(self, path, exclude=tuple()):
- """Load the pipe from disk."""
+ """Load the pipe from disk.
+
+ path (str / Path): Path to a directory.
+ exclude (Iterable[str]): String names of serialization fields to exclude.
+ RETURNS (Pipe): The loaded object.
+
+ DOCS: https://spacy.io/api/pipe#from_disk
+ """
def load_model(p):
try:
@@ -173,3 +271,10 @@ class Pipe:
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
+
+
+def deserialize_config(path):
+ if path.exists():
+ return srsly.read_json(path)
+ else:
+ return {}
diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx
index 565dda958..c52a7889b 100644
--- a/spacy/pipeline/tagger.pyx
+++ b/spacy/pipeline/tagger.pyx
@@ -329,7 +329,7 @@ class Tagger(Pipe):
label (str): The label to add.
values (Dict[int, str]): Optional values to map to the label, e.g. a
tag map dictionary.
- RETURNS (int): 1
+ RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/tagger#add_label
"""
@@ -355,10 +355,6 @@ class Tagger(Pipe):
self.vocab.morphology.load_tag_map(tag_map)
return 1
- def use_params(self, params):
- with self.model.use_params(params):
- yield
-
def score(self, examples, **kwargs):
"""Score a batch of examples.
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index b1c699cc3..2aaa4a769 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -56,7 +56,17 @@ dropout = null
"textcat",
assigns=["doc.cats"],
default_config={"labels": [], "model": DEFAULT_TEXTCAT_MODEL},
- scores=["cats_score", "cats_score_desc", "cats_p", "cats_r", "cats_f", "cats_macro_f", "cats_macro_auc", "cats_f_per_type", "cats_macro_auc_per_type"],
+ scores=[
+ "cats_score",
+ "cats_score_desc",
+ "cats_p",
+ "cats_r",
+ "cats_f",
+ "cats_macro_f",
+ "cats_macro_auc",
+ "cats_f_per_type",
+ "cats_macro_auc_per_type",
+ ],
default_score_weights={"cats_score": 1.0},
)
def make_textcat(
@@ -120,7 +130,7 @@ class TextCategorizer(Pipe):
stream (Iterable[Doc]): A stream of documents.
batch_size (int): The number of documents to buffer.
- YIELDS (Doc): PRocessed documents in order.
+ YIELDS (Doc): Processed documents in order.
DOCS: https://spacy.io/api/textcategorizer#pipe
"""
@@ -288,7 +298,7 @@ class TextCategorizer(Pipe):
"""Add a new label to the pipe.
label (str): The label to add.
- RETURNS (int): 1.
+ RETURNS (int): 0 if label is already present, otherwise 1.
DOCS: https://spacy.io/api/textcategorizer#add_label
"""
diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py
index 51a8b6a16..5bda12d1b 100644
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@@ -34,10 +34,13 @@ def make_tok2vec(nlp: Language, name: str, model: Model) -> "Tok2Vec":
class Tok2Vec(Pipe):
def __init__(self, vocab: Vocab, model: Model, name: str = "tok2vec") -> None:
- """Construct a new statistical model. Weights are not allocated on
- initialisation.
- vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
- instance with the `Doc` objects it will process.
+ """Initialize a tok2vec component.
+
+ vocab (Vocab): The shared vocabulary.
+ model (thinc.api.Model): The Thinc Model powering the pipeline component.
+ name (str): The component instance name.
+
+ DOCS: https://spacy.io/api/tok2vec#init
"""
self.vocab = vocab
self.model = model
@@ -57,20 +60,27 @@ class Tok2Vec(Pipe):
self.add_listener(node)
def __call__(self, doc: Doc) -> Doc:
- """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
- model. Vectors are set to the `Doc.tensor` attribute.
- docs (Doc or iterable): One or more documents to add vectors to.
- RETURNS (dict or None): Intermediate computations.
+ """Add context-sensitive embeddings to the Doc.tensor attribute.
+
+ docs (Doc): The Doc to preocess.
+ RETURNS (Doc): The processed Doc.
+
+ DOCS: https://spacy.io/api/tok2vec#call
"""
tokvecses = self.predict([doc])
self.set_annotations([doc], tokvecses)
return doc
- def pipe(self, stream: Iterator[Doc], batch_size: int = 128) -> Iterator[Doc]:
- """Process `Doc` objects as a stream.
- stream (iterator): A sequence of `Doc` objects to process.
- batch_size (int): Number of `Doc` objects to group.
- YIELDS (iterator): A sequence of `Doc` objects, in order of input.
+ def pipe(self, stream: Iterator[Doc], *, batch_size: int = 128) -> Iterator[Doc]:
+ """Apply the pipe to a stream of documents. This usually happens under
+ the hood when the nlp object is called on a text and all components are
+ applied to the Doc.
+
+ stream (Iterable[Doc]): A stream of documents.
+ batch_size (int): The number of documents to buffer.
+ YIELDS (Doc): Processed documents in order.
+
+ DOCS: https://spacy.io/api/tok2vec#pipe
"""
for docs in minibatch(stream, batch_size):
docs = list(docs)
@@ -78,10 +88,14 @@ class Tok2Vec(Pipe):
self.set_annotations(docs, tokvecses)
yield from docs
- def predict(self, docs: Sequence[Doc]):
- """Return a single tensor for a batch of documents.
- docs (iterable): A sequence of `Doc` objects.
- RETURNS (object): Vector representations for each token in the documents.
+ def predict(self, docs: Iterable[Doc]):
+ """Apply the pipeline's model to a batch of docs, without modifying them.
+ Returns a single tensor for a batch of documents.
+
+ docs (Iterable[Doc]): The documents to predict.
+ RETURNS: Vector representations for each token in the documents.
+
+ DOCS: https://spacy.io/api/tok2vec#predict
"""
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
@@ -90,9 +104,12 @@ class Tok2Vec(Pipe):
return tokvecs
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:
- """Set the tensor attribute for a batch of documents.
- docs (iterable): A sequence of `Doc` objects.
- tokvecs (object): Vector representation for each token in the documents.
+ """Modify a batch of documents, using pre-computed scores.
+
+ docs (Iterable[Doc]): The documents to modify.
+ tokvecses: The tensors to set, produced by Tok2Vec.predict.
+
+ DOCS: https://spacy.io/api/tok2vec#predict
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
@@ -107,13 +124,19 @@ class Tok2Vec(Pipe):
losses: Optional[Dict[str, float]] = None,
set_annotations: bool = False,
):
- """Update the model.
- examples (Iterable[Example]): A batch of examples
- drop (float): The droput rate.
- sgd (Optimizer): An optimizer.
- losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
- set_annotations (bool): whether or not to update the examples with the predictions
- RETURNS (Dict[str, float]): The updated losses dictionary
+ """Learn from a batch of documents and gold-standard information,
+ updating the pipe's model.
+
+ examples (Iterable[Example]): A batch of Example objects.
+ drop (float): The dropout rate.
+ set_annotations (bool): Whether or not to update the Example objects
+ with the predictions.
+ sgd (thinc.api.Optimizer): The optimizer.
+ losses (Dict[str, float]): Optional record of the loss during training.
+ Updated using the component name as the key.
+ RETURNS (Dict[str, float]): The updated losses dictionary.
+
+ DOCS: https://spacy.io/api/tok2vec#update
"""
if losses is None:
losses = {}
@@ -122,7 +145,6 @@ class Tok2Vec(Pipe):
docs = [docs]
set_dropout_rate(self.model, drop)
tokvecs, bp_tokvecs = self.model.begin_update(docs)
-
d_tokvecs = [self.model.ops.alloc2f(*t2v.shape) for t2v in tokvecs]
losses.setdefault(self.name, 0.0)
@@ -156,14 +178,23 @@ class Tok2Vec(Pipe):
def begin_training(
self,
- get_examples: Callable = lambda: [],
+ get_examples: Callable[[], Iterable[Example]] = lambda: [],
+ *,
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
):
- """Allocate models and pre-process training data
+ """Initialize the pipe for training, using data examples if available.
- get_examples (function): Function returning example training data.
- pipeline (list): The pipeline the model is part of.
+ get_examples (Callable[[], Iterable[Example]]): Optional function that
+ returns gold-standard Example objects.
+ pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
+ components that this component is part of. Corresponds to
+ nlp.pipeline.
+ sgd (thinc.api.Optimizer): Optional optimizer. Will be created with
+ create_optimizer if it doesn't exist.
+ RETURNS (thinc.api.Optimizer): The optimizer.
+
+ DOCS: https://spacy.io/api/tok2vec#begin_training
"""
docs = [Doc(Vocab(), words=["hello"])]
self.model.initialize(X=docs)
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 7f115acd8..3b8636f07 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -123,6 +123,8 @@ cdef class Parser:
resized = True
if resized:
self._resize()
+ return 1
+ return 0
def _resize(self):
self.model.attrs["resize_output"](self.model, self.moves.n_moves)
diff --git a/spacy/util.py b/spacy/util.py
index 9c4908a78..f4c810e07 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1182,6 +1182,7 @@ VECTORS_KEY = "spacy_pretrained_vectors"
def create_default_optimizer() -> Optimizer:
+ # TODO: Do we still want to allow env_opt?
learn_rate = env_opt("learn_rate", 0.001)
beta1 = env_opt("optimizer_B1", 0.9)
beta2 = env_opt("optimizer_B2", 0.999)
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index 2a65f61d7..425b669ce 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -248,19 +248,20 @@ component.
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
> ```python
> parser = DependencyParser(nlp.vocab)
-> with parser.use_params():
+> with parser.use_params(optimizer.averages):
> parser.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## DependencyParser.add_label {#add_label tag="method"}
@@ -273,9 +274,10 @@ Add a new label to the pipe.
> parser.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ---- | ----------------- |
-| `label` | str | The label to add. |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `label` | str | The label to add. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## DependencyParser.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index 433ff6f72..b2b1eec32 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -239,7 +239,8 @@ Create an optimizer for the pipeline component.
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's EL model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
@@ -249,9 +250,9 @@ Modify the pipe's EL model, to use the given parameter values.
> entity_linker.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## EntityLinker.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 4651af03c..63404e087 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -247,7 +247,8 @@ Create an optimizer for the pipeline component.
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
@@ -257,9 +258,9 @@ Modify the pipe's model, to use the given parameter values.
> ner.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## EntityRecognizer.add_label {#add_label tag="method"}
@@ -272,9 +273,10 @@ Add a new label to the pipe.
> ner.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ---- | ----------------- |
-| `label` | str | The label to add. |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `label` | str | The label to add. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## EntityRecognizer.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index a61249dcb..d685c014b 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -271,7 +271,6 @@ their original weights after the block.
| Name | Type | Description |
| -------- | ---- | --------------------------------------------- |
| `params` | dict | A dictionary of parameters keyed by model ID. |
-| `**cfg` | - | Config parameters. |
## Language.create_pipe {#create_pipe tag="method" new="2"}
diff --git a/website/docs/api/morphologizer.md b/website/docs/api/morphologizer.md
index 64843a3e0..8ac300de3 100644
--- a/website/docs/api/morphologizer.md
+++ b/website/docs/api/morphologizer.md
@@ -233,19 +233,20 @@ Create an optimizer for the pipeline component.
## Morphologizer.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
> ```python
> morphologizer = nlp.add_pipe("morphologizer")
-> with morphologizer.use_params():
+> with morphologizer.use_params(optimizer.averages):
> morphologizer.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## Morphologizer.add_label {#add_label tag="method"}
@@ -259,9 +260,10 @@ both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
> morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin")
> ```
-| Name | Type | Description |
-| ------- | ---- | ----------------- |
-| `label` | str | The label to add. |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `label` | str | The label to add. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## Morphologizer.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/pipe.md b/website/docs/api/pipe.md
index f87cf7d20..c03a1b4da 100644
--- a/website/docs/api/pipe.md
+++ b/website/docs/api/pipe.md
@@ -1,6 +1,381 @@
---
title: Pipe
tag: class
+teaser: Base class for trainable pipeline components
---
-TODO: write
+This class is a base class and **not instantiated directly**. Trainable pipeline
+components like the [`EntityRecognizer`](/api/entityrecognizer) or
+[`TextCategorizer`](/api/textcategorizer) inherit from it and it defines the
+interface that components should follow to function as trainable components in a
+spaCy pipeline.
+
+```python
+https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/pipe.pyx
+```
+
+## Pipe.\_\_init\_\_ {#init tag="method"}
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline import Pipe
+> from spacy.language import Language
+>
+> class CustomPipe(Pipe):
+> ...
+>
+> @Language.factory("your_custom_pipe", default_config={"model": MODEL})
+> def make_custom_pipe(nlp, name, model):
+> return CustomPipe(nlp.vocab, model, name)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#create_pipe).
+
+
+
+This method needs to be overwritten with your own custom `__init__` method.
+
+
+
+| Name | Type | Description |
+| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+| `**cfg` | | Additional config parameters and settings. |
+
+## Pipe.\_\_call\_\_ {#call tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/pipe#call) and [`pipe`](/api/pipe#pipe) delegate to the
+[`predict`](/api/pipe#predict) and
+[`set_annotations`](/api/pipe#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> pipe = nlp.add_pipe("your_custom_pipe")
+> # This usually happens under the hood
+> processed = pipe(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | ------------------------ |
+| `doc` | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document. |
+
+## Pipe.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
+[`pipe`](/api/pipe#pipe) delegate to the [`predict`](/api/pipe#predict) and
+[`set_annotations`](/api/pipe#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> for doc in pipe.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | ----------------------------------------------------- |
+| `stream` | `Iterable[Doc]` | A stream of documents. |
+| _keyword-only_ | | |
+| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
+| **YIELDS** | `Doc` | The processed documents in order. |
+
+## Pipe.begin_training {#begin_training tag="method"}
+
+Initialize the pipe for training, using data examples if available. Return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = pipe.begin_training(pipeline=nlp.pipeline)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
+| _keyword-only_ | | |
+| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/pipe#create_optimizer) if not set. |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Pipe.predict {#predict tag="method"}
+
+Apply the pipeline's model to a batch of docs, without modifying them.
+
+
+
+This method needs to be overwritten with your own custom `predict` method.
+
+
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> scores = pipe.predict([doc1, doc2])
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------- | ----------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to predict. |
+| **RETURNS** | - | The model's prediction for each document. |
+
+## Pipe.set_annotations {#set_annotations tag="method"}
+
+Modify a batch of documents, using pre-computed scores.
+
+
+
+This method needs to be overwritten with your own custom `set_annotations`
+method.
+
+
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> scores = pipe.predict(docs)
+> pipe.set_annotations(docs, scores)
+> ```
+
+| Name | Type | Description |
+| -------- | --------------- | ---------------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to modify. |
+| `scores` | - | The scores to set, produced by `Pipe.predict`. |
+
+## Pipe.update {#update tag="method"}
+
+Learn from a batch of documents and gold-standard information, updating the
+pipe's model. Delegates to [`predict`](/api/pipe#predict).
+
+
+
+This method needs to be overwritten with your own custom `update` method.
+
+
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.begin_training()
+> losses = pipe.update(examples, sgd=optimizer)
+> ```
+
+| Name | Type | Description |
+| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/pipe#set_annotations). |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+
+## Pipe.rehearse {#rehearse tag="method,experimental"}
+
+Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
+current model to make predictions similar to an initial model, to try to address
+the "catastrophic forgetting" problem. This feature is experimental.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = nlp.begin_training()
+> losses = pipe.rehearse(examples, sgd=optimizer)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+
+## Pipe.get_loss {#get_loss tag="method"}
+
+Find the loss and gradient of loss for the batch of documents and their
+predicted scores.
+
+> #### Example
+>
+> ```python
+> ner = nlp.add_pipe("ner")
+> scores = ner.predict([eg.predicted for eg in examples])
+> loss, d_loss = ner.get_loss(examples, scores)
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------------- | --------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The batch of examples. |
+| `scores` | | Scores representing the model's predictions. |
+| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
+
+## Pipe.score {#score tag="method" new="3"}
+
+Score a batch of examples.
+
+> #### Example
+>
+> ```python
+> scores = pipe.score(examples)
+> ```
+
+| Name | Type | Description |
+| ----------- | ------------------- | --------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | The examples to score. |
+| **RETURNS** | `Dict[str, Any]` | The scores, e.g. produced by the [`Scorer`](/api/scorer). |
+
+## Pipe.create_optimizer {#create_optimizer tag="method"}
+
+Create an optimizer for the pipeline component. Defaults to
+[`Adam`](https://thinc.ai/docs/api-optimizers#adam) with default settings.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> optimizer = pipe.create_optimizer()
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------------------------------------------- | -------------- |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Pipe.add_label {#add_label tag="method"}
+
+Add a new label to the pipe. It's possible to extend pretrained models with new
+labels, but care should be taken to avoid the "catastrophic forgetting" problem.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> pipe.add_label("MY_LABEL")
+> ```
+
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `label` | str | The label to add. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
+
+## Pipe.use_params {#use_params tag="method, contextmanager"}
+
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> with pipe.use_params(optimizer.averages):
+> pipe.to_disk("/best_model")
+> ```
+
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
+
+## Pipe.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> pipe.to_disk("/path/to/pipe")
+> ```
+
+| Name | Type | Description |
+| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+
+## Pipe.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> pipe.from_disk("/path/to/pipe")
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Pipe` | The modified pipe. |
+
+## Pipe.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> pipe = nlp.add_pipe("your_custom_pipe")
+> pipe_bytes = pipe.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Type | Description |
+| ----------- | --------------- | ------------------------------------------------------------------------- |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | bytes | The serialized form of the pipe. |
+
+## Pipe.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> pipe_bytes = pipe.to_bytes()
+> pipe = nlp.add_pipe("your_custom_pipe")
+> pipe.from_bytes(pipe_bytes)
+> ```
+
+| Name | Type | Description |
+| ------------ | --------------- | ------------------------------------------------------------------------- |
+| `bytes_data` | bytes | The data to load from. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Pipe` | The pipe. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = pipe.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |
diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md
index 5927fbaac..2c0944b1f 100644
--- a/website/docs/api/sentencerecognizer.md
+++ b/website/docs/api/sentencerecognizer.md
@@ -265,19 +265,20 @@ Create an optimizer for the pipeline component.
## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
> ```python
> senter = nlp.add_pipe("senter")
-> with senter.use_params():
+> with senter.use_params(optimizer.averages):
> senter.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## SentenceRecognizer.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index 9de4667bf..351492aa9 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -263,19 +263,20 @@ Create an optimizer for the pipeline component.
## Tagger.use_params {#use_params tag="method, contextmanager"}
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
> #### Example
>
> ```python
> tagger = nlp.add_pipe("tagger")
-> with tagger.use_params():
+> with tagger.use_params(optimizer.averages):
> tagger.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## Tagger.add_label {#add_label tag="method"}
@@ -289,10 +290,11 @@ Add a new label to the pipe.
> tagger.add_label("MY_LABEL", {POS: "NOUN"})
> ```
-| Name | Type | Description |
-| -------- | ---------------- | --------------------------------------------------------------- |
-| `label` | str | The label to add. |
-| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
+| Name | Type | Description |
+| ----------- | ---------------- | --------------------------------------------------------------- |
+| `label` | str | The label to add. |
+| `values` | `Dict[int, str]` | Optional values to map to the label, e.g. a tag map dictionary. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## Tagger.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 5f981de82..c4327dca7 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -262,7 +262,8 @@ Score a batch of examples.
| Name | Type | Description |
| ---------------- | ------------------- | ---------------------------------------------------------------------- |
-| `examples` | `Iterable[Example]` | The examples to score. | _keyword-only_ | | |
+| `examples` | `Iterable[Example]` | The examples to score. |
+| _keyword-only_ | | |
| `positive_label` | str | Optional positive label. |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). |
@@ -292,9 +293,10 @@ Add a new label to the pipe.
> textcat.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ---- | ----------------- |
-| `label` | str | The label to add. |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `label` | str | The label to add. |
+| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
@@ -304,13 +306,13 @@ Modify the pipe's model, to use the given parameter values.
>
> ```python
> textcat = nlp.add_pipe("textcat")
-> with textcat.use_params():
+> with textcat.use_params(optimizer.averages):
> textcat.to_disk("/best_model")
> ```
-| Name | Type | Description |
-| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
-| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
## TextCategorizer.to_disk {#to_disk tag="method"}
diff --git a/website/docs/api/tok2vec.md b/website/docs/api/tok2vec.md
index 449712d49..29f91afe6 100644
--- a/website/docs/api/tok2vec.md
+++ b/website/docs/api/tok2vec.md
@@ -8,4 +8,295 @@ api_string_name: tok2vec
api_trainable: true
---
-TODO:
+
+
+## Config and implementation {#config}
+
+The default config is defined by the pipeline component factory and describes
+how the component should be configured. You can override its settings via the
+`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
+[`config.cfg` for training](/usage/training#config). See the
+[model architectures](/api/architectures) documentation for details on the
+architectures and their arguments and hyperparameters.
+
+> #### Example
+>
+> ```python
+> from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
+> config = {"model": DEFAULT_TOK2VEC_MODEL}
+> nlp.add_pipe("tok2vec", config=config)
+> ```
+
+| Setting | Type | Description | Default |
+| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
+
+```python
+https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
+```
+
+## Tok2Vec.\_\_init\_\_ {#init tag="method"}
+
+> #### Example
+>
+> ```python
+> # Construction via add_pipe with default model
+> tok2vec = nlp.add_pipe("tok2vec")
+>
+> # Construction via add_pipe with custom model
+> config = {"model": {"@architectures": "my_tok2vec"}}
+> parser = nlp.add_pipe("tok2vec", config=config)
+>
+> # Construction from class
+> from spacy.pipeline import Tok2Vec
+> tok2vec = Tok2Vec(nlp.vocab, model)
+> ```
+
+Create a new pipeline instance. In your application, you would normally use a
+shortcut for this and instantiate the component using its string name and
+[`nlp.add_pipe`](/api/language#create_pipe).
+
+| Name | Type | Description |
+| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The shared vocabulary. |
+| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
+| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
+
+## Tok2Vec.\_\_call\_\_ {#call tag="method"}
+
+Apply the pipe to one document. The document is modified in place, and returned.
+This usually happens under the hood when the `nlp` object is called on a text
+and all pipeline components are applied to the `Doc` in order. Both
+[`__call__`](/api/tok2vec#call) and [`pipe`](/api/tok2vec#pipe) delegate to the
+[`predict`](/api/tok2vec#predict) and
+[`set_annotations`](/api/tok2vec#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> doc = nlp("This is a sentence.")
+> tok2vec = nlp.add_pipe("tok2vec")
+> # This usually happens under the hood
+> processed = tok2vec(doc)
+> ```
+
+| Name | Type | Description |
+| ----------- | ----- | ------------------------ |
+| `doc` | `Doc` | The document to process. |
+| **RETURNS** | `Doc` | The processed document. |
+
+## Tok2Vec.pipe {#pipe tag="method"}
+
+Apply the pipe to a stream of documents. This usually happens under the hood
+when the `nlp` object is called on a text and all pipeline components are
+applied to the `Doc` in order. Both [`__call__`](/api/tok2vec#call) and
+[`pipe`](/api/tok2vec#pipe) delegate to the [`predict`](/api/tok2vec#predict)
+and [`set_annotations`](/api/tok2vec#set_annotations) methods.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> for doc in tok2vec.pipe(docs, batch_size=50):
+> pass
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------- | ----------------------------------------------------- |
+| `stream` | `Iterable[Doc]` | A stream of documents. |
+| _keyword-only_ | | |
+| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
+| **YIELDS** | `Doc` | The processed documents in order. |
+
+## Tok2Vec.begin_training {#begin_training tag="method"}
+
+Initialize the pipe for training, using data examples if available. Return an
+[`Optimizer`](https://thinc.ai/docs/api-optimizers) object.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> optimizer = tok2vec.begin_training(pipeline=nlp.pipeline)
+> ```
+
+| Name | Type | Description |
+| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
+| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
+| _keyword-only_ | | |
+| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tok2vec#create_optimizer) if not set. |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Tok2Vec.predict {#predict tag="method"}
+
+Apply the pipeline's model to a batch of docs, without modifying them.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> scores = tok2vec.predict([doc1, doc2])
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------- | ----------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to predict. |
+| **RETURNS** | - | The model's prediction for each document. |
+
+## Tok2Vec.set_annotations {#set_annotations tag="method"}
+
+Modify a batch of documents, using pre-computed scores.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> scores = tok2vec.predict(docs)
+> tok2vec.set_annotations(docs, scores)
+> ```
+
+| Name | Type | Description |
+| -------- | --------------- | ------------------------------------------------- |
+| `docs` | `Iterable[Doc]` | The documents to modify. |
+| `scores` | - | The scores to set, produced by `Tok2Vec.predict`. |
+
+## Tok2Vec.update {#update tag="method"}
+
+Learn from a batch of documents and gold-standard information, updating the
+pipe's model. Delegates to [`predict`](/api/tok2vec#predict).
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> optimizer = nlp.begin_training()
+> losses = tok2vec.update(examples, sgd=optimizer)
+> ```
+
+| Name | Type | Description |
+| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
+| _keyword-only_ | | |
+| `drop` | float | The dropout rate. |
+| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tok2vec#set_annotations). |
+| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
+| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
+
+## Tok2Vec.create_optimizer {#create_optimizer tag="method"}
+
+Create an optimizer for the pipeline component.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> optimizer = tok2vec.create_optimizer()
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------------------------------------------- | -------------- |
+| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
+
+## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
+
+Modify the pipe's model, to use the given parameter values. At the end of the
+context, the original parameters are restored.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> with tok2vec.use_params(optimizer.averages):
+> tok2vec.to_disk("/best_model")
+> ```
+
+| Name | Type | Description |
+| -------- | ---- | ----------------------------------------- |
+| `params` | dict | The parameter values to use in the model. |
+
+## Tok2Vec.to_disk {#to_disk tag="method"}
+
+Serialize the pipe to disk.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> tok2vec.to_disk("/path/to/tok2vec")
+> ```
+
+| Name | Type | Description |
+| --------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+
+## Tok2Vec.from_disk {#from_disk tag="method"}
+
+Load the pipe from disk. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> tok2vec.from_disk("/path/to/tok2vec")
+> ```
+
+| Name | Type | Description |
+| ----------- | --------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. |
+
+## Tok2Vec.to_bytes {#to_bytes tag="method"}
+
+> #### Example
+>
+> ```python
+> tok2vec = nlp.add_pipe("tok2vec")
+> tok2vec_bytes = tok2vec.to_bytes()
+> ```
+
+Serialize the pipe to a bytestring.
+
+| Name | Type | Description |
+| ----------- | --------------- | ------------------------------------------------------------------------- |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. |
+
+## Tok2Vec.from_bytes {#from_bytes tag="method"}
+
+Load the pipe from a bytestring. Modifies the object in place and returns it.
+
+> #### Example
+>
+> ```python
+> tok2vec_bytes = tok2vec.to_bytes()
+> tok2vec = nlp.add_pipe("tok2vec")
+> tok2vec.from_bytes(tok2vec_bytes)
+> ```
+
+| Name | Type | Description |
+| ------------ | --------------- | ------------------------------------------------------------------------- |
+| `bytes_data` | bytes | The data to load from. |
+| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. |
+
+## Serialization fields {#serialization-fields}
+
+During serialization, spaCy will export several data fields used to restore
+different aspects of the object. If needed, you can exclude them from
+serialization by passing in the string names via the `exclude` argument.
+
+> #### Example
+>
+> ```python
+> data = tok2vec.to_disk("/path", exclude=["vocab"])
+> ```
+
+| Name | Description |
+| ------- | -------------------------------------------------------------- |
+| `vocab` | The shared [`Vocab`](/api/vocab). |
+| `cfg` | The config file. You usually don't want to exclude this. |
+| `model` | The binary model data. You usually don't want to exclude this. |