Update docs and types

This commit is contained in:
Ines Montani 2020-07-31 17:02:54 +02:00
parent dab31426e1
commit e9e8fa2466
22 changed files with 232 additions and 137 deletions

View File

@ -1,6 +1,7 @@
from typing import List from typing import List
from thinc.api import Model from thinc.api import Model
from thinc.types import Floats2d from thinc.types import Floats2d
from ..tokens import Doc from ..tokens import Doc
@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
) )
def init(model, X=None, Y=None): def init(model: Model, X=None, Y=None):
vectors_table = model.ops.alloc3f( vectors_table = model.ops.alloc3f(
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM") model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
) )
model.set_param("E", vectors_table) model.set_param("E", vectors_table)
def forward(model, docs, is_train): def forward(model: Model, docs: List[Doc], is_train: bool):
if docs is None: if docs is None:
return [] return []
ids = [] ids = []

View File

@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
) )
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None): def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
if X is not None and Y is not None: if X is not None and Y is not None:
if X.data.shape != Y.data.shape: if X.data.shape != Y.data.shape:
# TODO: Fix error # TODO: Fix error

View File

@ -4,14 +4,14 @@ from thinc.api import Model
from ..attrs import LOWER from ..attrs import LOWER
def extract_ngrams(ngram_size, attr=LOWER) -> Model: def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
model = Model("extract_ngrams", forward) model = Model("extract_ngrams", forward)
model.attrs["ngram_size"] = ngram_size model.attrs["ngram_size"] = ngram_size
model.attrs["attr"] = attr model.attrs["attr"] = attr
return model return model
def forward(model, docs, is_train: bool): def forward(model: Model, docs, is_train: bool):
batch_keys = [] batch_keys = []
batch_vals = [] batch_vals = []
for doc in docs: for doc in docs:

View File

@ -1,5 +1,4 @@
from pathlib import Path from typing import Optional
from thinc.api import chain, clone, list2ragged, reduce_mean, residual from thinc.api import chain, clone, list2ragged, reduce_mean, residual
from thinc.api import Model, Maxout, Linear from thinc.api import Model, Maxout, Linear
@ -9,7 +8,7 @@ from ...vocab import Vocab
@registry.architectures.register("spacy.EntityLinker.v1") @registry.architectures.register("spacy.EntityLinker.v1")
def build_nel_encoder(tok2vec, nO=None): def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
with Model.define_operators({">>": chain, "**": clone}): with Model.define_operators({">>": chain, "**": clone}):
token_width = tok2vec.get_dim("nO") token_width = tok2vec.get_dim("nO")
output_layer = Linear(nO=nO, nI=token_width) output_layer = Linear(nO=nO, nI=token_width)
@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):
@registry.assets.register("spacy.KBFromFile.v1") @registry.assets.register("spacy.KBFromFile.v1")
def load_kb(vocab_path, kb_path) -> KnowledgeBase: def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
vocab = Vocab().from_disk(vocab_path) vocab = Vocab().from_disk(vocab_path)
kb = KnowledgeBase(vocab=vocab) kb = KnowledgeBase(vocab=vocab)
kb.load_bulk(kb_path) kb.load_bulk(kb_path)

View File

@ -1,10 +1,20 @@
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
import numpy import numpy
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array from thinc.api import MultiSoftmax, list2array
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from ...vocab import Vocab # noqa: F401
from ...tokens import Doc # noqa: F401
def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
def build_multi_task_model(
tok2vec: Model,
maxout_pieces: int,
token_vector_width: int,
nO: Optional[int] = None,
) -> Model:
softmax = Softmax(nO=nO, nI=token_vector_width * 2) softmax = Softmax(nO=nO, nI=token_vector_width * 2)
model = chain( model = chain(
tok2vec, tok2vec,
@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
return model return model
def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None): def build_cloze_multi_task_model(
vocab: "Vocab",
tok2vec: Model,
maxout_pieces: int,
hidden_size: int,
nO: Optional[int] = None,
) -> Model:
# nO = vocab.vectors.data.shape[1] # nO = vocab.vectors.data.shape[1]
output_layer = chain( output_layer = chain(
list2array(), list2array(),
@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=
def build_cloze_characters_multi_task_model( def build_cloze_characters_multi_task_model(
vocab, tok2vec, maxout_pieces, hidden_size, nr_char vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
): ) -> Model:
output_layer = chain( output_layer = chain(
list2array(), list2array(),
Maxout(hidden_size, nP=maxout_pieces), Maxout(hidden_size, nP=maxout_pieces),
LayerNorm(nI=hidden_size), LayerNorm(nI=hidden_size),
MultiSoftmax([256] * nr_char, nI=hidden_size), MultiSoftmax([256] * nr_char, nI=hidden_size),
) )
model = build_masked_language_model(vocab, chain(tok2vec, output_layer)) model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)
model.set_ref("output_layer", output_layer) model.set_ref("output_layer", output_layer)
return model return model
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15): def build_masked_language_model(
vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
) -> Model:
"""Convert a model into a BERT-style masked language model""" """Convert a model into a BERT-style masked language model"""
random_words = _RandomWords(vocab) random_words = _RandomWords(vocab)
def mlm_forward(model, docs, is_train): def mlm_forward(model, docs, is_train):
@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
return output, mlm_backward return output, mlm_backward
def mlm_initialize(model, X=None, Y=None): def mlm_initialize(model: Model, X=None, Y=None):
wrapped = model.layers[0] wrapped = model.layers[0]
wrapped.initialize(X=X, Y=Y) wrapped.initialize(X=X, Y=Y)
for dim in wrapped.dim_names: for dim in wrapped.dim_names:
@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
dims={dim: None for dim in wrapped_model.dim_names}, dims={dim: None for dim in wrapped_model.dim_names},
) )
mlm_model.set_ref("wrapped", wrapped_model) mlm_model.set_ref("wrapped", wrapped_model)
return mlm_model return mlm_model
class _RandomWords: class _RandomWords:
def __init__(self, vocab): def __init__(self, vocab: "Vocab") -> None:
self.words = [lex.text for lex in vocab if lex.prob != 0.0] self.words = [lex.text for lex in vocab if lex.prob != 0.0]
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0] self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
self.words = self.words[:10000] self.words = self.words[:10000]
@ -104,7 +119,7 @@ class _RandomWords:
self.probs /= self.probs.sum() self.probs /= self.probs.sum()
self._cache = [] self._cache = []
def next(self): def next(self) -> str:
if not self._cache: if not self._cache:
self._cache.extend( self._cache.extend(
numpy.random.choice(len(self.words), 10000, p=self.probs) numpy.random.choice(len(self.words), 10000, p=self.probs)
@ -113,9 +128,11 @@ class _RandomWords:
return self.words[index] return self.words[index]
def _apply_mask(docs, random_words, mask_prob=0.15): def _apply_mask(
docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
) -> Tuple[numpy.ndarray, List["Doc"]]:
# This needs to be here to avoid circular imports # This needs to be here to avoid circular imports
from ...tokens import Doc from ...tokens import Doc # noqa: F811
N = sum(len(doc) for doc in docs) N = sum(len(doc) for doc in docs)
mask = numpy.random.uniform(0.0, 1.0, (N,)) mask = numpy.random.uniform(0.0, 1.0, (N,))
@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
return mask, masked_docs return mask, masked_docs
def _replace_word(word, random_words, mask="[MASK]"): def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
roll = numpy.random.random() roll = numpy.random.random()
if roll < 0.8: if roll < 0.8:
return mask return mask

View File

@ -1,6 +1,5 @@
from pydantic import StrictInt from typing import Optional
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.api import LayerNorm, Maxout, Mish
from ...util import registry from ...util import registry
from .._precomputable_affine import PrecomputableAffine from .._precomputable_affine import PrecomputableAffine
@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
@registry.architectures.register("spacy.TransitionBasedParser.v1") @registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model( def build_tb_parser_model(
tok2vec: Model, tok2vec: Model,
nr_feature_tokens: StrictInt, nr_feature_tokens: int,
hidden_width: StrictInt, hidden_width: int,
maxout_pieces: StrictInt, maxout_pieces: int,
use_upper=True, use_upper: bool = True,
nO=None, nO: Optional[int] = None,
): ) -> Model:
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),) tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
tok2vec.set_dim("nO", hidden_width) tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine( lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO, nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens, nF=nr_feature_tokens,

View File

@ -26,7 +26,6 @@ def BiluoTagger(
with_array(softmax_activation()), with_array(softmax_activation()),
padded2list(), padded2list(),
) )
return Model( return Model(
"biluo-tagger", "biluo-tagger",
forward, forward,
@ -52,7 +51,6 @@ def IOBTagger(
with_array(softmax_activation()), with_array(softmax_activation()),
padded2list(), padded2list(),
) )
return Model( return Model(
"iob-tagger", "iob-tagger",
forward, forward,

View File

@ -1,10 +1,11 @@
from typing import Optional
from thinc.api import zero_init, with_array, Softmax, chain, Model from thinc.api import zero_init, with_array, Softmax, chain, Model
from ...util import registry from ...util import registry
@registry.architectures.register("spacy.Tagger.v1") @registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model: def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?! # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
output_layer = Softmax(nO, t2v_width, init_W=zero_init) output_layer = Softmax(nO, t2v_width, init_W=zero_init)

View File

@ -2,10 +2,9 @@ from typing import Optional
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window, FeatureExtractor from thinc.api import Relu, residual, expand_window, FeatureExtractor
from ... import util
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...util import registry from ...util import registry
from ..extract_ngrams import extract_ngrams from ..extract_ngrams import extract_ngrams
@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(
@registry.architectures.register("spacy.TextCatBOW.v1") @registry.architectures.register("spacy.TextCatBOW.v1")
def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None): def build_bow_text_classifier(
exclusive_classes: bool,
ngram_size: int,
no_output_layer: bool,
nO: Optional[int] = None,
) -> Model:
with Model.define_operators({">>": chain}): with Model.define_operators({">>": chain}):
sparse_linear = SparseLinear(nO) sparse_linear = SparseLinear(nO)
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
@registry.architectures.register("spacy.TextCatEnsemble.v1") @registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier( def build_text_classifier(
width, width: int,
embed_size, embed_size: int,
pretrained_vectors, pretrained_vectors: Optional[bool],
exclusive_classes, exclusive_classes: bool,
ngram_size, ngram_size: int,
window_size, window_size: int,
conv_depth, conv_depth: int,
dropout, dropout: Optional[float],
nO=None, nO: Optional[int] = None,
): ) -> Model:
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed( lower = HashEmbed(
@ -91,7 +95,6 @@ def build_text_classifier(
dropout=dropout, dropout=dropout,
seed=13, seed=13,
) )
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array( trained_vectors = FeatureExtractor(cols) >> with_array(
uniqued( uniqued(
@ -100,7 +103,6 @@ def build_text_classifier(
column=cols.index(ORTH), column=cols.index(ORTH),
) )
) )
if pretrained_vectors: if pretrained_vectors:
static_vectors = StaticVectors(width) static_vectors = StaticVectors(width)
vector_layer = trained_vectors | static_vectors vector_layer = trained_vectors | static_vectors
@ -152,7 +154,12 @@ def build_text_classifier(
@registry.architectures.register("spacy.TextCatLowData.v1") @registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): def build_text_classifier_lowdata(
width: int,
pretrained_vectors: Optional[bool],
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims" # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
with Model.define_operators({">>": chain, "**": clone}): with Model.define_operators({">>": chain, "**": clone}):
model = ( model = (

View File

@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
from thinc.types import Floats2d from thinc.types import Floats2d
from ...tokens import Doc from ...tokens import Doc
from ... import util
from ...util import registry from ...util import registry
from ...ml import _character_embed from ...ml import _character_embed
from ..staticvectors import StaticVectors from ..staticvectors import StaticVectors
from ...pipeline.tok2vec import Tok2VecListener from ...pipeline.tok2vec import Tok2VecListener
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
@registry.architectures.register("spacy.Tok2VecListener.v1") @registry.architectures.register("spacy.Tok2VecListener.v1")
def tok2vec_listener_v1(width, upstream="*"): def tok2vec_listener_v1(width: int, upstream: str = "*"):
tok2vec = Tok2VecListener(upstream_name=upstream, width=width) tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
return tok2vec return tok2vec
@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
width=width, width=width,
depth=depth, depth=depth,
window_size=window_size, window_size=window_size,
maxout_pieces=maxout_pieces maxout_pieces=maxout_pieces,
) ),
) )
@registry.architectures.register("spacy.Tok2Vec.v1") @registry.architectures.register("spacy.Tok2Vec.v1")
def build_Tok2Vec_model( def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]], embed: Model[List[Doc], List[Floats2d]],
@ -68,7 +68,6 @@ def MultiHashEmbed(
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
): ):
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
seed = 7 seed = 7
def make_hash_embed(feature): def make_hash_embed(feature):
@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
chain( chain(
FeatureExtractor([NORM]), FeatureExtractor([NORM]),
list2ragged(), list2ragged(),
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)) with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
) ),
), ),
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)), with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
ragged2list() ragged2list(),
) )
return model return model
@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
def MishWindowEncoder(width, window_size, depth): def MishWindowEncoder(width, window_size, depth):
cnn = chain( cnn = chain(
expand_window(window_size=window_size), expand_window(window_size=window_size),
Mish( Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
nO=width,
nI=width * ((window_size * 2) + 1),
dropout=0.0,
normalize=True
),
) )
model = clone(residual(cnn), depth) model = clone(residual(cnn), depth)
model.set_dim("nO", width) model.set_dim("nO", width)

View File

@ -7,7 +7,7 @@ import importlib.util
import re import re
from pathlib import Path from pathlib import Path
import thinc import thinc
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
import functools import functools
import itertools import itertools
import numpy.random import numpy.random
@ -24,8 +24,6 @@ import tempfile
import shutil import shutil
import shlex import shlex
import inspect import inspect
from thinc.types import Unserializable
try: try:
import cupy.random import cupy.random

View File

@ -6,6 +6,7 @@ menu:
- ['Tok2Vec', 'tok2vec'] - ['Tok2Vec', 'tok2vec']
- ['Transformers', 'transformers'] - ['Transformers', 'transformers']
- ['Parser & NER', 'parser'] - ['Parser & NER', 'parser']
- ['Tagging', 'tagger']
- ['Text Classification', 'textcat'] - ['Text Classification', 'textcat']
- ['Entity Linking', 'entitylinker'] - ['Entity Linking', 'entitylinker']
--- ---
@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN} ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.HashEmbedCNN.v1"
> # TODO: ...
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| -------------------- | ----- | ----------- |
| `width` | int | |
| `depth` | int | |
| `embed_size` | int | |
| `window_size` | int | |
| `maxout_pieces` | int | |
| `subword_features` | bool | |
| `dropout` | float | |
| `pretrained_vectors` | bool | |
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN} ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM} ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
@ -99,6 +124,28 @@ architectures into your training config.
| `use_upper` | bool | | | `use_upper` | bool | |
| `nO` | int | | | `nO` | int | |
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v1 {#Tagger}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.Tagger.v1"
> nO = null
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"} ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble} ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
@ -112,3 +159,21 @@ architectures into your training config.
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"} ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
### spacy.EntityLinker.v1 {#EntityLinker} ### spacy.EntityLinker.v1 {#EntityLinker}
<!-- TODO: intro -->
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.EntityLinker.v1"
> nO = null
>
> [model.tok2vec]
> # ...
> ```
| Name | Type | Description |
| --------- | ------------------------------------------ | ----------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
| `nO` | int | |

View File

@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("parser", config=config) > nlp.add_pipe("parser", config=config)
> ``` > ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default | | Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | <!-- TODO: --> | `None` | | `moves` | list | | `None` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python ```python
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description | | Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | <!-- TODO: --> | | `moves` | list | |
| _keyword-only_ | | | | _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | <!-- TODO: --> | | `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | <!-- TODO: --> | | `multitasks` | `Iterable` | |
| `learn_tokens` | bool | <!-- TODO: --> | | `learn_tokens` | bool | |
| `min_action_freq` | int | <!-- TODO: --> | | `min_action_freq` | int | |
## DependencyParser.\_\_call\_\_ {#call tag="method"} ## DependencyParser.\_\_call\_\_ {#call tag="method"}

View File

@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config) > nlp.add_pipe("entity_linker", config=config)
> ``` > ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default | | Setting | Type | Description | Default |
| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | | ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` | | `kb` | `KnowledgeBase` | | `None` |
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` | | `labels_discard` | `Iterable[str]` | | `[]` |
| `incl_prior` | bool | <!-- TODO: --> | `True` | | `incl_prior` | bool | | `True` |
| `incl_context` | bool | <!-- TODO: --> | `True` | | `incl_context` | bool | | `True` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
```python ```python
@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description | | Name | Type | Description |
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- | | ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `kb` | `KnowledgeBase` | <!-- TODO: --> | | `kb` | `KnowledgeBase` | |
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | | `labels_discard` | `Iterable[str]` | |
| `incl_prior` | bool | <!-- TODO: --> | | `incl_prior` | bool | |
| `incl_context` | bool | <!-- TODO: --> | | `incl_context` | bool | |
## EntityLinker.\_\_call\_\_ {#call tag="method"} ## EntityLinker.\_\_call\_\_ {#call tag="method"}

View File

@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("ner", config=config) > nlp.add_pipe("ner", config=config)
> ``` > ```
<!-- TODO: finish API docs -->
| Setting | Type | Description | Default | | Setting | Type | Description | Default |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- | | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
| `moves` | list | <!-- TODO: --> | `None` | | `moves` | list | | `None` |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
```python ```python
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description | | Name | Type | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | `Vocab` | The shared vocabulary. |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| `moves` | list | <!-- TODO: --> | | `moves` | list | |
| _keyword-only_ | | | | _keyword-only_ | | |
| `update_with_oracle_cut_size` | int | <!-- TODO: --> | | `update_with_oracle_cut_size` | int | |
| `multitasks` | `Iterable` | <!-- TODO: --> | | `multitasks` | `Iterable` | |
| `learn_tokens` | bool | <!-- TODO: --> | | `learn_tokens` | bool | |
| `min_action_freq` | int | <!-- TODO: --> | | `min_action_freq` | int | |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"} ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}

View File

@ -8,9 +8,8 @@ new: 3.0
An `Example` holds the information for one training instance. It stores two An `Example` holds the information for one training instance. It stores two
`Doc` objects: one for holding the gold-standard reference data, and one for `Doc` objects: one for holding the gold-standard reference data, and one for
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? --> holding the predictions of the pipeline. An `Alignment` object stores the
object stores the alignment between these two documents, as they can differ in alignment between these two documents, as they can differ in tokenization.
tokenization.
## Example.\_\_init\_\_ {#init tag="method"} ## Example.\_\_init\_\_ {#init tag="method"}

View File

@ -98,9 +98,9 @@ decorator. For more details and examples, see the
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | The name of the component factory. | | `name` | str | The name of the component factory. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@ -146,9 +146,9 @@ examples, see the
| `name` | str | The name of the component factory. | | `name` | str | The name of the component factory. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
@ -833,8 +833,8 @@ instance and factory instance.
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | str | The name of the registered component factory. | | `factory` | str | The name of the registered component factory. |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> | | `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something -->  | | `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis.  |
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something -->  | | `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis.  |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. | | `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |

View File

@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
<!-- TODO: finish API docs -->
| Name | Type | Description | | Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------- | | -------------- | ------- | ------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | | | _keyword-only_ | | |
| `labels_morph` | dict | <!-- TODO: --> | | `labels_morph` | dict | |
| `labels_pos` | dict | <!-- TODO: --> | | `labels_pos` | dict | |
## Morphologizer.\_\_call\_\_ {#call tag="method"} ## Morphologizer.\_\_call\_\_ {#call tag="method"}

View File

@ -290,6 +290,8 @@ factories.
> return Model("custom", forward, dims={"nO": nO}) > return Model("custom", forward, dims={"nO": nO})
> ``` > ```
<!-- TODO: finish table -->
| Registry name | Description | | Registry name | Description |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
@ -297,7 +299,7 @@ factories.
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. | | `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
| `assets` | <!-- TODO: what is this used for again?--> | | `assets` | |
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |

View File

@ -347,41 +347,43 @@ serialization by passing in the string names via the `exclude` argument.
Transformer tokens and outputs for one `Doc` object. Transformer tokens and outputs for one `Doc` object.
<!-- TODO: finish API docs, also mention "width" is property -->
| Name | Type | Description | | Name | Type | Description |
| --------- | -------------------------------------------------- | ----------------------------------------- | | --------- | -------------------------------------------------- | ----------- |
| `tokens` | `Dict` | <!-- TODO: --> | | `tokens` | `Dict` | |
| `tensors` | `List[FloatsXd]` | <!-- TODO: --> | | `tensors` | `List[FloatsXd]` | |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> | | `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
| `width` | int | <!-- TODO: also mention it's property --> | | `width` | int | |
### TransformerData.empty {#transformerdata-empty tag="classmethod"} ### TransformerData.empty {#transformerdata-empty tag="classmethod"}
<!-- TODO: --> <!-- TODO: finish API docs -->
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------- | -------------- | | ----------- | ----------------- | ----------- |
| **RETURNS** | `TransformerData` | <!-- TODO: --> | | **RETURNS** | `TransformerData` | |
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
<!-- TODO: --> <!-- TODO: write, also mention doc_data is property -->
| Name | Type | Description | | Name | Type | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | | ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- |
| `spans` | `List[List[Span]]` | <!-- TODO: --> | | `spans` | `List[List[Span]]` | |
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | <!-- TODO: --> | | `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
| `tensors` | `List[torch.Tensor]` | <!-- TODO: --> | | `tensors` | `List[torch.Tensor]` | |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> | | `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
| `doc_data` | `List[TransformerData]` | <!-- TODO: also mention it's property --> | | `doc_data` | `List[TransformerData]` | |
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
<!-- TODO: --> <!-- TODO: write -->
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------------- | -------------- | | ----------- | ---------------------- | ----------- |
| `arrays` | `List[List[Floats3d]]` | <!-- TODO: --> | | `arrays` | `List[List[Floats3d]]` | |
| **RETURNS** | `FullTransformerBatch` | <!-- TODO: --> | | **RETURNS** | `FullTransformerBatch` | |
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
@ -389,8 +391,8 @@ Split a `TransformerData` object that represents a batch into a list with one
`TransformerData` per `Doc`. `TransformerData` per `Doc`.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------------- | -------------- | | ----------- | ----------------------- | ----------- |
| **RETURNS** | `List[TransformerData]` | <!-- TODO: --> | | **RETURNS** | `List[TransformerData]` | |
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.
The following built-in functions are available: The following built-in functions are available:
<!-- TODO: finish API docs -->
| Name | Description | | Name | Description |
| ------------------ | ------------------------------------------------------------------ | | ------------------ | ------------------------------------------------------------------ |
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). | | `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. | | `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
| `strided_spans.v1` | <!-- TODO: --> | | `strided_spans.v1` | |
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

View File

@ -231,10 +231,10 @@ available pipeline components and component functions.
| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. | | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. | | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. | | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | <!-- TODO: --> | | `tok2vec` | [`Tok2Vec`](/api/tok2vec) | |
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. | | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
<!-- TODO: update with more components --> <!-- TODO: finish and update with more components -->
<!-- TODO: explain default config and factories --> <!-- TODO: explain default config and factories -->

View File

@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'
### Serializing the pipeline {#pipeline} ### Serializing the pipeline {#pipeline}
<!-- TODO: update this -->
When serializing the pipeline, keep in mind that this will only save out the When serializing the pipeline, keep in mind that this will only save out the
**binary data for the individual components** to allow spaCy to restore them — **binary data for the individual components** to allow spaCy to restore them —
not the entire objects. This is a good thing, because it makes serialization not the entire objects. This is a good thing, because it makes serialization