Mirror of https://github.com/explosion/spaCy.git

Update docs and types

parent dab31426e1
commit e9e8fa2466
@@ -1,6 +1,7 @@
+from typing import List
 from thinc.api import Model
 from thinc.types import Floats2d

 from ..tokens import Doc

@@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
     )


-def init(model, X=None, Y=None):
+def init(model: Model, X=None, Y=None):
     vectors_table = model.ops.alloc3f(
         model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
     )
     model.set_param("E", vectors_table)


-def forward(model, docs, is_train):
+def forward(model: Model, docs: List[Doc], is_train: bool):
     if docs is None:
         return []
     ids = []
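The typed `init`/`forward` pair above follows Thinc's layer protocol: `forward` returns the output together with a backprop callback, and `init` allocates parameters once the dimensions are known. As a rough, self-contained sketch of the same pattern (a toy embedding layer using Thinc's public `Model` API, not spaCy's actual `CharacterEmbed`):

```python
from typing import List
from thinc.api import Model


def init(model: Model, X=None, Y=None) -> None:
    # Allocate the embedding table once "nV" (rows) and "nM" (width) are
    # known, mirroring the alloc3f/set_param calls in the hunk above.
    table = model.ops.alloc2f(model.get_dim("nV"), model.get_dim("nM"))
    model.set_param("E", table)


def forward(model: Model, ids: List[int], is_train: bool):
    table = model.get_param("E")
    output = table[ids]

    def backprop(d_output):
        return []  # integer inputs carry no gradient

    return output, backprop


embed = Model("toy-embed", forward, init=init, dims={"nV": 100, "nM": 16})
embed.initialize()
vectors, _ = embed([1, 2, 3], is_train=False)
```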
@@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
     )


-def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
+def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
     if X is not None and Y is not None:
         if X.data.shape != Y.data.shape:
             # TODO: Fix error
@@ -4,14 +4,14 @@ from thinc.api import Model
 from ..attrs import LOWER


-def extract_ngrams(ngram_size, attr=LOWER) -> Model:
+def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
     model = Model("extract_ngrams", forward)
     model.attrs["ngram_size"] = ngram_size
     model.attrs["attr"] = attr
     return model


-def forward(model, docs, is_train: bool):
+def forward(model: Model, docs, is_train: bool):
     batch_keys = []
     batch_vals = []
     for doc in docs:
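`extract_ngrams` keeps its configuration in `model.attrs` because the layer has nothing to learn. A hypothetical standalone version of that pattern over plain token lists (the real implementation hashes `Doc` attributes instead):

```python
from typing import List
from thinc.api import Model


def forward(model: Model, texts: List[List[str]], is_train: bool):
    n = model.attrs["ngram_size"]
    ngrams = [
        [tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1)]
        for tokens in texts
    ]

    def backprop(d_ngrams):
        return []  # nothing to learn, so the gradient is empty

    return ngrams, backprop


def extract_ngrams(ngram_size: int) -> Model:
    model = Model("extract_ngrams", forward)
    model.attrs["ngram_size"] = ngram_size
    return model


bigrams, _ = extract_ngrams(2)(["the quick brown fox".split()], is_train=False)
```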
@@ -1,5 +1,4 @@
-from pathlib import Path
-
+from typing import Optional
 from thinc.api import chain, clone, list2ragged, reduce_mean, residual
 from thinc.api import Model, Maxout, Linear

@@ -9,7 +8,7 @@ from ...vocab import Vocab


 @registry.architectures.register("spacy.EntityLinker.v1")
-def build_nel_encoder(tok2vec, nO=None):
+def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
     with Model.define_operators({">>": chain, "**": clone}):
         token_width = tok2vec.get_dim("nO")
         output_layer = Linear(nO=nO, nI=token_width)

@@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):


 @registry.assets.register("spacy.KBFromFile.v1")
-def load_kb(vocab_path, kb_path) -> KnowledgeBase:
+def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
     vocab = Vocab().from_disk(vocab_path)
     kb = KnowledgeBase(vocab=vocab)
     kb.load_bulk(kb_path)
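`build_nel_encoder` relies on `Model.define_operators`, which binds Python operators to Thinc combinators for the duration of the `with` block. A minimal sketch of what `>>` (chain) and `**` (clone) provide:

```python
from thinc.api import Linear, Maxout, Model, chain, clone

with Model.define_operators({">>": chain, "**": clone}):
    # `>>` pipes layers together; `** 2` stacks two copies of the Maxout
    # layer with independent weights, exactly like nested chain()/clone().
    model = Linear(nO=8, nI=4) >> Maxout(nO=8, nP=3) ** 2 >> Linear(nO=2)
```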
@@ -1,10 +1,20 @@
+from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
 import numpy
+
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array

+if TYPE_CHECKING:
+    # This lets us add type hints for mypy etc. without causing circular imports
+    from ...vocab import Vocab  # noqa: F401
+    from ...tokens import Doc  # noqa: F401

-def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
+
+def build_multi_task_model(
+    tok2vec: Model,
+    maxout_pieces: int,
+    token_vector_width: int,
+    nO: Optional[int] = None,
+) -> Model:
     softmax = Softmax(nO=nO, nI=token_vector_width * 2)
     model = chain(
         tok2vec,

@@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
     return model


-def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
+def build_cloze_multi_task_model(
+    vocab: "Vocab",
+    tok2vec: Model,
+    maxout_pieces: int,
+    hidden_size: int,
+    nO: Optional[int] = None,
+) -> Model:
     # nO = vocab.vectors.data.shape[1]
     output_layer = chain(
         list2array(),

@@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=


 def build_cloze_characters_multi_task_model(
-    vocab, tok2vec, maxout_pieces, hidden_size, nr_char
-):
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
+) -> Model:
     output_layer = chain(
         list2array(),
         Maxout(hidden_size, nP=maxout_pieces),
         LayerNorm(nI=hidden_size),
         MultiSoftmax([256] * nr_char, nI=hidden_size),
     )

     model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
     model.set_ref("tok2vec", tok2vec)
     model.set_ref("output_layer", output_layer)
     return model


-def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
+def build_masked_language_model(
+    vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
+) -> Model:
     """Convert a model into a BERT-style masked language model"""

     random_words = _RandomWords(vocab)

     def mlm_forward(model, docs, is_train):

@@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):

         return output, mlm_backward

-    def mlm_initialize(model, X=None, Y=None):
+    def mlm_initialize(model: Model, X=None, Y=None):
         wrapped = model.layers[0]
         wrapped.initialize(X=X, Y=Y)
         for dim in wrapped.dim_names:
@@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
         dims={dim: None for dim in wrapped_model.dim_names},
     )
     mlm_model.set_ref("wrapped", wrapped_model)
-
     return mlm_model


 class _RandomWords:
-    def __init__(self, vocab):
+    def __init__(self, vocab: "Vocab") -> None:
         self.words = [lex.text for lex in vocab if lex.prob != 0.0]
         self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
         self.words = self.words[:10000]

@@ -104,7 +119,7 @@ class _RandomWords:
         self.probs /= self.probs.sum()
         self._cache = []

-    def next(self):
+    def next(self) -> str:
         if not self._cache:
             self._cache.extend(
                 numpy.random.choice(len(self.words), 10000, p=self.probs)
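`_RandomWords.next` pops indices from a cache that is refilled 10,000 draws at a time; one vectorized `numpy.random.choice` call is far cheaper than thousands of scalar draws. The same idea in isolation (a hypothetical class, assuming non-empty word and probability lists):

```python
from typing import List

import numpy


class RandomWords:
    def __init__(self, words: List[str], probs: List[float]) -> None:
        self.words = words
        self.probs = numpy.asarray(probs, dtype="float64")
        self.probs /= self.probs.sum()
        self._cache: List[int] = []

    def next(self) -> str:
        if not self._cache:
            # Refill with one bulk draw instead of sampling per call.
            self._cache.extend(
                numpy.random.choice(len(self.words), 10000, p=self.probs)
            )
        return self.words[self._cache.pop()]


rw = RandomWords(["a", "b", "c"], [0.5, 0.3, 0.2])
print(rw.next())
```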
@@ -113,9 +128,11 @@ class _RandomWords:
         return self.words[index]


-def _apply_mask(docs, random_words, mask_prob=0.15):
+def _apply_mask(
+    docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
+) -> Tuple[numpy.ndarray, List["Doc"]]:
     # This needs to be here to avoid circular imports
-    from ...tokens import Doc
+    from ...tokens import Doc  # noqa: F811

     N = sum(len(doc) for doc in docs)
     mask = numpy.random.uniform(0.0, 1.0, (N,))
@@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
     return mask, masked_docs


-def _replace_word(word, random_words, mask="[MASK]"):
+def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
     roll = numpy.random.random()
     if roll < 0.8:
         return mask
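`_replace_word` implements the BERT corruption recipe: 80% of the tokens selected for masking become the mask symbol. The hunk is truncated after the first branch; the usual remainder (10% random word, 10% unchanged) would look like the sketch below, where the last two branches are inferred from the standard recipe rather than taken from this commit:

```python
import numpy


def replace_word(word: str, random_words, mask: str = "[MASK]") -> str:
    roll = numpy.random.random()
    if roll < 0.8:
        return mask  # 80%: replace with the mask symbol
    elif roll < 0.9:
        return random_words.next()  # 10%: replace with a random word (inferred)
    else:
        return word  # 10%: keep the original word (inferred)
```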
@@ -1,6 +1,5 @@
-from pydantic import StrictInt
-from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
-from thinc.api import LayerNorm, Maxout, Mish
+from typing import Optional
+from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops

 from ...util import registry
 from .._precomputable_affine import PrecomputableAffine

@@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
 @registry.architectures.register("spacy.TransitionBasedParser.v1")
 def build_tb_parser_model(
     tok2vec: Model,
-    nr_feature_tokens: StrictInt,
-    hidden_width: StrictInt,
-    maxout_pieces: StrictInt,
-    use_upper=True,
-    nO=None,
-):
+    nr_feature_tokens: int,
+    hidden_width: int,
+    maxout_pieces: int,
+    use_upper: bool = True,
+    nO: Optional[int] = None,
+) -> Model:
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
     tok2vec.set_dim("nO", hidden_width)

     lower = PrecomputableAffine(
         nO=hidden_width if use_upper else nO,
         nF=nr_feature_tokens,
@@ -26,7 +26,6 @@ def BiluoTagger(
         with_array(softmax_activation()),
         padded2list(),
     )
-
     return Model(
         "biluo-tagger",
         forward,

@@ -52,7 +51,6 @@ def IOBTagger(
         with_array(softmax_activation()),
         padded2list(),
     )
-
     return Model(
         "iob-tagger",
         forward,
@@ -1,10 +1,11 @@
+from typing import Optional
 from thinc.api import zero_init, with_array, Softmax, chain, Model

 from ...util import registry


 @registry.architectures.register("spacy.Tagger.v1")
-def build_tagger_model(tok2vec, nO=None) -> Model:
+def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
     # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
     t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
     output_layer = Softmax(nO, t2v_width, init_W=zero_init)
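The tagger head is a zero-initialized `Softmax` over each token vector, with `nI` left unset when the upstream width is unknown so Thinc can infer it at initialization. A minimal sketch of that head (assuming any `tok2vec` layer that produces a list of per-token arrays; not the full spaCy model):

```python
from typing import Optional
from thinc.api import Model, Softmax, chain, with_array, zero_init


def build_tagger_head(tok2vec: Model, nO: Optional[int] = None) -> Model:
    # nI stays None if the tok2vec width is unset; Thinc fills it in later.
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    output_layer = Softmax(nO, t2v_width, init_W=zero_init)
    return chain(tok2vec, with_array(output_layer))
```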
@@ -2,10 +2,9 @@ from typing import Optional
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
+from thinc.api import HashEmbed, with_array, with_cpu, uniqued
 from thinc.api import Relu, residual, expand_window, FeatureExtractor

 from ... import util
 from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams

@@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(


 @registry.architectures.register("spacy.TextCatBOW.v1")
-def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
+def build_bow_text_classifier(
+    exclusive_classes: bool,
+    ngram_size: int,
+    no_output_layer: bool,
+    nO: Optional[int] = None,
+) -> Model:
     with Model.define_operators({">>": chain}):
         sparse_linear = SparseLinear(nO)
         model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
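The bag-of-words classifier pipes the ngram extractor into `SparseLinear`; the `exclusive_classes` and `no_output_layer` flags then determine the output activation. A sketch of that decision, using Thinc's `softmax_activation` and `Logistic` layers (an illustration of the pattern, not spaCy's exact wiring):

```python
from thinc.api import Logistic, Model, SparseLinear, chain, softmax_activation


def bow_output(nO: int, exclusive_classes: bool, no_output_layer: bool) -> Model:
    sparse_linear = SparseLinear(nO)
    if no_output_layer:
        return sparse_linear
    # Mutually exclusive classes -> softmax; multi-label -> per-class logistic.
    activation = softmax_activation() if exclusive_classes else Logistic()
    return chain(sparse_linear, activation)
```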
@@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO

 @registry.architectures.register("spacy.TextCatEnsemble.v1")
 def build_text_classifier(
-    width,
-    embed_size,
-    pretrained_vectors,
-    exclusive_classes,
-    ngram_size,
-    window_size,
-    conv_depth,
-    dropout,
-    nO=None,
-):
+    width: int,
+    embed_size: int,
+    pretrained_vectors: Optional[bool],
+    exclusive_classes: bool,
+    ngram_size: int,
+    window_size: int,
+    conv_depth: int,
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
     with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
         lower = HashEmbed(

@@ -91,7 +95,6 @@ def build_text_classifier(
             dropout=dropout,
             seed=13,
         )
-
         width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
         trained_vectors = FeatureExtractor(cols) >> with_array(
             uniqued(

@@ -100,7 +103,6 @@ def build_text_classifier(
                 column=cols.index(ORTH),
             )
         )
-
         if pretrained_vectors:
             static_vectors = StaticVectors(width)
             vector_layer = trained_vectors | static_vectors

@@ -152,7 +154,12 @@ def build_text_classifier(


 @registry.architectures.register("spacy.TextCatLowData.v1")
-def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
+def build_text_classifier_lowdata(
+    width: int,
+    pretrained_vectors: Optional[bool],
+    dropout: Optional[float],
+    nO: Optional[int] = None,
+) -> Model:
     # Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
     with Model.define_operators({">>": chain, "**": clone}):
         model = (
@@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
 from thinc.types import Floats2d

 from ...tokens import Doc
 from ... import util
 from ...util import registry
 from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE


 @registry.architectures.register("spacy.Tok2VecListener.v1")
-def tok2vec_listener_v1(width, upstream="*"):
+def tok2vec_listener_v1(width: int, upstream: str = "*"):
     tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
     return tok2vec

@@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
         width=width,
         depth=depth,
         window_size=window_size,
-        maxout_pieces=maxout_pieces
-    )
+        maxout_pieces=maxout_pieces,
+        ),
+    )


 @registry.architectures.register("spacy.Tok2Vec.v1")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],

@@ -68,7 +68,6 @@ def MultiHashEmbed(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
 ):
     cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-
     seed = 7

     def make_hash_embed(feature):

@@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
         chain(
             FeatureExtractor([NORM]),
             list2ragged(),
-            with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5))
-        )
+            with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
+        ),
         ),
         with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
-        ragged2list()
+        ragged2list(),
     )
     return model

@@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
 def MishWindowEncoder(width, window_size, depth):
     cnn = chain(
         expand_window(window_size=window_size),
-        Mish(
-            nO=width,
-            nI=width * ((window_size * 2) + 1),
-            dropout=0.0,
-            normalize=True
-        ),
+        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
     )
     model = clone(residual(cnn), depth)
     model.set_dim("nO", width)
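In `MishWindowEncoder`, `expand_window` concatenates each token's vector with its `window_size` neighbours on each side, which is why the `Mish` layer takes `nI = width * (window_size * 2 + 1)`; with `width=96` and `window_size=1`, for example, each token's input grows to `96 * 3 = 288`. Restated as a standalone sketch:

```python
from thinc.api import Mish, chain, clone, expand_window, residual


def mish_window_encoder(width: int, window_size: int, depth: int):
    cnn = chain(
        expand_window(window_size=window_size),
        # Each token now carries (2 * window_size + 1) concatenated vectors.
        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
    )
    # A residual block around the CNN, stacked `depth` times.
    model = clone(residual(cnn), depth)
    model.set_dim("nO", width)
    return model
```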
@@ -7,7 +7,7 @@ import importlib.util
 import re
 from pathlib import Path
 import thinc
-from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model
+from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
 import functools
 import itertools
 import numpy.random

@@ -24,8 +24,6 @@ import tempfile
 import shutil
 import shlex
-import inspect
-from thinc.types import Unserializable


 try:
     import cupy.random
@@ -6,6 +6,7 @@ menu:
   - ['Tok2Vec', 'tok2vec']
   - ['Transformers', 'transformers']
   - ['Parser & NER', 'parser']
+  - ['Tagging', 'tagger']
   - ['Text Classification', 'textcat']
   - ['Entity Linking', 'entitylinker']
 ---
@@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to

 ### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}

+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.HashEmbedCNN.v1"
+> # TODO: ...
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name | Type | Description |
+| -------------------- | ----- | ----------- |
+| `width` | int | |
+| `depth` | int | |
+| `embed_size` | int | |
+| `window_size` | int | |
+| `maxout_pieces` | int | |
+| `subword_features` | bool | |
+| `dropout` | float | |
+| `pretrained_vectors` | bool | |
+
 ### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}

 ### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
@@ -99,6 +124,28 @@ architectures into your training config.
 | `use_upper` | bool | |
 | `nO` | int | |

+## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
+
+### spacy.Tagger.v1 {#Tagger}
+
+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.Tagger.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name | Type | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
+| `nO` | int | |
+
 ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}

 ### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
@@ -112,3 +159,21 @@ architectures into your training config.
 ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}

 ### spacy.EntityLinker.v1 {#EntityLinker}
+
+<!-- TODO: intro -->
+
+> #### Example Config
+>
+> ```ini
+> [model]
+> @architectures = "spacy.EntityLinker.v1"
+> nO = null
+>
+> [model.tok2vec]
+> # ...
+> ```
+
+| Name | Type | Description |
+| --------- | ------------------------------------------ | ----------- |
+| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
+| `nO` | int | |
@@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("parser", config=config)
 > ```

+<!-- TODO: finish API docs -->
+
 | Setting | Type | Description | Default |
 | ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
-| `moves` | list | <!-- TODO: --> | `None` |
+| `moves` | list | | `None` |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |

 ```python

@@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

+<!-- TODO: finish API docs -->
+
 | Name | Type | Description |
 | ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
-| `moves` | list | <!-- TODO: --> |
+| `moves` | list | |
 | _keyword-only_ | | |
-| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
-| `multitasks` | `Iterable` | <!-- TODO: --> |
-| `learn_tokens` | bool | <!-- TODO: --> |
-| `min_action_freq` | int | <!-- TODO: --> |
+| `update_with_oracle_cut_size` | int | |
+| `multitasks` | `Iterable` | |
+| `learn_tokens` | bool | |
+| `min_action_freq` | int | |

 ## DependencyParser.\_\_call\_\_ {#call tag="method"}

@@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
 > nlp.add_pipe("entity_linker", config=config)
 > ```

+<!-- TODO: finish API docs -->
+
 | Setting | Type | Description | Default |
 | ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
-| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` |
-| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` |
-| `incl_prior` | bool | <!-- TODO: --> | `True` |
-| `incl_context` | bool | <!-- TODO: --> | `True` |
+| `kb` | `KnowledgeBase` | | `None` |
+| `labels_discard` | `Iterable[str]` | | `[]` |
+| `incl_prior` | bool | | `True` |
+| `incl_context` | bool | | `True` |
 | `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |

 ```python

@@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

+<!-- TODO: finish API docs -->
+
 | Name | Type | Description |
 | ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ | | |
-| `kb` | `KnowledgeBase` | <!-- TODO: --> |
-| `labels_discard` | `Iterable[str]` | <!-- TODO: --> |
-| `incl_prior` | bool | <!-- TODO: --> |
-| `incl_context` | bool | <!-- TODO: --> |
+| `kb` | `KnowledgeBase` | |
+| `labels_discard` | `Iterable[str]` | |
+| `incl_prior` | bool | |
+| `incl_context` | bool | |

 ## EntityLinker.\_\_call\_\_ {#call tag="method"}

|
@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("ner", config=config)
|
||||
> ```
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||
| `moves` | list | <!-- TODO: --> | `None` |
|
||||
| `moves` | list | | `None` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||
|
||||
```python
|
||||
|
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
<!-- TODO: finish API docs -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `moves` | list | <!-- TODO: --> |
|
||||
| `moves` | list | |
|
||||
| _keyword-only_ | | |
|
||||
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
|
||||
| `multitasks` | `Iterable` | <!-- TODO: --> |
|
||||
| `learn_tokens` | bool | <!-- TODO: --> |
|
||||
| `min_action_freq` | int | <!-- TODO: --> |
|
||||
| `update_with_oracle_cut_size` | int | |
|
||||
| `multitasks` | `Iterable` | |
|
||||
| `learn_tokens` | bool | |
|
||||
| `min_action_freq` | int | |
|
||||
|
||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -8,9 +8,8 @@ new: 3.0
|
|||
|
||||
An `Example` holds the information for one training instance. It stores two
|
||||
`Doc` objects: one for holding the gold-standard reference data, and one for
|
||||
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? -->
|
||||
object stores the alignment between these two documents, as they can differ in
|
||||
tokenization.
|
||||
holding the predictions of the pipeline. An `Alignment` object stores the
|
||||
alignment between these two documents, as they can differ in tokenization.
|
||||
|
||||
## Example.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
|
|
@@ -98,9 +98,9 @@ decorator. For more details and examples, see the
 | ----------------------- | -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |

@@ -146,9 +146,9 @@ examples, see the
 | `name` | str | The name of the component factory. |
 | _keyword-only_ | | |
 | `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
 | `func` | `Optional[Callable]` | Optional function if not used as a decorator. |

@@ -833,8 +833,8 @@ instance and factory instance.
 | ----------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `factory` | str | The name of the registered component factory. |
 | `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
-| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
-| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
+| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
+| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
 | `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
 | `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
@@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
 shortcut for this and instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#add_pipe).

+<!-- TODO: finish API docs -->
+
 | Name | Type | Description |
 | -------------- | ------- | ------------------------------------------------------------------------------------------- |
 | `vocab` | `Vocab` | The shared vocabulary. |
 | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
 | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
 | _keyword-only_ | | |
-| `labels_morph` | dict | <!-- TODO: --> |
-| `labels_pos` | dict | <!-- TODO: --> |
+| `labels_morph` | dict | |
+| `labels_pos` | dict | |

 ## Morphologizer.\_\_call\_\_ {#call tag="method"}

@@ -290,6 +290,8 @@ factories.
 > return Model("custom", forward, dims={"nO": nO})
 > ```

+<!-- TODO: finish table -->
+
 | Registry name | Description |
 | ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |

@@ -297,7 +299,7 @@ factories.
 | `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
 | `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
 | `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
-| `assets` | <!-- TODO: what is this used for again?--> |
+| `assets` | |
 | `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
 | `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
 | `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
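The `architectures` registry row above is the one most users touch: any function registered there can be referenced from `config.cfg` via `@architectures`. A small sketch with a hypothetical name (`my_linear.v1` is illustrative, not a built-in):

```python
from thinc.api import Linear, Model
from spacy.util import registry


@registry.architectures.register("my_linear.v1")
def create_my_linear(nO: int, nI: int) -> Model:
    # Now usable from a config block:
    #   [model]
    #   @architectures = "my_linear.v1"
    #   nO = 2
    #   nI = 4
    return Linear(nO, nI)
```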
@@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument.

 Transformer tokens and outputs for one `Doc` object.

-| Name | Type | Description |
-| --------- | -------------------------------------------------- | ----------------------------------------- |
-| `tokens` | `Dict` | <!-- TODO: --> |
-| `tensors` | `List[FloatsXd]` | <!-- TODO: --> |
-| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
-| `width` | int | <!-- TODO: also mention it's property --> |
+<!-- TODO: finish API docs, also mention "width" is property -->
+
+| Name | Type | Description |
+| --------- | -------------------------------------------------- | ----------- |
+| `tokens` | `Dict` | |
+| `tensors` | `List[FloatsXd]` | |
+| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
+| `width` | int | |

 ### TransformerData.empty {#transformerdata-emoty tag="classmethod"}

-<!-- TODO: -->
+<!-- TODO: finish API docs -->

-| Name | Type | Description |
-| ----------- | ----------------- | -------------- |
-| **RETURNS** | `TransformerData` | <!-- TODO: --> |
+| Name | Type | Description |
+| ----------- | ----------------- | ----------- |
+| **RETURNS** | `TransformerData` | |

 ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}

-<!-- TODO: -->
+<!-- TODO: write, also mention doc_data is property -->

-| Name | Type | Description |
-| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- |
-| `spans` | `List[List[Span]]` | <!-- TODO: --> |
-| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | <!-- TODO: --> |
-| `tensors` | `List[torch.Tensor]` | <!-- TODO: --> |
-| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
-| `doc_data` | `List[TransformerData]` | <!-- TODO: also mention it's property --> |
+| Name | Type | Description |
+| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- |
+| `spans` | `List[List[Span]]` | |
+| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
+| `tensors` | `List[torch.Tensor]` | |
+| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
+| `doc_data` | `List[TransformerData]` | |

 ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}

-<!-- TODO: -->
+<!-- TODO: write -->

-| Name | Type | Description |
-| ----------- | ---------------------- | -------------- |
-| `arrays` | `List[List[Floats3d]]` | <!-- TODO: --> |
-| **RETURNS** | `FullTransformerBatch` | <!-- TODO: --> |
+| Name | Type | Description |
+| ----------- | ---------------------- | ----------- |
+| `arrays` | `List[List[Floats3d]]` | |
+| **RETURNS** | `FullTransformerBatch` | |

 ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}

 Split a `TransformerData` object that represents a batch into a list with one
 `TransformerData` per `Doc`.

-| Name | Type | Description |
-| ----------- | ----------------------- | -------------- |
-| **RETURNS** | `List[TransformerData]` | <!-- TODO: --> |
+| Name | Type | Description |
+| ----------- | ----------------------- | ----------- |
+| **RETURNS** | `List[TransformerData]` | |

 ## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}

@@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.

 The following built-in functions are available:

+<!-- TODO: finish API docs -->
+
 | Name | Description |
 | ------------------ | ------------------------------------------------------------------ |
 | `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
 | `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
-| `strided_spans.v1` | <!-- TODO: --> |
+| `strided_spans.v1` | |

 ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}

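The `strided_spans.v1` row above is still a TODO. The idea, paraphrased here in plain Python with hypothetical names rather than the spacy-transformers implementation, is to cover long documents with fixed-width, overlapping windows so the transformer never sees more than `window` tokens at once:

```python
from typing import List, Tuple


def strided_offsets(length: int, window: int, stride: int) -> List[Tuple[int, int]]:
    # E.g. length=300, window=128, stride=96 -> (0, 128), (96, 224), (192, 300)
    spans = []
    start = 0
    while start < length:
        spans.append((start, min(start + window, length)))
        if start + window >= length:
            break
        start += stride
    return spans
```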
@@ -231,10 +231,10 @@ available pipeline components and component functions.
 | `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
 | `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
 | `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
-| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | <!-- TODO: --> |
+| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | |
 | `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |

-<!-- TODO: update with more components -->
+<!-- TODO: finish and update with more components -->

 <!-- TODO: explain default config and factories -->

@@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'

 ### Serializing the pipeline {#pipeline}

-<!-- TODO: update this -->
-
 When serializing the pipeline, keep in mind that this will only save out the
 **binary data for the individual components** to allow spaCy to restore them –
 not the entire objects. This is a good thing, because it makes serialization