mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Update docs and types
This commit is contained in:
parent
dab31426e1
commit
e9e8fa2466
|
@ -1,6 +1,7 @@
|
||||||
from typing import List
|
from typing import List
|
||||||
from thinc.api import Model
|
from thinc.api import Model
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
|
|
||||||
|
|
||||||
|
@ -15,14 +16,14 @@ def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def init(model, X=None, Y=None):
|
def init(model: Model, X=None, Y=None):
|
||||||
vectors_table = model.ops.alloc3f(
|
vectors_table = model.ops.alloc3f(
|
||||||
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
|
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
|
||||||
)
|
)
|
||||||
model.set_param("E", vectors_table)
|
model.set_param("E", vectors_table)
|
||||||
|
|
||||||
|
|
||||||
def forward(model, docs, is_train):
|
def forward(model: Model, docs: List[Doc], is_train: bool):
|
||||||
if docs is None:
|
if docs is None:
|
||||||
return []
|
return []
|
||||||
ids = []
|
ids = []
|
||||||
|
|
|
@ -14,7 +14,7 @@ def IOB() -> Model[Padded, Padded]:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
|
def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
|
||||||
if X is not None and Y is not None:
|
if X is not None and Y is not None:
|
||||||
if X.data.shape != Y.data.shape:
|
if X.data.shape != Y.data.shape:
|
||||||
# TODO: Fix error
|
# TODO: Fix error
|
||||||
|
|
|
@ -4,14 +4,14 @@ from thinc.api import Model
|
||||||
from ..attrs import LOWER
|
from ..attrs import LOWER
|
||||||
|
|
||||||
|
|
||||||
def extract_ngrams(ngram_size, attr=LOWER) -> Model:
|
def extract_ngrams(ngram_size: int, attr: int = LOWER) -> Model:
|
||||||
model = Model("extract_ngrams", forward)
|
model = Model("extract_ngrams", forward)
|
||||||
model.attrs["ngram_size"] = ngram_size
|
model.attrs["ngram_size"] = ngram_size
|
||||||
model.attrs["attr"] = attr
|
model.attrs["attr"] = attr
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def forward(model, docs, is_train: bool):
|
def forward(model: Model, docs, is_train: bool):
|
||||||
batch_keys = []
|
batch_keys = []
|
||||||
batch_vals = []
|
batch_vals = []
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from pathlib import Path
|
from typing import Optional
|
||||||
|
|
||||||
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
from thinc.api import chain, clone, list2ragged, reduce_mean, residual
|
||||||
from thinc.api import Model, Maxout, Linear
|
from thinc.api import Model, Maxout, Linear
|
||||||
|
|
||||||
|
@ -9,7 +8,7 @@ from ...vocab import Vocab
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.EntityLinker.v1")
|
@registry.architectures.register("spacy.EntityLinker.v1")
|
||||||
def build_nel_encoder(tok2vec, nO=None):
|
def build_nel_encoder(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||||
with Model.define_operators({">>": chain, "**": clone}):
|
with Model.define_operators({">>": chain, "**": clone}):
|
||||||
token_width = tok2vec.get_dim("nO")
|
token_width = tok2vec.get_dim("nO")
|
||||||
output_layer = Linear(nO=nO, nI=token_width)
|
output_layer = Linear(nO=nO, nI=token_width)
|
||||||
|
@ -26,7 +25,7 @@ def build_nel_encoder(tok2vec, nO=None):
|
||||||
|
|
||||||
|
|
||||||
@registry.assets.register("spacy.KBFromFile.v1")
|
@registry.assets.register("spacy.KBFromFile.v1")
|
||||||
def load_kb(vocab_path, kb_path) -> KnowledgeBase:
|
def load_kb(vocab_path: str, kb_path: str) -> KnowledgeBase:
|
||||||
vocab = Vocab().from_disk(vocab_path)
|
vocab = Vocab().from_disk(vocab_path)
|
||||||
kb = KnowledgeBase(vocab=vocab)
|
kb = KnowledgeBase(vocab=vocab)
|
||||||
kb.load_bulk(kb_path)
|
kb.load_bulk(kb_path)
|
||||||
|
|
|
@ -1,10 +1,20 @@
|
||||||
|
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||||
from thinc.api import MultiSoftmax, list2array
|
from thinc.api import MultiSoftmax, list2array
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
# This lets us add type hints for mypy etc. without causing circular imports
|
||||||
|
from ...vocab import Vocab # noqa: F401
|
||||||
|
from ...tokens import Doc # noqa: F401
|
||||||
|
|
||||||
def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
|
|
||||||
|
def build_multi_task_model(
|
||||||
|
tok2vec: Model,
|
||||||
|
maxout_pieces: int,
|
||||||
|
token_vector_width: int,
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model:
|
||||||
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
|
softmax = Softmax(nO=nO, nI=token_vector_width * 2)
|
||||||
model = chain(
|
model = chain(
|
||||||
tok2vec,
|
tok2vec,
|
||||||
|
@ -22,7 +32,13 @@ def build_multi_task_model(tok2vec, maxout_pieces, token_vector_width, nO=None):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=None):
|
def build_cloze_multi_task_model(
|
||||||
|
vocab: "Vocab",
|
||||||
|
tok2vec: Model,
|
||||||
|
maxout_pieces: int,
|
||||||
|
hidden_size: int,
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model:
|
||||||
# nO = vocab.vectors.data.shape[1]
|
# nO = vocab.vectors.data.shape[1]
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
list2array(),
|
list2array(),
|
||||||
|
@ -43,24 +59,24 @@ def build_cloze_multi_task_model(vocab, tok2vec, maxout_pieces, hidden_size, nO=
|
||||||
|
|
||||||
|
|
||||||
def build_cloze_characters_multi_task_model(
|
def build_cloze_characters_multi_task_model(
|
||||||
vocab, tok2vec, maxout_pieces, hidden_size, nr_char
|
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int, nr_char: int
|
||||||
):
|
) -> Model:
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
list2array(),
|
list2array(),
|
||||||
Maxout(hidden_size, nP=maxout_pieces),
|
Maxout(hidden_size, nP=maxout_pieces),
|
||||||
LayerNorm(nI=hidden_size),
|
LayerNorm(nI=hidden_size),
|
||||||
MultiSoftmax([256] * nr_char, nI=hidden_size),
|
MultiSoftmax([256] * nr_char, nI=hidden_size),
|
||||||
)
|
)
|
||||||
|
|
||||||
model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
|
model = build_masked_language_model(vocab, chain(tok2vec, output_layer))
|
||||||
model.set_ref("tok2vec", tok2vec)
|
model.set_ref("tok2vec", tok2vec)
|
||||||
model.set_ref("output_layer", output_layer)
|
model.set_ref("output_layer", output_layer)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
|
def build_masked_language_model(
|
||||||
|
vocab: "Vocab", wrapped_model: Model, mask_prob: float = 0.15
|
||||||
|
) -> Model:
|
||||||
"""Convert a model into a BERT-style masked language model"""
|
"""Convert a model into a BERT-style masked language model"""
|
||||||
|
|
||||||
random_words = _RandomWords(vocab)
|
random_words = _RandomWords(vocab)
|
||||||
|
|
||||||
def mlm_forward(model, docs, is_train):
|
def mlm_forward(model, docs, is_train):
|
||||||
|
@ -74,7 +90,7 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
|
||||||
|
|
||||||
return output, mlm_backward
|
return output, mlm_backward
|
||||||
|
|
||||||
def mlm_initialize(model, X=None, Y=None):
|
def mlm_initialize(model: Model, X=None, Y=None):
|
||||||
wrapped = model.layers[0]
|
wrapped = model.layers[0]
|
||||||
wrapped.initialize(X=X, Y=Y)
|
wrapped.initialize(X=X, Y=Y)
|
||||||
for dim in wrapped.dim_names:
|
for dim in wrapped.dim_names:
|
||||||
|
@ -90,12 +106,11 @@ def build_masked_language_model(vocab, wrapped_model, mask_prob=0.15):
|
||||||
dims={dim: None for dim in wrapped_model.dim_names},
|
dims={dim: None for dim in wrapped_model.dim_names},
|
||||||
)
|
)
|
||||||
mlm_model.set_ref("wrapped", wrapped_model)
|
mlm_model.set_ref("wrapped", wrapped_model)
|
||||||
|
|
||||||
return mlm_model
|
return mlm_model
|
||||||
|
|
||||||
|
|
||||||
class _RandomWords:
|
class _RandomWords:
|
||||||
def __init__(self, vocab):
|
def __init__(self, vocab: "Vocab") -> None:
|
||||||
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
self.words = [lex.text for lex in vocab if lex.prob != 0.0]
|
||||||
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
self.probs = [lex.prob for lex in vocab if lex.prob != 0.0]
|
||||||
self.words = self.words[:10000]
|
self.words = self.words[:10000]
|
||||||
|
@ -104,7 +119,7 @@ class _RandomWords:
|
||||||
self.probs /= self.probs.sum()
|
self.probs /= self.probs.sum()
|
||||||
self._cache = []
|
self._cache = []
|
||||||
|
|
||||||
def next(self):
|
def next(self) -> str:
|
||||||
if not self._cache:
|
if not self._cache:
|
||||||
self._cache.extend(
|
self._cache.extend(
|
||||||
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
numpy.random.choice(len(self.words), 10000, p=self.probs)
|
||||||
|
@ -113,9 +128,11 @@ class _RandomWords:
|
||||||
return self.words[index]
|
return self.words[index]
|
||||||
|
|
||||||
|
|
||||||
def _apply_mask(docs, random_words, mask_prob=0.15):
|
def _apply_mask(
|
||||||
|
docs: Iterable["Doc"], random_words: _RandomWords, mask_prob: float = 0.15
|
||||||
|
) -> Tuple[numpy.ndarray, List["Doc"]]:
|
||||||
# This needs to be here to avoid circular imports
|
# This needs to be here to avoid circular imports
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc # noqa: F811
|
||||||
|
|
||||||
N = sum(len(doc) for doc in docs)
|
N = sum(len(doc) for doc in docs)
|
||||||
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
mask = numpy.random.uniform(0.0, 1.0, (N,))
|
||||||
|
@ -141,7 +158,7 @@ def _apply_mask(docs, random_words, mask_prob=0.15):
|
||||||
return mask, masked_docs
|
return mask, masked_docs
|
||||||
|
|
||||||
|
|
||||||
def _replace_word(word, random_words, mask="[MASK]"):
|
def _replace_word(word: str, random_words: _RandomWords, mask: str = "[MASK]") -> str:
|
||||||
roll = numpy.random.random()
|
roll = numpy.random.random()
|
||||||
if roll < 0.8:
|
if roll < 0.8:
|
||||||
return mask
|
return mask
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
from pydantic import StrictInt
|
from typing import Optional
|
||||||
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops, with_array
|
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
|
||||||
from thinc.api import LayerNorm, Maxout, Mish
|
|
||||||
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from .._precomputable_affine import PrecomputableAffine
|
from .._precomputable_affine import PrecomputableAffine
|
||||||
|
@ -10,16 +9,15 @@ from ..tb_framework import TransitionModel
|
||||||
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
@registry.architectures.register("spacy.TransitionBasedParser.v1")
|
||||||
def build_tb_parser_model(
|
def build_tb_parser_model(
|
||||||
tok2vec: Model,
|
tok2vec: Model,
|
||||||
nr_feature_tokens: StrictInt,
|
nr_feature_tokens: int,
|
||||||
hidden_width: StrictInt,
|
hidden_width: int,
|
||||||
maxout_pieces: StrictInt,
|
maxout_pieces: int,
|
||||||
use_upper=True,
|
use_upper: bool = True,
|
||||||
nO=None,
|
nO: Optional[int] = None,
|
||||||
):
|
) -> Model:
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
|
||||||
tok2vec.set_dim("nO", hidden_width)
|
tok2vec.set_dim("nO", hidden_width)
|
||||||
|
|
||||||
lower = PrecomputableAffine(
|
lower = PrecomputableAffine(
|
||||||
nO=hidden_width if use_upper else nO,
|
nO=hidden_width if use_upper else nO,
|
||||||
nF=nr_feature_tokens,
|
nF=nr_feature_tokens,
|
||||||
|
|
|
@ -26,7 +26,6 @@ def BiluoTagger(
|
||||||
with_array(softmax_activation()),
|
with_array(softmax_activation()),
|
||||||
padded2list(),
|
padded2list(),
|
||||||
)
|
)
|
||||||
|
|
||||||
return Model(
|
return Model(
|
||||||
"biluo-tagger",
|
"biluo-tagger",
|
||||||
forward,
|
forward,
|
||||||
|
@ -52,7 +51,6 @@ def IOBTagger(
|
||||||
with_array(softmax_activation()),
|
with_array(softmax_activation()),
|
||||||
padded2list(),
|
padded2list(),
|
||||||
)
|
)
|
||||||
|
|
||||||
return Model(
|
return Model(
|
||||||
"iob-tagger",
|
"iob-tagger",
|
||||||
forward,
|
forward,
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
|
from typing import Optional
|
||||||
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
from thinc.api import zero_init, with_array, Softmax, chain, Model
|
||||||
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tagger.v1")
|
@registry.architectures.register("spacy.Tagger.v1")
|
||||||
def build_tagger_model(tok2vec, nO=None) -> Model:
|
def build_tagger_model(tok2vec: Model, nO: Optional[int] = None) -> Model:
|
||||||
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
||||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||||
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
||||||
|
|
|
@ -2,10 +2,9 @@ from typing import Optional
|
||||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||||
from thinc.api import HashEmbed, with_ragged, with_array, with_cpu, uniqued
|
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
||||||
from thinc.api import Relu, residual, expand_window, FeatureExtractor
|
from thinc.api import Relu, residual, expand_window, FeatureExtractor
|
||||||
|
|
||||||
from ... import util
|
|
||||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
|
@ -40,7 +39,12 @@ def build_simple_cnn_text_classifier(
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatBOW.v1")
|
@registry.architectures.register("spacy.TextCatBOW.v1")
|
||||||
def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO=None):
|
def build_bow_text_classifier(
|
||||||
|
exclusive_classes: bool,
|
||||||
|
ngram_size: int,
|
||||||
|
no_output_layer: bool,
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model:
|
||||||
with Model.define_operators({">>": chain}):
|
with Model.define_operators({">>": chain}):
|
||||||
sparse_linear = SparseLinear(nO)
|
sparse_linear = SparseLinear(nO)
|
||||||
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
model = extract_ngrams(ngram_size, attr=ORTH) >> sparse_linear
|
||||||
|
@ -55,16 +59,16 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
||||||
def build_text_classifier(
|
def build_text_classifier(
|
||||||
width,
|
width: int,
|
||||||
embed_size,
|
embed_size: int,
|
||||||
pretrained_vectors,
|
pretrained_vectors: Optional[bool],
|
||||||
exclusive_classes,
|
exclusive_classes: bool,
|
||||||
ngram_size,
|
ngram_size: int,
|
||||||
window_size,
|
window_size: int,
|
||||||
conv_depth,
|
conv_depth: int,
|
||||||
dropout,
|
dropout: Optional[float],
|
||||||
nO=None,
|
nO: Optional[int] = None,
|
||||||
):
|
) -> Model:
|
||||||
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||||
lower = HashEmbed(
|
lower = HashEmbed(
|
||||||
|
@ -91,7 +95,6 @@ def build_text_classifier(
|
||||||
dropout=dropout,
|
dropout=dropout,
|
||||||
seed=13,
|
seed=13,
|
||||||
)
|
)
|
||||||
|
|
||||||
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
||||||
trained_vectors = FeatureExtractor(cols) >> with_array(
|
trained_vectors = FeatureExtractor(cols) >> with_array(
|
||||||
uniqued(
|
uniqued(
|
||||||
|
@ -100,7 +103,6 @@ def build_text_classifier(
|
||||||
column=cols.index(ORTH),
|
column=cols.index(ORTH),
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if pretrained_vectors:
|
if pretrained_vectors:
|
||||||
static_vectors = StaticVectors(width)
|
static_vectors = StaticVectors(width)
|
||||||
vector_layer = trained_vectors | static_vectors
|
vector_layer = trained_vectors | static_vectors
|
||||||
|
@ -152,7 +154,12 @@ def build_text_classifier(
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatLowData.v1")
|
@registry.architectures.register("spacy.TextCatLowData.v1")
|
||||||
def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None):
|
def build_text_classifier_lowdata(
|
||||||
|
width: int,
|
||||||
|
pretrained_vectors: Optional[bool],
|
||||||
|
dropout: Optional[float],
|
||||||
|
nO: Optional[int] = None,
|
||||||
|
) -> Model:
|
||||||
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
# Note, before v.3, this was the default if setting "low_data" and "pretrained_dims"
|
||||||
with Model.define_operators({">>": chain, "**": clone}):
|
with Model.define_operators({">>": chain, "**": clone}):
|
||||||
model = (
|
model = (
|
||||||
|
|
|
@ -6,16 +6,15 @@ from thinc.api import expand_window, residual, Maxout, Mish, PyTorchLSTM
|
||||||
from thinc.types import Floats2d
|
from thinc.types import Floats2d
|
||||||
|
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from ... import util
|
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ...ml import _character_embed
|
from ...ml import _character_embed
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
from ...pipeline.tok2vec import Tok2VecListener
|
from ...pipeline.tok2vec import Tok2VecListener
|
||||||
from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
from ...attrs import ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2VecListener.v1")
|
@registry.architectures.register("spacy.Tok2VecListener.v1")
|
||||||
def tok2vec_listener_v1(width, upstream="*"):
|
def tok2vec_listener_v1(width: int, upstream: str = "*"):
|
||||||
tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
|
tok2vec = Tok2VecListener(upstream_name=upstream, width=width)
|
||||||
return tok2vec
|
return tok2vec
|
||||||
|
|
||||||
|
@ -45,10 +44,11 @@ def build_hash_embed_cnn_tok2vec(
|
||||||
width=width,
|
width=width,
|
||||||
depth=depth,
|
depth=depth,
|
||||||
window_size=window_size,
|
window_size=window_size,
|
||||||
maxout_pieces=maxout_pieces
|
maxout_pieces=maxout_pieces,
|
||||||
)
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2Vec.v1")
|
@registry.architectures.register("spacy.Tok2Vec.v1")
|
||||||
def build_Tok2Vec_model(
|
def build_Tok2Vec_model(
|
||||||
embed: Model[List[Doc], List[Floats2d]],
|
embed: Model[List[Doc], List[Floats2d]],
|
||||||
|
@ -68,7 +68,6 @@ def MultiHashEmbed(
|
||||||
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
|
||||||
):
|
):
|
||||||
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
|
|
||||||
seed = 7
|
seed = 7
|
||||||
|
|
||||||
def make_hash_embed(feature):
|
def make_hash_embed(feature):
|
||||||
|
@ -124,11 +123,11 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
|
||||||
chain(
|
chain(
|
||||||
FeatureExtractor([NORM]),
|
FeatureExtractor([NORM]),
|
||||||
list2ragged(),
|
list2ragged(),
|
||||||
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5))
|
with_array(HashEmbed(nO=width, nV=rows, column=0, seed=5)),
|
||||||
)
|
),
|
||||||
),
|
),
|
||||||
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
with_array(Maxout(width, nM * nC + width, nP=3, normalize=True, dropout=0.0)),
|
||||||
ragged2list()
|
ragged2list(),
|
||||||
)
|
)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
@ -155,12 +154,7 @@ def MaxoutWindowEncoder(width: int, window_size: int, maxout_pieces: int, depth:
|
||||||
def MishWindowEncoder(width, window_size, depth):
|
def MishWindowEncoder(width, window_size, depth):
|
||||||
cnn = chain(
|
cnn = chain(
|
||||||
expand_window(window_size=window_size),
|
expand_window(window_size=window_size),
|
||||||
Mish(
|
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
||||||
nO=width,
|
|
||||||
nI=width * ((window_size * 2) + 1),
|
|
||||||
dropout=0.0,
|
|
||||||
normalize=True
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
model = clone(residual(cnn), depth)
|
model = clone(residual(cnn), depth)
|
||||||
model.set_dim("nO", width)
|
model.set_dim("nO", width)
|
||||||
|
|
|
@ -7,7 +7,7 @@ import importlib.util
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import thinc
|
import thinc
|
||||||
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer, Model
|
from thinc.api import NumpyOps, get_current_ops, Adam, Config, Optimizer
|
||||||
import functools
|
import functools
|
||||||
import itertools
|
import itertools
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
@ -24,8 +24,6 @@ import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
import shlex
|
import shlex
|
||||||
import inspect
|
import inspect
|
||||||
from thinc.types import Unserializable
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import cupy.random
|
import cupy.random
|
||||||
|
|
|
@ -6,6 +6,7 @@ menu:
|
||||||
- ['Tok2Vec', 'tok2vec']
|
- ['Tok2Vec', 'tok2vec']
|
||||||
- ['Transformers', 'transformers']
|
- ['Transformers', 'transformers']
|
||||||
- ['Parser & NER', 'parser']
|
- ['Parser & NER', 'parser']
|
||||||
|
- ['Tagging', 'tagger']
|
||||||
- ['Text Classification', 'textcat']
|
- ['Text Classification', 'textcat']
|
||||||
- ['Entity Linking', 'entitylinker']
|
- ['Entity Linking', 'entitylinker']
|
||||||
---
|
---
|
||||||
|
@ -18,6 +19,30 @@ TODO: intro and how architectures work, link to
|
||||||
|
|
||||||
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
|
### spacy.HashEmbedCNN.v1 {#HashEmbedCNN}
|
||||||
|
|
||||||
|
<!-- TODO: intro -->
|
||||||
|
|
||||||
|
> #### Example Config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model]
|
||||||
|
> @architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
> # TODO: ...
|
||||||
|
>
|
||||||
|
> [model.tok2vec]
|
||||||
|
> # ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| -------------------- | ----- | ----------- |
|
||||||
|
| `width` | int | |
|
||||||
|
| `depth` | int | |
|
||||||
|
| `embed_size` | int | |
|
||||||
|
| `window_size` | int | |
|
||||||
|
| `maxout_pieces` | int | |
|
||||||
|
| `subword_features` | bool | |
|
||||||
|
| `dropout` | float | |
|
||||||
|
| `pretrained_vectors` | bool | |
|
||||||
|
|
||||||
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
|
### spacy.HashCharEmbedCNN.v1 {#HashCharEmbedCNN}
|
||||||
|
|
||||||
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
|
### spacy.HashCharEmbedBiLSTM.v1 {#HashCharEmbedBiLSTM}
|
||||||
|
@ -99,6 +124,28 @@ architectures into your training config.
|
||||||
| `use_upper` | bool | |
|
| `use_upper` | bool | |
|
||||||
| `nO` | int | |
|
| `nO` | int | |
|
||||||
|
|
||||||
|
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
||||||
|
|
||||||
|
### spacy.Tagger.v1 {#Tagger}
|
||||||
|
|
||||||
|
<!-- TODO: intro -->
|
||||||
|
|
||||||
|
> #### Example Config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model]
|
||||||
|
> @architectures = "spacy.Tagger.v1"
|
||||||
|
> nO = null
|
||||||
|
>
|
||||||
|
> [model.tok2vec]
|
||||||
|
> # ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------- | ------------------------------------------ | ----------- |
|
||||||
|
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||||
|
| `nO` | int | |
|
||||||
|
|
||||||
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
|
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
|
||||||
|
|
||||||
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
|
### spacy.TextCatEnsemble.v1 {#TextCatEnsemble}
|
||||||
|
@ -112,3 +159,21 @@ architectures into your training config.
|
||||||
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
|
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
|
||||||
|
|
||||||
### spacy.EntityLinker.v1 {#EntityLinker}
|
### spacy.EntityLinker.v1 {#EntityLinker}
|
||||||
|
|
||||||
|
<!-- TODO: intro -->
|
||||||
|
|
||||||
|
> #### Example Config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [model]
|
||||||
|
> @architectures = "spacy.EntityLinker.v1"
|
||||||
|
> nO = null
|
||||||
|
>
|
||||||
|
> [model.tok2vec]
|
||||||
|
> # ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
| Name | Type | Description |
|
||||||
|
| --------- | ------------------------------------------ | ----------- |
|
||||||
|
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | |
|
||||||
|
| `nO` | int | |
|
||||||
|
|
|
@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
|
||||||
> nlp.add_pipe("parser", config=config)
|
> nlp.add_pipe("parser", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Setting | Type | Description | Default |
|
| Setting | Type | Description | Default |
|
||||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||||
| `moves` | list | <!-- TODO: --> | `None` |
|
| `moves` | list | | `None` |
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| `moves` | list | <!-- TODO: --> |
|
| `moves` | list | |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
|
| `update_with_oracle_cut_size` | int | |
|
||||||
| `multitasks` | `Iterable` | <!-- TODO: --> |
|
| `multitasks` | `Iterable` | |
|
||||||
| `learn_tokens` | bool | <!-- TODO: --> |
|
| `learn_tokens` | bool | |
|
||||||
| `min_action_freq` | int | <!-- TODO: --> |
|
| `min_action_freq` | int | |
|
||||||
|
|
||||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -32,12 +32,14 @@ architectures and their arguments and hyperparameters.
|
||||||
> nlp.add_pipe("entity_linker", config=config)
|
> nlp.add_pipe("entity_linker", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Setting | Type | Description | Default |
|
| Setting | Type | Description | Default |
|
||||||
| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
|
| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- |
|
||||||
| `kb` | `KnowledgeBase` | <!-- TODO: --> | `None` |
|
| `kb` | `KnowledgeBase` | | `None` |
|
||||||
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> | `[]` |
|
| `labels_discard` | `Iterable[str]` | | `[]` |
|
||||||
| `incl_prior` | bool | <!-- TODO: --> | `True` |
|
| `incl_prior` | bool | | `True` |
|
||||||
| `incl_context` | bool | <!-- TODO: --> | `True` |
|
| `incl_context` | bool | | `True` |
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -65,16 +67,18 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
| ---------------- | --------------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `kb` | `KnowledgeBase` | <!-- TODO: --> |
|
| `kb` | `KnowledgeBase` | |
|
||||||
| `labels_discard` | `Iterable[str]` | <!-- TODO: --> |
|
| `labels_discard` | `Iterable[str]` | |
|
||||||
| `incl_prior` | bool | <!-- TODO: --> |
|
| `incl_prior` | bool | |
|
||||||
| `incl_context` | bool | <!-- TODO: --> |
|
| `incl_context` | bool | |
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -29,9 +29,11 @@ architectures and their arguments and hyperparameters.
|
||||||
> nlp.add_pipe("ner", config=config)
|
> nlp.add_pipe("ner", config=config)
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Setting | Type | Description | Default |
|
| Setting | Type | Description | Default |
|
||||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
| ------- | ------------------------------------------ | ----------------- | ----------------------------------------------------------------- |
|
||||||
| `moves` | list | <!-- TODO: --> | `None` |
|
| `moves` | list | | `None` |
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -59,17 +61,19 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| `moves` | list | <!-- TODO: --> |
|
| `moves` | list | |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `update_with_oracle_cut_size` | int | <!-- TODO: --> |
|
| `update_with_oracle_cut_size` | int | |
|
||||||
| `multitasks` | `Iterable` | <!-- TODO: --> |
|
| `multitasks` | `Iterable` | |
|
||||||
| `learn_tokens` | bool | <!-- TODO: --> |
|
| `learn_tokens` | bool | |
|
||||||
| `min_action_freq` | int | <!-- TODO: --> |
|
| `min_action_freq` | int | |
|
||||||
|
|
||||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -8,9 +8,8 @@ new: 3.0
|
||||||
|
|
||||||
An `Example` holds the information for one training instance. It stores two
|
An `Example` holds the information for one training instance. It stores two
|
||||||
`Doc` objects: one for holding the gold-standard reference data, and one for
|
`Doc` objects: one for holding the gold-standard reference data, and one for
|
||||||
holding the predictions of the pipeline. An `Alignment` <!-- TODO: link? -->
|
holding the predictions of the pipeline. An `Alignment` object stores the
|
||||||
object stores the alignment between these two documents, as they can differ in
|
alignment between these two documents, as they can differ in tokenization.
|
||||||
tokenization.
|
|
||||||
|
|
||||||
## Example.\_\_init\_\_ {#init tag="method"}
|
## Example.\_\_init\_\_ {#init tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -98,9 +98,9 @@ decorator. For more details and examples, see the
|
||||||
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `name` | str | The name of the component factory. |
|
| `name` | str | The name of the component factory. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
||||||
|
@ -146,9 +146,9 @@ examples, see the
|
||||||
| `name` | str | The name of the component factory. |
|
| `name` | str | The name of the component factory. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. |
|
||||||
|
@ -833,8 +833,8 @@ instance and factory instance.
|
||||||
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `factory` | str | The name of the registered component factory. |
|
| `factory` | str | The name of the registered component factory. |
|
||||||
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for pipeline analysis. |
|
||||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. <!-- TODO: link to something --> |
|
| `retokenizes` | bool | Whether the component changes tokenization. Used for pipeline analysis. |
|
||||||
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. |
|
||||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||||
|
|
|
@ -63,14 +63,16 @@ Create a new pipeline instance. In your application, you would normally use a
|
||||||
shortcut for this and instantiate the component using its string name and
|
shortcut for this and instantiate the component using its string name and
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
|
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||||
| _keyword-only_ | | |
|
| _keyword-only_ | | |
|
||||||
| `labels_morph` | dict | <!-- TODO: --> |
|
| `labels_morph` | dict | |
|
||||||
| `labels_pos` | dict | <!-- TODO: --> |
|
| `labels_pos` | dict | |
|
||||||
|
|
||||||
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -290,6 +290,8 @@ factories.
|
||||||
> return Model("custom", forward, dims={"nO": nO})
|
> return Model("custom", forward, dims={"nO": nO})
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
<!-- TODO: finish table -->
|
||||||
|
|
||||||
| Registry name | Description |
|
| Registry name | Description |
|
||||||
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. |
|
||||||
|
@ -297,7 +299,7 @@ factories.
|
||||||
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
| `lookups` | Registry for large lookup tables available via `vocab.lookups`. |
|
||||||
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). |
|
||||||
| `assets` | <!-- TODO: what is this used for again?--> |
|
| `assets` | |
|
||||||
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). |
|
||||||
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). |
|
||||||
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). |
|
||||||
|
|
|
@ -347,50 +347,52 @@ serialization by passing in the string names via the `exclude` argument.
|
||||||
|
|
||||||
Transformer tokens and outputs for one `Doc` object.
|
Transformer tokens and outputs for one `Doc` object.
|
||||||
|
|
||||||
| Name | Type | Description |
|
<!-- TODO: finish API docs, also mention "width" is property -->
|
||||||
| --------- | -------------------------------------------------- | ----------------------------------------- |
|
|
||||||
| `tokens` | `Dict` | <!-- TODO: --> |
|
| Name | Type | Description |
|
||||||
| `tensors` | `List[FloatsXd]` | <!-- TODO: --> |
|
| --------- | -------------------------------------------------- | ----------- |
|
||||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
|
| `tokens` | `Dict` | |
|
||||||
| `width` | int | <!-- TODO: also mention it's property --> |
|
| `tensors` | `List[FloatsXd]` | |
|
||||||
|
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
|
||||||
|
| `width` | int | |
|
||||||
|
|
||||||
### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
|
### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
|
||||||
|
|
||||||
<!-- TODO: -->
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------- | -------------- |
|
| ----------- | ----------------- | ----------- |
|
||||||
| **RETURNS** | `TransformerData` | <!-- TODO: --> |
|
| **RETURNS** | `TransformerData` | |
|
||||||
|
|
||||||
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
|
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
|
||||||
|
|
||||||
<!-- TODO: -->
|
<!-- TODO: write, also mention doc_data is property -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- |
|
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | ----------- |
|
||||||
| `spans` | `List[List[Span]]` | <!-- TODO: --> |
|
| `spans` | `List[List[Span]]` | |
|
||||||
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | <!-- TODO: --> |
|
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | |
|
||||||
| `tensors` | `List[torch.Tensor]` | <!-- TODO: --> |
|
| `tensors` | `List[torch.Tensor]` | |
|
||||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | <!-- TODO: --> |
|
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | |
|
||||||
| `doc_data` | `List[TransformerData]` | <!-- TODO: also mention it's property --> |
|
| `doc_data` | `List[TransformerData]` | |
|
||||||
|
|
||||||
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
|
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
|
||||||
|
|
||||||
<!-- TODO: -->
|
<!-- TODO: write -->
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ---------------------- | -------------- |
|
| ----------- | ---------------------- | ----------- |
|
||||||
| `arrays` | `List[List[Floats3d]]` | <!-- TODO: --> |
|
| `arrays` | `List[List[Floats3d]]` | |
|
||||||
| **RETURNS** | `FullTransformerBatch` | <!-- TODO: --> |
|
| **RETURNS** | `FullTransformerBatch` | |
|
||||||
|
|
||||||
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
|
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
|
||||||
|
|
||||||
Split a `TransformerData` object that represents a batch into a list with one
|
Split a `TransformerData` object that represents a batch into a list with one
|
||||||
`TransformerData` per `Doc`.
|
`TransformerData` per `Doc`.
|
||||||
|
|
||||||
| Name | Type | Description |
|
| Name | Type | Description |
|
||||||
| ----------- | ----------------------- | -------------- |
|
| ----------- | ----------------------- | ----------- |
|
||||||
| **RETURNS** | `List[TransformerData]` | <!-- TODO: --> |
|
| **RETURNS** | `List[TransformerData]` | |
|
||||||
|
|
||||||
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
## Span getters {#span_getters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||||
|
|
||||||
|
@ -421,11 +423,13 @@ getters using the `@registry.span_getters` decorator.
|
||||||
|
|
||||||
The following built-in functions are available:
|
The following built-in functions are available:
|
||||||
|
|
||||||
|
<!-- TODO: finish API docs -->
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------ | ------------------------------------------------------------------ |
|
| ------------------ | ------------------------------------------------------------------ |
|
||||||
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
|
| `doc_spans.v1` | Create a span for each doc (no transformation, process each text). |
|
||||||
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
|
| `sent_spans.v1` | Create a span for each sentence if sentence boundaries are set. |
|
||||||
| `strided_spans.v1` | <!-- TODO: --> |
|
| `strided_spans.v1` | |
|
||||||
|
|
||||||
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
||||||
|
|
||||||
|
|
|
@ -231,10 +231,10 @@ available pipeline components and component functions.
|
||||||
| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
|
| `morphologizer` | [`Morphologizer`](/api/morphologizer) | Assign morphological features and coarse-grained POS tags. |
|
||||||
| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
|
| `senter` | [`SentenceRecognizer`](/api/sentencerecognizer) | Assign sentence boundaries. |
|
||||||
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
|
| `sentencizer` | [`Sentencizer`](/api/sentencizer) | Add rule-based sentence segmentation without the dependency parse. |
|
||||||
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | <!-- TODO: --> |
|
| `tok2vec` | [`Tok2Vec`](/api/tok2vec) | |
|
||||||
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
|
| `transformer` | [`Transformer`](/api/transformer) | Assign the tokens and outputs of a transformer model. |
|
||||||
|
|
||||||
<!-- TODO: update with more components -->
|
<!-- TODO: finish and update with more components -->
|
||||||
|
|
||||||
<!-- TODO: explain default config and factories -->
|
<!-- TODO: explain default config and factories -->
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,6 @@ import Serialization101 from 'usage/101/\_serialization.md'
|
||||||
|
|
||||||
### Serializing the pipeline {#pipeline}
|
### Serializing the pipeline {#pipeline}
|
||||||
|
|
||||||
<!-- TODO: update this -->
|
|
||||||
|
|
||||||
When serializing the pipeline, keep in mind that this will only save out the
|
When serializing the pipeline, keep in mind that this will only save out the
|
||||||
**binary data for the individual components** to allow spaCy to restore them –
|
**binary data for the individual components** to allow spaCy to restore them –
|
||||||
not the entire objects. This is a good thing, because it makes serialization
|
not the entire objects. This is a good thing, because it makes serialization
|
||||||
|
|
Loading…
Reference in New Issue
Block a user