diff --git a/spacy/errors.py b/spacy/errors.py index 8cbcbe6d9..72df888a5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -463,6 +463,12 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " + "If you're using a custom function, make sure the code is available. " + "If the function is provided by a third-party package, e.g. " + "spacy-transformers, make sure the package is installed in your " + "environment.\n\nAvailable names: {available}") + E894 = ("Unknown function registry: '{name}'.") E895 = ("The 'textcat' component received gold-standard annotations with " "multiple labels per document. In spaCy 3 you should use the " "'textcat_multilabel' component for this instead. " diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 000ca5066..7a4ee3856 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -4,15 +4,13 @@ from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import HashEmbed, with_array, with_cpu, uniqued -from thinc.api import Relu, residual, expand_window +from thinc.api import with_cpu, Relu, residual from thinc.layers.chain import init as init_chain -from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER +from ...attrs import ORTH from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors -from ..featureextractor import FeatureExtractor from ...tokens import Doc from .tok2vec import get_tok2vec_width @@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model: return model -# TODO: move to legacy -@registry.architectures.register("spacy.TextCatEnsemble.v1") -def build_text_classifier_v1( - width: int, - embed_size: int, - pretrained_vectors: Optional[bool], - exclusive_classes: bool, - ngram_size: int, - window_size: int, - conv_depth: int, - dropout: Optional[float], - nO: Optional[int] = None, -) -> Model: - # Don't document this yet, I'm not sure it's right. - cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed( - nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10 - ) - prefix = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(PREFIX), - dropout=dropout, - seed=11, - ) - suffix = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(SUFFIX), - dropout=dropout, - seed=12, - ) - shape = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(SHAPE), - dropout=dropout, - seed=13, - ) - width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) - trained_vectors = FeatureExtractor(cols) >> with_array( - uniqued( - (lower | prefix | suffix | shape) - >> Maxout(nO=width, nI=width_nI, normalize=True), - column=cols.index(ORTH), - ) - ) - if pretrained_vectors: - static_vectors = StaticVectors(width) - vector_layer = trained_vectors | static_vectors - vectors_width = width * 2 - else: - vector_layer = trained_vectors - vectors_width = width - tok2vec = vector_layer >> with_array( - Maxout(width, vectors_width, normalize=True) - >> residual( - ( - expand_window(window_size=window_size) - >> Maxout( - nO=width, nI=width * ((window_size * 2) + 1), normalize=True - ) - ) - ) - ** conv_depth, - pad=conv_depth, - ) - cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) - ) - - linear_model = build_bow_text_classifier( - nO=nO, - ngram_size=ngram_size, - exclusive_classes=exclusive_classes, - no_output_layer=False, - ) - nO_double = nO * 2 if nO else None - if exclusive_classes: - output_layer = Softmax(nO=nO, nI=nO_double) - else: - output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - model = (linear_model | cnn_model) >> output_layer - model.set_ref("tok2vec", tok2vec) - if model.has_dim("nO") is not False: - model.set_dim("nO", nO) - model.set_ref("output_layer", linear_model.get_ref("output_layer")) - model.attrs["multi_label"] = not exclusive_classes - return model - - @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( width: int, dropout: Optional[float], nO: Optional[int] = None diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index dd4b6deee..f013d54d4 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec( ) -# TODO: archive -@registry.architectures.register("spacy.Tok2Vec.v1") -def _build_Tok2Vec_model( - embed: Model[List[Doc], List[Floats2d]], - encode: Model[List[Floats2d], List[Floats2d]], -) -> Model[List[Doc], List[Floats2d]]: - """Construct a tok2vec model out of embedding and encoding subnetworks. - See https://explosion.ai/blog/deep-learning-formula-nlp - - embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent - word vector representations. - encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the - embeddings, using an architecture such as a CNN, BiLSTM or transformer. - """ - receptive_field = encode.attrs.get("receptive_field", 0) - tok2vec = chain(embed, with_array(encode, pad=receptive_field)) - tok2vec.set_dim("nO", encode.get_dim("nO")) - tok2vec.set_ref("embed", embed) - tok2vec.set_ref("encode", encode) - return tok2vec - - @registry.architectures.register("spacy.Tok2Vec.v2") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], @@ -276,39 +254,6 @@ def CharacterEmbed( return model -# TODO: archive -@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def _MaxoutWindowEncoder( - width: int, window_size: int, maxout_pieces: int, depth: int -) -> Model[List[Floats2d], List[Floats2d]]: - """Encode context using convolutions with maxout activation, layer - normalization and residual connections. - - width (int): The input and output width. These are required to be the same, - to allow residual connections. This value will be determined by the - width of the inputs. Recommended values are between 64 and 300. - window_size (int): The number of words to concatenate around each token - to construct the convolution. Recommended value is 1. - maxout_pieces (int): The number of maxout pieces to use. Recommended - values are 2 or 3. - depth (int): The number of convolutional layers. Recommended value is 4. - """ - cnn = chain( - expand_window(window_size=window_size), - Maxout( - nO=width, - nI=width * ((window_size * 2) + 1), - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", width) - model.attrs["receptive_field"] = window_size * depth - return model - - @registry.architectures.register("spacy.MaxoutWindowEncoder.v2") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int @@ -341,30 +286,6 @@ def MaxoutWindowEncoder( return with_array(model, pad=receptive_field) -# TODO: archive -@registry.architectures.register("spacy.MishWindowEncoder.v1") -def _MishWindowEncoder( - width: int, window_size: int, depth: int -) -> Model[List[Floats2d], List[Floats2d]]: - """Encode context using convolutions with mish activation, layer - normalization and residual connections. - - width (int): The input and output width. These are required to be the same, - to allow residual connections. This value will be determined by the - width of the inputs. Recommended values are between 64 and 300. - window_size (int): The number of words to concatenate around each token - to construct the convolution. Recommended value is 1. - depth (int): The number of convolutional layers. Recommended value is 4. - """ - cnn = chain( - expand_window(window_size=window_size), - Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", width) - return model - - @registry.architectures.register("spacy.MishWindowEncoder.v2") def MishWindowEncoder( width: int, window_size: int, depth: int diff --git a/spacy/util.py b/spacy/util.py index 32eb84894..5703afde2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -105,6 +105,42 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) + @classmethod + def get(cls, registry_name: str, func_name: str) -> Callable: + """Get a registered function from the registry.""" + # We're overwriting this classmethod so we're able to provide more + # specific error messages and implement a fallback to spacy-legacy. + if not hasattr(cls, registry_name): + raise ValueError(Errors.E894.format(name=registry_name)) + reg = getattr(cls, registry_name) + try: + func = reg.get(func_name) + except catalogue.RegistryError: + if func_name.startswith("spacy."): + legacy_name = func_name.replace("spacy.", "spacy-legacy.") + try: + return reg.get(legacy_name) + except catalogue.RegistryError: + pass + available = ", ".join(sorted(reg.get_all().keys())) or "none" + raise ValueError( + Errors.E893.format( + name=func_name, reg_name=registry_name, available=available + ) + ) from None + return func + + @classmethod + def has(cls, registry_name: str, func_name: str) -> bool: + """Check whether a function is available in a registry.""" + if not hasattr(cls, registry_name): + return False + reg = getattr(cls, registry_name) + if func_name.startswith("spacy."): + legacy_name = func_name.replace("spacy.", "spacy-legacy.") + return func_name in reg or legacy_name in reg + return func_name in reg + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default