Support spacy-legacy via the registry

commit a203e3dbb8 (parent 330f9818c0)
mirror of https://github.com/explosion/spaCy.git
@@ -463,6 +463,12 @@ class Errors:
             "issue tracker: http://github.com/explosion/spaCy/issues")
 
     # TODO: fix numbering after merging develop into master
+    E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
+            "If you're using a custom function, make sure the code is available. "
+            "If the function is provided by a third-party package, e.g. "
+            "spacy-transformers, make sure the package is installed in your "
+            "environment.\n\nAvailable names: {available}")
+    E894 = ("Unknown function registry: '{name}'.")
     E895 = ("The 'textcat' component received gold-standard annotations with "
             "multiple labels per document. In spaCy 3 you should use the "
             "'textcat_multilabel' component for this instead. "

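For reference, the new codes are plain str.format templates. A quick illustrative sketch of how they render (the placeholder values below are made up):

    # Illustrative only: rendering the new error templates by hand.
    from spacy.errors import Errors

    print(Errors.E894.format(name="architecturez"))
    # Unknown function registry: 'architecturez'.

    print(Errors.E893.format(
        name="spacy.MadeUpLayer.v1", reg_name="architectures", available="none"
    ))
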
@@ -4,15 +4,13 @@ from thinc.types import Floats2d
 from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
 from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
 from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
-from thinc.api import HashEmbed, with_array, with_cpu, uniqued
-from thinc.api import Relu, residual, expand_window
+from thinc.api import with_cpu, Relu, residual
 from thinc.layers.chain import init as init_chain
 
-from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
+from ...attrs import ORTH
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
 from ..staticvectors import StaticVectors
-from ..featureextractor import FeatureExtractor
 from ...tokens import Doc
 from .tok2vec import get_tok2vec_width
 

@@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model:
     return model
 
 
-# TODO: move to legacy
-@registry.architectures.register("spacy.TextCatEnsemble.v1")
-def build_text_classifier_v1(
-    width: int,
-    embed_size: int,
-    pretrained_vectors: Optional[bool],
-    exclusive_classes: bool,
-    ngram_size: int,
-    window_size: int,
-    conv_depth: int,
-    dropout: Optional[float],
-    nO: Optional[int] = None,
-) -> Model:
-    # Don't document this yet, I'm not sure it's right.
-    cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
-    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
-        lower = HashEmbed(
-            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
-        )
-        prefix = HashEmbed(
-            nO=width // 2,
-            nV=embed_size,
-            column=cols.index(PREFIX),
-            dropout=dropout,
-            seed=11,
-        )
-        suffix = HashEmbed(
-            nO=width // 2,
-            nV=embed_size,
-            column=cols.index(SUFFIX),
-            dropout=dropout,
-            seed=12,
-        )
-        shape = HashEmbed(
-            nO=width // 2,
-            nV=embed_size,
-            column=cols.index(SHAPE),
-            dropout=dropout,
-            seed=13,
-        )
-        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
-        trained_vectors = FeatureExtractor(cols) >> with_array(
-            uniqued(
-                (lower | prefix | suffix | shape)
-                >> Maxout(nO=width, nI=width_nI, normalize=True),
-                column=cols.index(ORTH),
-            )
-        )
-        if pretrained_vectors:
-            static_vectors = StaticVectors(width)
-            vector_layer = trained_vectors | static_vectors
-            vectors_width = width * 2
-        else:
-            vector_layer = trained_vectors
-            vectors_width = width
-        tok2vec = vector_layer >> with_array(
-            Maxout(width, vectors_width, normalize=True)
-            >> residual(
-                (
-                    expand_window(window_size=window_size)
-                    >> Maxout(
-                        nO=width, nI=width * ((window_size * 2) + 1), normalize=True
-                    )
-                )
-            )
-            ** conv_depth,
-            pad=conv_depth,
-        )
-        cnn_model = (
-            tok2vec
-            >> list2ragged()
-            >> ParametricAttention(width)
-            >> reduce_sum()
-            >> residual(Maxout(nO=width, nI=width))
-            >> Linear(nO=nO, nI=width)
-            >> Dropout(0.0)
-        )
-
-        linear_model = build_bow_text_classifier(
-            nO=nO,
-            ngram_size=ngram_size,
-            exclusive_classes=exclusive_classes,
-            no_output_layer=False,
-        )
-        nO_double = nO * 2 if nO else None
-        if exclusive_classes:
-            output_layer = Softmax(nO=nO, nI=nO_double)
-        else:
-            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
-        model = (linear_model | cnn_model) >> output_layer
-        model.set_ref("tok2vec", tok2vec)
-        if model.has_dim("nO") is not False:
-            model.set_dim("nO", nO)
-        model.set_ref("output_layer", linear_model.get_ref("output_layer"))
-        model.attrs["multi_label"] = not exclusive_classes
-        return model
-
-
 @registry.architectures.register("spacy.TextCatLowData.v1")
 def build_text_classifier_lowdata(
     width: int, dropout: Optional[float], nO: Optional[int] = None

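The v1 ensemble is flagged to move into the spacy-legacy package (see the TODO above). A minimal sketch of the intended effect, assuming spacy-legacy re-registers the function as "spacy-legacy.TextCatEnsemble.v1": the old name keeps resolving through the registry fallback added further down in this commit.

    # Sketch, assuming spacy-legacy provides "spacy-legacy.TextCatEnsemble.v1".
    from spacy.util import registry

    # Resolves the spacy-legacy registration once the "spacy." name is gone:
    build_v1 = registry.get("architectures", "spacy.TextCatEnsemble.v1")
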
@@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec(
     )
 
 
-# TODO: archive
-@registry.architectures.register("spacy.Tok2Vec.v1")
-def _build_Tok2Vec_model(
-    embed: Model[List[Doc], List[Floats2d]],
-    encode: Model[List[Floats2d], List[Floats2d]],
-) -> Model[List[Doc], List[Floats2d]]:
-    """Construct a tok2vec model out of embedding and encoding subnetworks.
-    See https://explosion.ai/blog/deep-learning-formula-nlp
-
-    embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
-        word vector representations.
-    encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
-        embeddings, using an architecture such as a CNN, BiLSTM or transformer.
-    """
-    receptive_field = encode.attrs.get("receptive_field", 0)
-    tok2vec = chain(embed, with_array(encode, pad=receptive_field))
-    tok2vec.set_dim("nO", encode.get_dim("nO"))
-    tok2vec.set_ref("embed", embed)
-    tok2vec.set_ref("encode", encode)
-    return tok2vec
-
-
 @registry.architectures.register("spacy.Tok2Vec.v2")
 def build_Tok2Vec_model(
     embed: Model[List[Doc], List[Floats2d]],

@@ -276,39 +254,6 @@ def CharacterEmbed(
     return model
 
 
-# TODO: archive
-@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
-def _MaxoutWindowEncoder(
-    width: int, window_size: int, maxout_pieces: int, depth: int
-) -> Model[List[Floats2d], List[Floats2d]]:
-    """Encode context using convolutions with maxout activation, layer
-    normalization and residual connections.
-
-    width (int): The input and output width. These are required to be the same,
-        to allow residual connections. This value will be determined by the
-        width of the inputs. Recommended values are between 64 and 300.
-    window_size (int): The number of words to concatenate around each token
-        to construct the convolution. Recommended value is 1.
-    maxout_pieces (int): The number of maxout pieces to use. Recommended
-        values are 2 or 3.
-    depth (int): The number of convolutional layers. Recommended value is 4.
-    """
-    cnn = chain(
-        expand_window(window_size=window_size),
-        Maxout(
-            nO=width,
-            nI=width * ((window_size * 2) + 1),
-            nP=maxout_pieces,
-            dropout=0.0,
-            normalize=True,
-        ),
-    )
-    model = clone(residual(cnn), depth)
-    model.set_dim("nO", width)
-    model.attrs["receptive_field"] = window_size * depth
-    return model
-
-
 @registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
 def MaxoutWindowEncoder(
     width: int, window_size: int, maxout_pieces: int, depth: int

@@ -341,30 +286,6 @@ def MaxoutWindowEncoder(
     return with_array(model, pad=receptive_field)
 
 
-# TODO: archive
-@registry.architectures.register("spacy.MishWindowEncoder.v1")
-def _MishWindowEncoder(
-    width: int, window_size: int, depth: int
-) -> Model[List[Floats2d], List[Floats2d]]:
-    """Encode context using convolutions with mish activation, layer
-    normalization and residual connections.
-
-    width (int): The input and output width. These are required to be the same,
-        to allow residual connections. This value will be determined by the
-        width of the inputs. Recommended values are between 64 and 300.
-    window_size (int): The number of words to concatenate around each token
-        to construct the convolution. Recommended value is 1.
-    depth (int): The number of convolutional layers. Recommended value is 4.
-    """
-    cnn = chain(
-        expand_window(window_size=window_size),
-        Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
-    )
-    model = clone(residual(cnn), depth)
-    model.set_dim("nO", width)
-    return model
-
-
 @registry.architectures.register("spacy.MishWindowEncoder.v2")
 def MishWindowEncoder(
     width: int, window_size: int, depth: int

@@ -105,6 +105,42 @@ class registry(thinc.registry):
     models = catalogue.create("spacy", "models", entry_points=True)
     cli = catalogue.create("spacy", "cli", entry_points=True)
 
+    @classmethod
+    def get(cls, registry_name: str, func_name: str) -> Callable:
+        """Get a registered function from the registry."""
+        # We're overwriting this classmethod so we're able to provide more
+        # specific error messages and implement a fallback to spacy-legacy.
+        if not hasattr(cls, registry_name):
+            raise ValueError(Errors.E894.format(name=registry_name))
+        reg = getattr(cls, registry_name)
+        try:
+            func = reg.get(func_name)
+        except catalogue.RegistryError:
+            if func_name.startswith("spacy."):
+                legacy_name = func_name.replace("spacy.", "spacy-legacy.")
+                try:
+                    return reg.get(legacy_name)
+                except catalogue.RegistryError:
+                    pass
+            available = ", ".join(sorted(reg.get_all().keys())) or "none"
+            raise ValueError(
+                Errors.E893.format(
+                    name=func_name, reg_name=registry_name, available=available
+                )
+            ) from None
+        return func
+
+    @classmethod
+    def has(cls, registry_name: str, func_name: str) -> bool:
+        """Check whether a function is available in a registry."""
+        if not hasattr(cls, registry_name):
+            return False
+        reg = getattr(cls, registry_name)
+        if func_name.startswith("spacy."):
+            legacy_name = func_name.replace("spacy.", "spacy-legacy.")
+            return func_name in reg or legacy_name in reg
+        return func_name in reg
+
 
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default

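Taken together: get() now falls back from a missing "spacy."-prefixed name to the matching "spacy-legacy." name before raising E893, and has() mirrors that check without raising. A hedged usage sketch ("spacy.Foo.v1" is a hypothetical name, not a real spaCy function):

    from spacy.util import registry

    # Hypothetical function, registered only under the legacy prefix:
    @registry.architectures.register("spacy-legacy.Foo.v1")
    def make_foo():
        return "foo"

    # get() misses "spacy.Foo.v1", then falls back to "spacy-legacy.Foo.v1":
    assert registry.get("architectures", "spacy.Foo.v1") is make_foo
    assert registry.has("architectures", "spacy.Foo.v1")

    # An unknown registry name raises E894 instead:
    # registry.get("architecturez", "spacy.Foo.v1")  # -> ValueError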