Merge pull request #6731 from explosion/feature/spacy-legacy

Authored by Ines Montani on 2021-01-18 12:21:03 +11:00; committed by GitHub
commit b331653ade
6 changed files with 57 additions and 181 deletions

requirements.txt

@@ -1,4 +1,5 @@
# Our libraries
spacy-legacy>=3.0.0.dev0,<3.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0rc3,<8.1.0

setup.cfg

@@ -37,6 +37,7 @@ setup_requires =
thinc>=8.0.0rc3,<8.1.0
install_requires =
# Our libraries
spacy-legacy>=3.0.0.dev0,<3.1.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
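
The same pin appears in both requirements.txt and setup.cfg, so editable checkouts and built packages resolve the same spacy-legacy range. A minimal runtime check of the pin might look like the sketch below (an assumption-laden example: it needs Python 3.8+ for importlib.metadata, and prereleases=True because the lower bound is a .dev0 release):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# The lower bound is a prerelease, so the specifier must opt in to them.
spec = SpecifierSet(">=3.0.0.dev0,<3.1.0", prereleases=True)
assert version("spacy-legacy") in spec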

spacy/errors.py

@@ -463,6 +463,12 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "
"If the function is provided by a third-party package, e.g. "
"spacy-transformers, make sure the package is installed in your "
"environment.\n\nAvailable names: {available}")
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with "

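E892 and E893 are raised from the registry.get override added to spacy/util.py below. A hedged sketch of how a failed lookup surfaces E893 (the function name here is hypothetical):

from catalogue import RegistryError
from spacy.util import registry

try:
    registry.get("architectures", "my_pkg.MissingModel.v1")  # hypothetical name
except RegistryError as err:
    print(err)  # E893, including the list of registered architecture names
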
spacy/ml/models/textcat.py

@@ -4,15 +4,13 @@ from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window
from thinc.api import with_cpu, Relu, residual
from thinc.layers.chain import init as init_chain
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...attrs import ORTH
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...tokens import Doc
from .tok2vec import get_tok2vec_width
@@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model:
return model
# TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier_v1(
width: int,
embed_size: int,
pretrained_vectors: Optional[bool],
exclusive_classes: bool,
ngram_size: int,
window_size: int,
conv_depth: int,
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
)
prefix = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(PREFIX),
dropout=dropout,
seed=11,
)
suffix = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(SUFFIX),
dropout=dropout,
seed=12,
)
shape = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(SHAPE),
dropout=dropout,
seed=13,
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
uniqued(
(lower | prefix | suffix | shape)
>> Maxout(nO=width, nI=width_nI, normalize=True),
column=cols.index(ORTH),
)
)
if pretrained_vectors:
static_vectors = StaticVectors(width)
vector_layer = trained_vectors | static_vectors
vectors_width = width * 2
else:
vector_layer = trained_vectors
vectors_width = width
tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True)
>> residual(
(
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth,
)
cnn_model = (
tok2vec
>> list2ragged()
>> ParametricAttention(width)
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
)
linear_model = build_bow_text_classifier(
nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
)
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
model.set_dim("nO", nO)
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.attrs["multi_label"] = not exclusive_classes
return model
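
Per the TODO above, this v1 ensemble is slated to move to spacy-legacy. The registry fallback added in spacy/util.py below keeps the old name resolvable; a hedged sketch, assuming spacy-legacy re-registers the function as "spacy-legacy.TextCatEnsemble.v1":

from spacy.util import registry

# "spacy.TextCatEnsemble.v1" is tried as-is first; on a miss, registry.get
# retries the name as "spacy-legacy.TextCatEnsemble.v1" before erroring.
build_v1 = registry.get("architectures", "spacy.TextCatEnsemble.v1")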
@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(
width: int, dropout: Optional[float], nO: Optional[int] = None

spacy/ml/models/tok2vec.py

@@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec(
)
# TODO: archive
@registry.architectures.register("spacy.Tok2Vec.v1")
def _build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
"""Construct a tok2vec model out of embedding and encoding subnetworks.
See https://explosion.ai/blog/deep-learning-formula-nlp
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
word vector representations.
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
"""
receptive_field = encode.attrs.get("receptive_field", 0)
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
tok2vec.set_dim("nO", encode.get_dim("nO"))
tok2vec.set_ref("embed", embed)
tok2vec.set_ref("encode", encode)
return tok2vec
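
Tok2Vec.v2 below is constructed the same way from embed and encode subnetworks. A hedged usage sketch wiring registered built-ins together (the parameter values are illustrative, and the names assume spaCy v3's stock architectures):

from spacy.util import registry

embed = registry.get("architectures", "spacy.MultiHashEmbed.v1")(
    width=96, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 2500, 2500, 2500], include_static_vectors=False,
)
encode = registry.get("architectures", "spacy.MaxoutWindowEncoder.v2")(
    width=96, window_size=1, maxout_pieces=3, depth=4,
)
tok2vec = registry.get("architectures", "spacy.Tok2Vec.v2")(embed=embed, encode=encode)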
@registry.architectures.register("spacy.Tok2Vec.v2")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@@ -276,39 +254,6 @@ def CharacterEmbed(
return model
# TODO: archive
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def _MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with maxout activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
maxout_pieces (int): The number of maxout pieces to use. Recommended
values are 2 or 3.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
model.attrs["receptive_field"] = window_size * depth
return model
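
The receptive_field attribute set here is what Tok2Vec.v1 above reads back as the padding for with_array. Worked through with the docstring's recommended values (a sketch for illustration only):

# With window_size=1 and depth=4, each convolutional layer widens the
# context by one token per side, so the stack sees 4 tokens on each side:
window_size, depth = 1, 4
receptive_field = window_size * depth  # == 4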
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
@@ -341,30 +286,6 @@ def MaxoutWindowEncoder(
return with_array(model, pad=receptive_field)
# TODO: archive
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def _MishWindowEncoder(
width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with mish activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
return model
@registry.architectures.register("spacy.MishWindowEncoder.v2")
def MishWindowEncoder(
width: int, window_size: int, depth: int

spacy/util.py

@@ -15,6 +15,7 @@ import numpy.random
import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -105,6 +106,52 @@ class registry(thinc.registry):
models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
@classmethod
def get_registry_names(cls) -> List[str]:
"""List all available registries."""
names = []
for name, value in inspect.getmembers(cls):
if not name.startswith("_") and isinstance(value, Registry):
names.append(name)
return sorted(names)
@classmethod
def get(cls, registry_name: str, func_name: str) -> Callable:
"""Get a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func = reg.get(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.get(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
"""Check whether a function is available in a registry."""
if not hasattr(cls, registry_name):
return False
reg = getattr(cls, registry_name)
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
return func_name in reg or legacy_name in reg
return func_name in reg
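
Together, get and has make the legacy lookup transparent to callers. A hedged usage sketch (assumes spacy-legacy is installed and exposes its functions under the "spacy-legacy." prefix via entry points):

from spacy.util import registry

print(registry.get_registry_names())  # e.g. ["architectures", "cli", "models", ...]
if registry.has("architectures", "spacy.TextCatEnsemble.v1"):
    # Resolves to the spacy-legacy implementation once the v1 code is archived.
    make_textcat = registry.get("architectures", "spacy.TextCatEnsemble.v1")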
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default