From a203e3dbb8ae3c202f4f24d2bcceed7ed7568b15 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Jan 2021 21:42:40 +1100 Subject: [PATCH 1/5] Support spacy-legacy via the registry --- spacy/errors.py | 6 +++ spacy/ml/models/textcat.py | 104 +------------------------------------ spacy/ml/models/tok2vec.py | 79 ---------------------------- spacy/util.py | 36 +++++++++++++ 4 files changed, 44 insertions(+), 181 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 8cbcbe6d9..72df888a5 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -463,6 +463,12 @@ class Errors: "issue tracker: http://github.com/explosion/spaCy/issues") # TODO: fix numbering after merging develop into master + E893 = ("Could not find function '{name}' in function registry '{reg_name}'. " + "If you're using a custom function, make sure the code is available. " + "If the function is provided by a third-party package, e.g. " + "spacy-transformers, make sure the package is installed in your " + "environment.\n\nAvailable names: {available}") + E894 = ("Unknown function registry: '{name}'.") E895 = ("The 'textcat' component received gold-standard annotations with " "multiple labels per document. In spaCy 3 you should use the " "'textcat_multilabel' component for this instead. " diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index 000ca5066..7a4ee3856 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -4,15 +4,13 @@ from thinc.types import Floats2d from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum -from thinc.api import HashEmbed, with_array, with_cpu, uniqued -from thinc.api import Relu, residual, expand_window +from thinc.api import with_cpu, Relu, residual from thinc.layers.chain import init as init_chain -from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER +from ...attrs import ORTH from ...util import registry from ..extract_ngrams import extract_ngrams from ..staticvectors import StaticVectors -from ..featureextractor import FeatureExtractor from ...tokens import Doc from .tok2vec import get_tok2vec_width @@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model: return model -# TODO: move to legacy -@registry.architectures.register("spacy.TextCatEnsemble.v1") -def build_text_classifier_v1( - width: int, - embed_size: int, - pretrained_vectors: Optional[bool], - exclusive_classes: bool, - ngram_size: int, - window_size: int, - conv_depth: int, - dropout: Optional[float], - nO: Optional[int] = None, -) -> Model: - # Don't document this yet, I'm not sure it's right. - cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed( - nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10 - ) - prefix = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(PREFIX), - dropout=dropout, - seed=11, - ) - suffix = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(SUFFIX), - dropout=dropout, - seed=12, - ) - shape = HashEmbed( - nO=width // 2, - nV=embed_size, - column=cols.index(SHAPE), - dropout=dropout, - seed=13, - ) - width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) - trained_vectors = FeatureExtractor(cols) >> with_array( - uniqued( - (lower | prefix | suffix | shape) - >> Maxout(nO=width, nI=width_nI, normalize=True), - column=cols.index(ORTH), - ) - ) - if pretrained_vectors: - static_vectors = StaticVectors(width) - vector_layer = trained_vectors | static_vectors - vectors_width = width * 2 - else: - vector_layer = trained_vectors - vectors_width = width - tok2vec = vector_layer >> with_array( - Maxout(width, vectors_width, normalize=True) - >> residual( - ( - expand_window(window_size=window_size) - >> Maxout( - nO=width, nI=width * ((window_size * 2) + 1), normalize=True - ) - ) - ) - ** conv_depth, - pad=conv_depth, - ) - cnn_model = ( - tok2vec - >> list2ragged() - >> ParametricAttention(width) - >> reduce_sum() - >> residual(Maxout(nO=width, nI=width)) - >> Linear(nO=nO, nI=width) - >> Dropout(0.0) - ) - - linear_model = build_bow_text_classifier( - nO=nO, - ngram_size=ngram_size, - exclusive_classes=exclusive_classes, - no_output_layer=False, - ) - nO_double = nO * 2 if nO else None - if exclusive_classes: - output_layer = Softmax(nO=nO, nI=nO_double) - else: - output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic() - model = (linear_model | cnn_model) >> output_layer - model.set_ref("tok2vec", tok2vec) - if model.has_dim("nO") is not False: - model.set_dim("nO", nO) - model.set_ref("output_layer", linear_model.get_ref("output_layer")) - model.attrs["multi_label"] = not exclusive_classes - return model - - @registry.architectures.register("spacy.TextCatLowData.v1") def build_text_classifier_lowdata( width: int, dropout: Optional[float], nO: Optional[int] = None diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index dd4b6deee..f013d54d4 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec( ) -# TODO: archive -@registry.architectures.register("spacy.Tok2Vec.v1") -def _build_Tok2Vec_model( - embed: Model[List[Doc], List[Floats2d]], - encode: Model[List[Floats2d], List[Floats2d]], -) -> Model[List[Doc], List[Floats2d]]: - """Construct a tok2vec model out of embedding and encoding subnetworks. - See https://explosion.ai/blog/deep-learning-formula-nlp - - embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent - word vector representations. - encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the - embeddings, using an architecture such as a CNN, BiLSTM or transformer. - """ - receptive_field = encode.attrs.get("receptive_field", 0) - tok2vec = chain(embed, with_array(encode, pad=receptive_field)) - tok2vec.set_dim("nO", encode.get_dim("nO")) - tok2vec.set_ref("embed", embed) - tok2vec.set_ref("encode", encode) - return tok2vec - - @registry.architectures.register("spacy.Tok2Vec.v2") def build_Tok2Vec_model( embed: Model[List[Doc], List[Floats2d]], @@ -276,39 +254,6 @@ def CharacterEmbed( return model -# TODO: archive -@registry.architectures.register("spacy.MaxoutWindowEncoder.v1") -def _MaxoutWindowEncoder( - width: int, window_size: int, maxout_pieces: int, depth: int -) -> Model[List[Floats2d], List[Floats2d]]: - """Encode context using convolutions with maxout activation, layer - normalization and residual connections. - - width (int): The input and output width. These are required to be the same, - to allow residual connections. This value will be determined by the - width of the inputs. Recommended values are between 64 and 300. - window_size (int): The number of words to concatenate around each token - to construct the convolution. Recommended value is 1. - maxout_pieces (int): The number of maxout pieces to use. Recommended - values are 2 or 3. - depth (int): The number of convolutional layers. Recommended value is 4. - """ - cnn = chain( - expand_window(window_size=window_size), - Maxout( - nO=width, - nI=width * ((window_size * 2) + 1), - nP=maxout_pieces, - dropout=0.0, - normalize=True, - ), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", width) - model.attrs["receptive_field"] = window_size * depth - return model - - @registry.architectures.register("spacy.MaxoutWindowEncoder.v2") def MaxoutWindowEncoder( width: int, window_size: int, maxout_pieces: int, depth: int @@ -341,30 +286,6 @@ def MaxoutWindowEncoder( return with_array(model, pad=receptive_field) -# TODO: archive -@registry.architectures.register("spacy.MishWindowEncoder.v1") -def _MishWindowEncoder( - width: int, window_size: int, depth: int -) -> Model[List[Floats2d], List[Floats2d]]: - """Encode context using convolutions with mish activation, layer - normalization and residual connections. - - width (int): The input and output width. These are required to be the same, - to allow residual connections. This value will be determined by the - width of the inputs. Recommended values are between 64 and 300. - window_size (int): The number of words to concatenate around each token - to construct the convolution. Recommended value is 1. - depth (int): The number of convolutional layers. Recommended value is 4. - """ - cnn = chain( - expand_window(window_size=window_size), - Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True), - ) - model = clone(residual(cnn), depth) - model.set_dim("nO", width) - return model - - @registry.architectures.register("spacy.MishWindowEncoder.v2") def MishWindowEncoder( width: int, window_size: int, depth: int diff --git a/spacy/util.py b/spacy/util.py index 32eb84894..5703afde2 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -105,6 +105,42 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) + @classmethod + def get(cls, registry_name: str, func_name: str) -> Callable: + """Get a registered function from the registry.""" + # We're overwriting this classmethod so we're able to provide more + # specific error messages and implement a fallback to spacy-legacy. + if not hasattr(cls, registry_name): + raise ValueError(Errors.E894.format(name=registry_name)) + reg = getattr(cls, registry_name) + try: + func = reg.get(func_name) + except catalogue.RegistryError: + if func_name.startswith("spacy."): + legacy_name = func_name.replace("spacy.", "spacy-legacy.") + try: + return reg.get(legacy_name) + except catalogue.RegistryError: + pass + available = ", ".join(sorted(reg.get_all().keys())) or "none" + raise ValueError( + Errors.E893.format( + name=func_name, reg_name=registry_name, available=available + ) + ) from None + return func + + @classmethod + def has(cls, registry_name: str, func_name: str) -> bool: + """Check whether a function is available in a registry.""" + if not hasattr(cls, registry_name): + return False + reg = getattr(cls, registry_name) + if func_name.startswith("spacy."): + legacy_name = func_name.replace("spacy.", "spacy-legacy.") + return func_name in reg or legacy_name in reg + return func_name in reg + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default From d1338966aeaff87e60ac022ee72b03ad69d9b80c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Jan 2021 21:59:06 +1100 Subject: [PATCH 2/5] Require spacy-legacy --- requirements.txt | 1 + setup.cfg | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 641c04a62..47dc54da6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ # Our libraries +spacy-legacy>=3.0.0.dev0,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.0rc3,<8.1.0 diff --git a/setup.cfg b/setup.cfg index ac4ab21c9..588155308 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,6 +37,7 @@ setup_requires = thinc>=8.0.0rc3,<8.1.0 install_requires = # Our libraries + spacy-legacy>=3.0.0.dev0,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From d12be459f6232e8876ed3797289f6593f37503e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Jan 2021 12:57:13 +1100 Subject: [PATCH 3/5] Raise RegistryError --- spacy/util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 5703afde2..d66d0ba87 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -15,6 +15,7 @@ import numpy.random import numpy import srsly import catalogue +from catalogue import RegistryError import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier @@ -111,11 +112,11 @@ class registry(thinc.registry): # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): - raise ValueError(Errors.E894.format(name=registry_name)) + raise RegistryError(Errors.E894.format(name=registry_name)) reg = getattr(cls, registry_name) try: func = reg.get(func_name) - except catalogue.RegistryError: + except RegistryError: if func_name.startswith("spacy."): legacy_name = func_name.replace("spacy.", "spacy-legacy.") try: @@ -123,7 +124,7 @@ class registry(thinc.registry): except catalogue.RegistryError: pass available = ", ".join(sorted(reg.get_all().keys())) or "none" - raise ValueError( + raise RegistryError( Errors.E893.format( name=func_name, reg_name=registry_name, available=available ) From a552db2819d6619b3482f01fbacfb2006edccbcc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Jan 2021 14:35:03 +1100 Subject: [PATCH 4/5] Include available registry names in error --- spacy/errors.py | 2 +- spacy/util.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 72df888a5..cc85c157f 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -468,7 +468,7 @@ class Errors: "If the function is provided by a third-party package, e.g. " "spacy-transformers, make sure the package is installed in your " "environment.\n\nAvailable names: {available}") - E894 = ("Unknown function registry: '{name}'.") + E894 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}") E895 = ("The 'textcat' component received gold-standard annotations with " "multiple labels per document. In spaCy 3 you should use the " "'textcat_multilabel' component for this instead. " diff --git a/spacy/util.py b/spacy/util.py index d66d0ba87..67dccba3a 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -15,7 +15,7 @@ import numpy.random import numpy import srsly import catalogue -from catalogue import RegistryError +from catalogue import RegistryError, Registry import sys import warnings from packaging.specifiers import SpecifierSet, InvalidSpecifier @@ -106,13 +106,23 @@ class registry(thinc.registry): models = catalogue.create("spacy", "models", entry_points=True) cli = catalogue.create("spacy", "cli", entry_points=True) + @classmethod + def get_registry_names(cls) -> List[str]: + """List all available registries.""" + names = [] + for name, value in inspect.getmembers(cls): + if not name.startswith("_") and isinstance(value, Registry): + names.append(name) + return sorted(names) + @classmethod def get(cls, registry_name: str, func_name: str) -> Callable: """Get a registered function from the registry.""" # We're overwriting this classmethod so we're able to provide more # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): - raise RegistryError(Errors.E894.format(name=registry_name)) + names = ", ".join(cls.get_registry_names()) or "none" + raise RegistryError(Errors.E894.format(name=registry_name, available=names)) reg = getattr(cls, registry_name) try: func = reg.get(func_name) From f4d547b73c31e7578dcc676b9276dab3dce851ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 18 Jan 2021 11:43:45 +1100 Subject: [PATCH 5/5] Fix error code --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 67dccba3a..77aa712d1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -122,7 +122,7 @@ class registry(thinc.registry): # specific error messages and implement a fallback to spacy-legacy. if not hasattr(cls, registry_name): names = ", ".join(cls.get_registry_names()) or "none" - raise RegistryError(Errors.E894.format(name=registry_name, available=names)) + raise RegistryError(Errors.E892.format(name=registry_name, available=names)) reg = getattr(cls, registry_name) try: func = reg.get(func_name)