Merge pull request #6731 from explosion/feature/spacy-legacy

Authored by Ines Montani on 2021-01-18 12:21:03 +11:00; committed by GitHub
commit b331653ade
6 changed files with 57 additions and 181 deletions

requirements.txt

@@ -1,4 +1,5 @@
# Our libraries
spacy-legacy>=3.0.0.dev0,<3.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.0rc3,<8.1.0

setup.cfg

@@ -37,6 +37,7 @@ setup_requires =
thinc>=8.0.0rc3,<8.1.0
install_requires =
# Our libraries
spacy-legacy>=3.0.0.dev0,<3.1.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
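
The same pin appears in both requirements.txt and setup.cfg, so editable checkouts and built packages resolve the same spacy-legacy range. A minimal runtime check of the pin might look like the sketch below (an assumption-laden example: it needs Python 3.8+ for importlib.metadata, and prereleases=True because the lower bound is a .dev0 release):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

# The lower bound is a prerelease, so the specifier must opt in to them.
spec = SpecifierSet(">=3.0.0.dev0,<3.1.0", prereleases=True)
assert version("spacy-legacy") in spec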

spacy/errors.py

@@ -463,6 +463,12 @@ class Errors:
"issue tracker: http://github.com/explosion/spaCy/issues")
# TODO: fix numbering after merging develop into master
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
"If you're using a custom function, make sure the code is available. "
"If the function is provided by a third-party package, e.g. "
"spacy-transformers, make sure the package is installed in your "
"environment.\n\nAvailable names: {available}")
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
"'{lang}'.")
E895 = ("The 'textcat' component received gold-standard annotations with "

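E892 and E893 are raised from the registry.get override added to spacy/util.py below. A hedged sketch of how a failed lookup surfaces E893 (the function name here is hypothetical):

from catalogue import RegistryError
from spacy.util import registry

try:
    registry.get("architectures", "my_pkg.MissingModel.v1")  # hypothetical name
except RegistryError as err:
    print(err)  # E893, including the list of registered architecture names
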
spacy/ml/models/textcat.py

@@ -4,15 +4,13 @@ from thinc.types import Floats2d
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
from thinc.api import Relu, residual, expand_window
from thinc.api import with_cpu, Relu, residual
from thinc.layers.chain import init as init_chain
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
from ...attrs import ORTH
from ...util import registry
from ..extract_ngrams import extract_ngrams
from ..staticvectors import StaticVectors
from ..featureextractor import FeatureExtractor
from ...tokens import Doc
from .tok2vec import get_tok2vec_width
@@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model:
return model
# TODO: move to legacy
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier_v1(
width: int,
embed_size: int,
pretrained_vectors: Optional[bool],
exclusive_classes: bool,
ngram_size: int,
window_size: int,
conv_depth: int,
dropout: Optional[float],
nO: Optional[int] = None,
) -> Model:
# Don't document this yet, I'm not sure it's right.
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
lower = HashEmbed(
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
)
prefix = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(PREFIX),
dropout=dropout,
seed=11,
)
suffix = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(SUFFIX),
dropout=dropout,
seed=12,
)
shape = HashEmbed(
nO=width // 2,
nV=embed_size,
column=cols.index(SHAPE),
dropout=dropout,
seed=13,
)
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
trained_vectors = FeatureExtractor(cols) >> with_array(
uniqued(
(lower | prefix | suffix | shape)
>> Maxout(nO=width, nI=width_nI, normalize=True),
column=cols.index(ORTH),
)
)
if pretrained_vectors:
static_vectors = StaticVectors(width)
vector_layer = trained_vectors | static_vectors
vectors_width = width * 2
else:
vector_layer = trained_vectors
vectors_width = width
tok2vec = vector_layer >> with_array(
Maxout(width, vectors_width, normalize=True)
>> residual(
(
expand_window(window_size=window_size)
>> Maxout(
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
)
)
)
** conv_depth,
pad=conv_depth,
)
cnn_model = (
tok2vec
>> list2ragged()
>> ParametricAttention(width)
>> reduce_sum()
>> residual(Maxout(nO=width, nI=width))
>> Linear(nO=nO, nI=width)
>> Dropout(0.0)
)
linear_model = build_bow_text_classifier(
nO=nO,
ngram_size=ngram_size,
exclusive_classes=exclusive_classes,
no_output_layer=False,
)
nO_double = nO * 2 if nO else None
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nO_double)
else:
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
model = (linear_model | cnn_model) >> output_layer
model.set_ref("tok2vec", tok2vec)
if model.has_dim("nO") is not False:
model.set_dim("nO", nO)
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
model.attrs["multi_label"] = not exclusive_classes
return model
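
Per the TODO above, this v1 ensemble is slated to move to spacy-legacy. The registry fallback added in spacy/util.py below keeps the old name resolvable; a hedged sketch, assuming spacy-legacy re-registers the function as "spacy-legacy.TextCatEnsemble.v1":

from spacy.util import registry

# "spacy.TextCatEnsemble.v1" is tried as-is first; on a miss, registry.get
# retries the name as "spacy-legacy.TextCatEnsemble.v1" before erroring.
build_v1 = registry.get("architectures", "spacy.TextCatEnsemble.v1")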
@registry.architectures.register("spacy.TextCatLowData.v1")
def build_text_classifier_lowdata(
width: int, dropout: Optional[float], nO: Optional[int] = None

spacy/ml/models/tok2vec.py

@@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec(
)
# TODO: archive
@registry.architectures.register("spacy.Tok2Vec.v1")
def _build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
"""Construct a tok2vec model out of embedding and encoding subnetworks.
See https://explosion.ai/blog/deep-learning-formula-nlp
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
word vector representations.
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
"""
receptive_field = encode.attrs.get("receptive_field", 0)
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
tok2vec.set_dim("nO", encode.get_dim("nO"))
tok2vec.set_ref("embed", embed)
tok2vec.set_ref("encode", encode)
return tok2vec
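
Tok2Vec.v2 below is constructed the same way from embed and encode subnetworks. A hedged usage sketch wiring registered built-ins together (the parameter values are illustrative, and the names assume spaCy v3's stock architectures):

from spacy.util import registry

embed = registry.get("architectures", "spacy.MultiHashEmbed.v1")(
    width=96, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"],
    rows=[5000, 2500, 2500, 2500], include_static_vectors=False,
)
encode = registry.get("architectures", "spacy.MaxoutWindowEncoder.v2")(
    width=96, window_size=1, maxout_pieces=3, depth=4,
)
tok2vec = registry.get("architectures", "spacy.Tok2Vec.v2")(embed=embed, encode=encode)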
@registry.architectures.register("spacy.Tok2Vec.v2")
def build_Tok2Vec_model(
embed: Model[List[Doc], List[Floats2d]],
@@ -276,39 +254,6 @@ def CharacterEmbed(
return model
# TODO: archive
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def _MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with maxout activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
maxout_pieces (int): The number of maxout pieces to use. Recommended
values are 2 or 3.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Maxout(
nO=width,
nI=width * ((window_size * 2) + 1),
nP=maxout_pieces,
dropout=0.0,
normalize=True,
),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
model.attrs["receptive_field"] = window_size * depth
return model
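
The receptive_field attribute set here is what Tok2Vec.v1 above reads back as the padding for with_array. Worked through with the docstring's recommended values (a sketch for illustration only):

# With window_size=1 and depth=4, each convolutional layer widens the
# context by one token per side, so the stack sees 4 tokens on each side:
window_size, depth = 1, 4
receptive_field = window_size * depth  # == 4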
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
def MaxoutWindowEncoder(
width: int, window_size: int, maxout_pieces: int, depth: int
@@ -341,30 +286,6 @@ def MaxoutWindowEncoder(
return with_array(model, pad=receptive_field)
# TODO: archive
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def _MishWindowEncoder(
width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
"""Encode context using convolutions with mish activation, layer
normalization and residual connections.
width (int): The input and output width. These are required to be the same,
to allow residual connections. This value will be determined by the
width of the inputs. Recommended values are between 64 and 300.
window_size (int): The number of words to concatenate around each token
to construct the convolution. Recommended value is 1.
depth (int): The number of convolutional layers. Recommended value is 4.
"""
cnn = chain(
expand_window(window_size=window_size),
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
)
model = clone(residual(cnn), depth)
model.set_dim("nO", width)
return model
@registry.architectures.register("spacy.MishWindowEncoder.v2")
def MishWindowEncoder(
width: int, window_size: int, depth: int

spacy/util.py

@@ -15,6 +15,7 @@ import numpy.random
import numpy
import srsly
import catalogue
from catalogue import RegistryError, Registry
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
@@ -105,6 +106,52 @@ class registry(thinc.registry):
models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
@classmethod
def get_registry_names(cls) -> List[str]:
"""List all available registries."""
names = []
for name, value in inspect.getmembers(cls):
if not name.startswith("_") and isinstance(value, Registry):
names.append(name)
return sorted(names)
@classmethod
def get(cls, registry_name: str, func_name: str) -> Callable:
"""Get a registered function from the registry."""
# We're overwriting this classmethod so we're able to provide more
# specific error messages and implement a fallback to spacy-legacy.
if not hasattr(cls, registry_name):
names = ", ".join(cls.get_registry_names()) or "none"
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
reg = getattr(cls, registry_name)
try:
func = reg.get(func_name)
except RegistryError:
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
try:
return reg.get(legacy_name)
except catalogue.RegistryError:
pass
available = ", ".join(sorted(reg.get_all().keys())) or "none"
raise RegistryError(
Errors.E893.format(
name=func_name, reg_name=registry_name, available=available
)
) from None
return func
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
"""Check whether a function is available in a registry."""
if not hasattr(cls, registry_name):
return False
reg = getattr(cls, registry_name)
if func_name.startswith("spacy."):
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
return func_name in reg or legacy_name in reg
return func_name in reg
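
Together, get and has make the legacy lookup transparent to callers. A hedged usage sketch (assumes spacy-legacy is installed and exposes its functions under the "spacy-legacy." prefix via entry points):

from spacy.util import registry

print(registry.get_registry_names())  # e.g. ["architectures", "cli", "models", ...]
if registry.has("architectures", "spacy.TextCatEnsemble.v1"):
    # Resolves to the spacy-legacy implementation once the v1 code is archived.
    make_textcat = registry.get("architectures", "spacy.TextCatEnsemble.v1")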
class SimpleFrozenDict(dict):
"""Simplified implementation of a frozen dict, mainly used as default