mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Merge pull request #6731 from explosion/feature/spacy-legacy
This commit is contained in:
commit
b331653ade
|
@ -1,4 +1,5 @@
|
|||
# Our libraries
|
||||
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0rc3,<8.1.0
|
||||
|
|
|
@ -37,6 +37,7 @@ setup_requires =
|
|||
thinc>=8.0.0rc3,<8.1.0
|
||||
install_requires =
|
||||
# Our libraries
|
||||
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
|
|
|
@ -463,6 +463,12 @@ class Errors:
|
|||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
|
||||
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
|
||||
"If you're using a custom function, make sure the code is available. "
|
||||
"If the function is provided by a third-party package, e.g. "
|
||||
"spacy-transformers, make sure the package is installed in your "
|
||||
"environment.\n\nAvailable names: {available}")
|
||||
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
||||
"'{lang}'.")
|
||||
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||
|
|
|
@ -4,15 +4,13 @@ from thinc.types import Floats2d
|
|||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
||||
from thinc.api import Relu, residual, expand_window
|
||||
from thinc.api import with_cpu, Relu, residual
|
||||
from thinc.layers.chain import init as init_chain
|
||||
|
||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
||||
from ...attrs import ORTH
|
||||
from ...util import registry
|
||||
from ..extract_ngrams import extract_ngrams
|
||||
from ..staticvectors import StaticVectors
|
||||
from ..featureextractor import FeatureExtractor
|
||||
from ...tokens import Doc
|
||||
from .tok2vec import get_tok2vec_width
|
||||
|
||||
|
@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model:
|
|||
return model
|
||||
|
||||
|
||||
# TODO: move to legacy
|
||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
|
||||
def build_text_classifier_v1(
|
||||
width: int,
|
||||
embed_size: int,
|
||||
pretrained_vectors: Optional[bool],
|
||||
exclusive_classes: bool,
|
||||
ngram_size: int,
|
||||
window_size: int,
|
||||
conv_depth: int,
|
||||
dropout: Optional[float],
|
||||
nO: Optional[int] = None,
|
||||
) -> Model:
|
||||
# Don't document this yet, I'm not sure it's right.
|
||||
cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
|
||||
with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
|
||||
lower = HashEmbed(
|
||||
nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
|
||||
)
|
||||
prefix = HashEmbed(
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(PREFIX),
|
||||
dropout=dropout,
|
||||
seed=11,
|
||||
)
|
||||
suffix = HashEmbed(
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(SUFFIX),
|
||||
dropout=dropout,
|
||||
seed=12,
|
||||
)
|
||||
shape = HashEmbed(
|
||||
nO=width // 2,
|
||||
nV=embed_size,
|
||||
column=cols.index(SHAPE),
|
||||
dropout=dropout,
|
||||
seed=13,
|
||||
)
|
||||
width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
|
||||
trained_vectors = FeatureExtractor(cols) >> with_array(
|
||||
uniqued(
|
||||
(lower | prefix | suffix | shape)
|
||||
>> Maxout(nO=width, nI=width_nI, normalize=True),
|
||||
column=cols.index(ORTH),
|
||||
)
|
||||
)
|
||||
if pretrained_vectors:
|
||||
static_vectors = StaticVectors(width)
|
||||
vector_layer = trained_vectors | static_vectors
|
||||
vectors_width = width * 2
|
||||
else:
|
||||
vector_layer = trained_vectors
|
||||
vectors_width = width
|
||||
tok2vec = vector_layer >> with_array(
|
||||
Maxout(width, vectors_width, normalize=True)
|
||||
>> residual(
|
||||
(
|
||||
expand_window(window_size=window_size)
|
||||
>> Maxout(
|
||||
nO=width, nI=width * ((window_size * 2) + 1), normalize=True
|
||||
)
|
||||
)
|
||||
)
|
||||
** conv_depth,
|
||||
pad=conv_depth,
|
||||
)
|
||||
cnn_model = (
|
||||
tok2vec
|
||||
>> list2ragged()
|
||||
>> ParametricAttention(width)
|
||||
>> reduce_sum()
|
||||
>> residual(Maxout(nO=width, nI=width))
|
||||
>> Linear(nO=nO, nI=width)
|
||||
>> Dropout(0.0)
|
||||
)
|
||||
|
||||
linear_model = build_bow_text_classifier(
|
||||
nO=nO,
|
||||
ngram_size=ngram_size,
|
||||
exclusive_classes=exclusive_classes,
|
||||
no_output_layer=False,
|
||||
)
|
||||
nO_double = nO * 2 if nO else None
|
||||
if exclusive_classes:
|
||||
output_layer = Softmax(nO=nO, nI=nO_double)
|
||||
else:
|
||||
output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
|
||||
model = (linear_model | cnn_model) >> output_layer
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
if model.has_dim("nO") is not False:
|
||||
model.set_dim("nO", nO)
|
||||
model.set_ref("output_layer", linear_model.get_ref("output_layer"))
|
||||
model.attrs["multi_label"] = not exclusive_classes
|
||||
return model
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.TextCatLowData.v1")
|
||||
def build_text_classifier_lowdata(
|
||||
width: int, dropout: Optional[float], nO: Optional[int] = None
|
||||
|
|
|
@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec(
|
|||
)
|
||||
|
||||
|
||||
# TODO: archive
|
||||
@registry.architectures.register("spacy.Tok2Vec.v1")
|
||||
def _build_Tok2Vec_model(
|
||||
embed: Model[List[Doc], List[Floats2d]],
|
||||
encode: Model[List[Floats2d], List[Floats2d]],
|
||||
) -> Model[List[Doc], List[Floats2d]]:
|
||||
"""Construct a tok2vec model out of embedding and encoding subnetworks.
|
||||
See https://explosion.ai/blog/deep-learning-formula-nlp
|
||||
|
||||
embed (Model[List[Doc], List[Floats2d]]): Embed tokens into context-independent
|
||||
word vector representations.
|
||||
encode (Model[List[Floats2d], List[Floats2d]]): Encode context into the
|
||||
embeddings, using an architecture such as a CNN, BiLSTM or transformer.
|
||||
"""
|
||||
receptive_field = encode.attrs.get("receptive_field", 0)
|
||||
tok2vec = chain(embed, with_array(encode, pad=receptive_field))
|
||||
tok2vec.set_dim("nO", encode.get_dim("nO"))
|
||||
tok2vec.set_ref("embed", embed)
|
||||
tok2vec.set_ref("encode", encode)
|
||||
return tok2vec
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.Tok2Vec.v2")
|
||||
def build_Tok2Vec_model(
|
||||
embed: Model[List[Doc], List[Floats2d]],
|
||||
|
@ -276,39 +254,6 @@ def CharacterEmbed(
|
|||
return model
|
||||
|
||||
|
||||
# TODO: archive
|
||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
|
||||
def _MaxoutWindowEncoder(
|
||||
width: int, window_size: int, maxout_pieces: int, depth: int
|
||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||
"""Encode context using convolutions with maxout activation, layer
|
||||
normalization and residual connections.
|
||||
|
||||
width (int): The input and output width. These are required to be the same,
|
||||
to allow residual connections. This value will be determined by the
|
||||
width of the inputs. Recommended values are between 64 and 300.
|
||||
window_size (int): The number of words to concatenate around each token
|
||||
to construct the convolution. Recommended value is 1.
|
||||
maxout_pieces (int): The number of maxout pieces to use. Recommended
|
||||
values are 2 or 3.
|
||||
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||
"""
|
||||
cnn = chain(
|
||||
expand_window(window_size=window_size),
|
||||
Maxout(
|
||||
nO=width,
|
||||
nI=width * ((window_size * 2) + 1),
|
||||
nP=maxout_pieces,
|
||||
dropout=0.0,
|
||||
normalize=True,
|
||||
),
|
||||
)
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", width)
|
||||
model.attrs["receptive_field"] = window_size * depth
|
||||
return model
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
|
||||
def MaxoutWindowEncoder(
|
||||
width: int, window_size: int, maxout_pieces: int, depth: int
|
||||
|
@ -341,30 +286,6 @@ def MaxoutWindowEncoder(
|
|||
return with_array(model, pad=receptive_field)
|
||||
|
||||
|
||||
# TODO: archive
|
||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
|
||||
def _MishWindowEncoder(
|
||||
width: int, window_size: int, depth: int
|
||||
) -> Model[List[Floats2d], List[Floats2d]]:
|
||||
"""Encode context using convolutions with mish activation, layer
|
||||
normalization and residual connections.
|
||||
|
||||
width (int): The input and output width. These are required to be the same,
|
||||
to allow residual connections. This value will be determined by the
|
||||
width of the inputs. Recommended values are between 64 and 300.
|
||||
window_size (int): The number of words to concatenate around each token
|
||||
to construct the convolution. Recommended value is 1.
|
||||
depth (int): The number of convolutional layers. Recommended value is 4.
|
||||
"""
|
||||
cnn = chain(
|
||||
expand_window(window_size=window_size),
|
||||
Mish(nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True),
|
||||
)
|
||||
model = clone(residual(cnn), depth)
|
||||
model.set_dim("nO", width)
|
||||
return model
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.MishWindowEncoder.v2")
|
||||
def MishWindowEncoder(
|
||||
width: int, window_size: int, depth: int
|
||||
|
|
|
@ -15,6 +15,7 @@ import numpy.random
|
|||
import numpy
|
||||
import srsly
|
||||
import catalogue
|
||||
from catalogue import RegistryError, Registry
|
||||
import sys
|
||||
import warnings
|
||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||
|
@ -105,6 +106,52 @@ class registry(thinc.registry):
|
|||
models = catalogue.create("spacy", "models", entry_points=True)
|
||||
cli = catalogue.create("spacy", "cli", entry_points=True)
|
||||
|
||||
@classmethod
|
||||
def get_registry_names(cls) -> List[str]:
|
||||
"""List all available registries."""
|
||||
names = []
|
||||
for name, value in inspect.getmembers(cls):
|
||||
if not name.startswith("_") and isinstance(value, Registry):
|
||||
names.append(name)
|
||||
return sorted(names)
|
||||
|
||||
@classmethod
|
||||
def get(cls, registry_name: str, func_name: str) -> Callable:
|
||||
"""Get a registered function from the registry."""
|
||||
# We're overwriting this classmethod so we're able to provide more
|
||||
# specific error messages and implement a fallback to spacy-legacy.
|
||||
if not hasattr(cls, registry_name):
|
||||
names = ", ".join(cls.get_registry_names()) or "none"
|
||||
raise RegistryError(Errors.E892.format(name=registry_name, available=names))
|
||||
reg = getattr(cls, registry_name)
|
||||
try:
|
||||
func = reg.get(func_name)
|
||||
except RegistryError:
|
||||
if func_name.startswith("spacy."):
|
||||
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
|
||||
try:
|
||||
return reg.get(legacy_name)
|
||||
except catalogue.RegistryError:
|
||||
pass
|
||||
available = ", ".join(sorted(reg.get_all().keys())) or "none"
|
||||
raise RegistryError(
|
||||
Errors.E893.format(
|
||||
name=func_name, reg_name=registry_name, available=available
|
||||
)
|
||||
) from None
|
||||
return func
|
||||
|
||||
@classmethod
|
||||
def has(cls, registry_name: str, func_name: str) -> bool:
|
||||
"""Check whether a function is available in a registry."""
|
||||
if not hasattr(cls, registry_name):
|
||||
return False
|
||||
reg = getattr(cls, registry_name)
|
||||
if func_name.startswith("spacy."):
|
||||
legacy_name = func_name.replace("spacy.", "spacy-legacy.")
|
||||
return func_name in reg or legacy_name in reg
|
||||
return func_name in reg
|
||||
|
||||
|
||||
class SimpleFrozenDict(dict):
|
||||
"""Simplified implementation of a frozen dict, mainly used as default
|
||||
|
|
Loading…
Reference in New Issue
Block a user