mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Merge pull request #6731 from explosion/feature/spacy-legacy
This commit is contained in:
commit
b331653ade
|
@ -1,4 +1,5 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
thinc>=8.0.0rc3,<8.1.0
|
thinc>=8.0.0rc3,<8.1.0
|
||||||
|
|
|
@ -37,6 +37,7 @@ setup_requires =
|
||||||
thinc>=8.0.0rc3,<8.1.0
|
thinc>=8.0.0rc3,<8.1.0
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
|
spacy-legacy>=3.0.0.dev0,<3.1.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
|
|
|
@ -463,6 +463,12 @@ class Errors:
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E892 = ("Unknown function registry: '{name}'.\n\nAvailable names: {available}")
|
||||||
|
E893 = ("Could not find function '{name}' in function registry '{reg_name}'. "
|
||||||
|
"If you're using a custom function, make sure the code is available. "
|
||||||
|
"If the function is provided by a third-party package, e.g. "
|
||||||
|
"spacy-transformers, make sure the package is installed in your "
|
||||||
|
"environment.\n\nAvailable names: {available}")
|
||||||
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
E894 = ("The 'noun_chunks' syntax iterator is not implemented for language "
|
||||||
"'{lang}'.")
|
"'{lang}'.")
|
||||||
E895 = ("The 'textcat' component received gold-standard annotations with "
|
E895 = ("The 'textcat' component received gold-standard annotations with "
|
||||||
|
|
|
@ -4,15 +4,13 @@ from thinc.types import Floats2d
|
||||||
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
from thinc.api import Model, reduce_mean, Linear, list2ragged, Logistic
|
||||||
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
from thinc.api import chain, concatenate, clone, Dropout, ParametricAttention
|
||||||
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
from thinc.api import SparseLinear, Softmax, softmax_activation, Maxout, reduce_sum
|
||||||
from thinc.api import HashEmbed, with_array, with_cpu, uniqued
|
from thinc.api import with_cpu, Relu, residual
|
||||||
from thinc.api import Relu, residual, expand_window
|
|
||||||
from thinc.layers.chain import init as init_chain
|
from thinc.layers.chain import init as init_chain
|
||||||
|
|
||||||
from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
|
from ...attrs import ORTH
|
||||||
from ...util import registry
|
from ...util import registry
|
||||||
from ..extract_ngrams import extract_ngrams
|
from ..extract_ngrams import extract_ngrams
|
||||||
from ..staticvectors import StaticVectors
|
from ..staticvectors import StaticVectors
|
||||||
from ..featureextractor import FeatureExtractor
|
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
from .tok2vec import get_tok2vec_width
|
from .tok2vec import get_tok2vec_width
|
||||||
|
|
||||||
|
@ -115,104 +113,6 @@ def init_ensemble_textcat(model, X, Y) -> Model:
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
# TODO: move to legacy
|
|
||||||
@registry.architectures.register("spacy.TextCatEnsemble.v1")
def build_text_classifier_v1(
    width: int,
    embed_size: int,
    pretrained_vectors: Optional[bool],
    exclusive_classes: bool,
    ngram_size: int,
    window_size: int,
    conv_depth: int,
    dropout: Optional[float],
    nO: Optional[int] = None,
) -> Model:
    """Build the "spacy.TextCatEnsemble.v1" model: the output of
    build_bow_text_classifier concatenated with a CNN + attention model,
    followed by a final output layer.

    width (int): Output width of the lower-case embedding, of the Maxout
        layers and of the attention layer. The prefix, suffix and shape
        embeddings use width // 2.
    embed_size (int): Passed as nV to every HashEmbed table.
    pretrained_vectors (Optional[bool]): If truthy, StaticVectors(width) is
        concatenated to the trained embeddings.
    exclusive_classes (bool): Selects a Softmax output layer when True,
        otherwise Linear >> Dropout >> Logistic. Its negation is stored in
        model.attrs["multi_label"].
    ngram_size (int): Forwarded to build_bow_text_classifier.
    window_size (int): Window size given to expand_window in each
        convolutional layer.
    conv_depth (int): Number of times the residual convolution is cloned;
        also used as the pad value of the surrounding with_array.
    dropout (Optional[float]): Dropout passed to the HashEmbed tables.
    nO (Optional[int]): Output dimension, if already known.
    RETURNS (Model): The combined model, with "tok2vec" and "output_layer"
        references set.
    """
    # Don't document this yet, I'm not sure it's right.
    # Attribute columns produced by FeatureExtractor; the HashEmbed tables
    # and uniqued() below address them by position in this list.
    cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        # One hash embedding table per lexical attribute, each with its own
        # seed.
        lower = HashEmbed(
            nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout, seed=10
        )
        prefix = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(PREFIX),
            dropout=dropout,
            seed=11,
        )
        suffix = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(SUFFIX),
            dropout=dropout,
            seed=12,
        )
        shape = HashEmbed(
            nO=width // 2,
            nV=embed_size,
            column=cols.index(SHAPE),
            dropout=dropout,
            seed=13,
        )
        # Input width of the mixing Maxout = summed widths of the four
        # concatenated embeddings.
        width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape])
        # Extract the attribute columns, then embed and mix them. uniqued()
        # is keyed on the ORTH column.
        trained_vectors = FeatureExtractor(cols) >> with_array(
            uniqued(
                (lower | prefix | suffix | shape)
                >> Maxout(nO=width, nI=width_nI, normalize=True),
                column=cols.index(ORTH),
            )
        )
        if pretrained_vectors:
            # Concatenating static vectors doubles the width fed to the
            # first Maxout of the encoder.
            static_vectors = StaticVectors(width)
            vector_layer = trained_vectors | static_vectors
            vectors_width = width * 2
        else:
            vector_layer = trained_vectors
            vectors_width = width
        # Project back to `width`, then apply conv_depth residual
        # window-convolution layers.
        tok2vec = vector_layer >> with_array(
            Maxout(width, vectors_width, normalize=True)
            >> residual(
                (
                    expand_window(window_size=window_size)
                    >> Maxout(
                        nO=width, nI=width * ((window_size * 2) + 1), normalize=True
                    )
                )
            )
            ** conv_depth,
            pad=conv_depth,
        )
        # Attention-weighted sum over the token vectors, then a residual
        # Maxout and a linear projection to nO outputs.
        cnn_model = (
            tok2vec
            >> list2ragged()
            >> ParametricAttention(width)
            >> reduce_sum()
            >> residual(Maxout(nO=width, nI=width))
            >> Linear(nO=nO, nI=width)
            >> Dropout(0.0)
        )

        linear_model = build_bow_text_classifier(
            nO=nO,
            ngram_size=ngram_size,
            exclusive_classes=exclusive_classes,
            no_output_layer=False,
        )
        # The two sub-models are concatenated, so the output layer sees
        # 2 * nO inputs (left as None while nO is unknown).
        nO_double = nO * 2 if nO else None
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nO_double)
        else:
            output_layer = Linear(nO=nO, nI=nO_double) >> Dropout(0.0) >> Logistic()
        model = (linear_model | cnn_model) >> output_layer
        model.set_ref("tok2vec", tok2vec)
    # NOTE(review): presumably has_dim returns False only when the model has
    # no "nO" dimension at all -- confirm against thinc's Model.has_dim.
    if model.has_dim("nO") is not False:
        model.set_dim("nO", nO)
    # The "output_layer" reference points at the linear sub-model's own
    # output layer, not at the final `output_layer` built above.
    model.set_ref("output_layer", linear_model.get_ref("output_layer"))
    model.attrs["multi_label"] = not exclusive_classes
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.TextCatLowData.v1")
|
@registry.architectures.register("spacy.TextCatLowData.v1")
|
||||||
def build_text_classifier_lowdata(
|
def build_text_classifier_lowdata(
|
||||||
width: int, dropout: Optional[float], nO: Optional[int] = None
|
width: int, dropout: Optional[float], nO: Optional[int] = None
|
||||||
|
|
|
@ -87,28 +87,6 @@ def build_hash_embed_cnn_tok2vec(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# TODO: archive
|
|
||||||
@registry.architectures.register("spacy.Tok2Vec.v1")
def _build_Tok2Vec_model(
    embed: Model[List[Doc], List[Floats2d]],
    encode: Model[List[Floats2d], List[Floats2d]],
) -> Model[List[Doc], List[Floats2d]]:
    """Combine an embedding subnetwork and an encoding subnetwork into a
    single tok2vec model.
    See https://explosion.ai/blog/deep-learning-formula-nlp

    embed (Model[List[Doc], List[Floats2d]]): Maps tokens to
        context-independent word vector representations.
    encode (Model[List[Floats2d], List[Floats2d]]): Adds context to the
        embeddings, e.g. a CNN, BiLSTM or transformer.
    RETURNS (Model[List[Doc], List[Floats2d]]): The chained model, with
        "embed" and "encode" references and the encoder's "nO" dimension.
    """
    # Pad by the encoder's receptive field when it advertises one.
    padding = encode.attrs.get("receptive_field", 0)
    padded_encoder = with_array(encode, pad=padding)
    model = chain(embed, padded_encoder)
    model.set_dim("nO", encode.get_dim("nO"))
    for ref_name, layer in (("embed", embed), ("encode", encode)):
        model.set_ref(ref_name, layer)
    return model
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.Tok2Vec.v2")
|
@registry.architectures.register("spacy.Tok2Vec.v2")
|
||||||
def build_Tok2Vec_model(
|
def build_Tok2Vec_model(
|
||||||
embed: Model[List[Doc], List[Floats2d]],
|
embed: Model[List[Doc], List[Floats2d]],
|
||||||
|
@ -276,39 +254,6 @@ def CharacterEmbed(
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
# TODO: archive
|
|
||||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
def _MaxoutWindowEncoder(
    width: int, window_size: int, maxout_pieces: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Contextual encoder built from stacked residual convolutions with
    maxout activation and layer normalization.

    width (int): Input and output width; the two must match for the residual
        connections to work. Recommended values are between 64 and 300.
    window_size (int): How many words on each side of a token are
        concatenated to form the convolution input. Recommended value is 1.
    maxout_pieces (int): Number of maxout pieces. Recommended values are
        2 or 3.
    depth (int): Number of convolutional layers. Recommended value is 4.
    RETURNS (Model[List[Floats2d], List[Floats2d]]): The stacked encoder,
        with its "receptive_field" attribute set.
    """
    window_layer = expand_window(window_size=window_size)
    # Each token sees itself plus window_size neighbours on either side.
    maxout_layer = Maxout(
        nO=width,
        nI=width * ((window_size * 2) + 1),
        nP=maxout_pieces,
        dropout=0.0,
        normalize=True,
    )
    conv_block = residual(chain(window_layer, maxout_layer))
    encoder = clone(conv_block, depth)
    encoder.set_dim("nO", width)
    # Every stacked layer widens the context by window_size tokens.
    encoder.attrs["receptive_field"] = window_size * depth
    return encoder
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
|
@registry.architectures.register("spacy.MaxoutWindowEncoder.v2")
|
||||||
def MaxoutWindowEncoder(
|
def MaxoutWindowEncoder(
|
||||||
width: int, window_size: int, maxout_pieces: int, depth: int
|
width: int, window_size: int, maxout_pieces: int, depth: int
|
||||||
|
@ -341,30 +286,6 @@ def MaxoutWindowEncoder(
|
||||||
return with_array(model, pad=receptive_field)
|
return with_array(model, pad=receptive_field)
|
||||||
|
|
||||||
|
|
||||||
# TODO: archive
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v1")
def _MishWindowEncoder(
    width: int, window_size: int, depth: int
) -> Model[List[Floats2d], List[Floats2d]]:
    """Contextual encoder built from stacked residual convolutions with
    mish activation and layer normalization.

    width (int): Input and output width; the two must match for the residual
        connections to work. Recommended values are between 64 and 300.
    window_size (int): How many words on each side of a token are
        concatenated to form the convolution input. Recommended value is 1.
    depth (int): Number of convolutional layers. Recommended value is 4.
    RETURNS (Model[List[Floats2d], List[Floats2d]]): The stacked encoder.
    """
    window_layer = expand_window(window_size=window_size)
    # Each token sees itself plus window_size neighbours on either side.
    mish_layer = Mish(
        nO=width, nI=width * ((window_size * 2) + 1), dropout=0.0, normalize=True
    )
    conv_block = residual(chain(window_layer, mish_layer))
    encoder = clone(conv_block, depth)
    encoder.set_dim("nO", width)
    return encoder
|
|
||||||
|
|
||||||
|
|
||||||
@registry.architectures.register("spacy.MishWindowEncoder.v2")
|
@registry.architectures.register("spacy.MishWindowEncoder.v2")
|
||||||
def MishWindowEncoder(
|
def MishWindowEncoder(
|
||||||
width: int, window_size: int, depth: int
|
width: int, window_size: int, depth: int
|
||||||
|
|
|
@ -15,6 +15,7 @@ import numpy.random
|
||||||
import numpy
|
import numpy
|
||||||
import srsly
|
import srsly
|
||||||
import catalogue
|
import catalogue
|
||||||
|
from catalogue import RegistryError, Registry
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||||
|
@ -105,6 +106,52 @@ class registry(thinc.registry):
|
||||||
models = catalogue.create("spacy", "models", entry_points=True)
|
models = catalogue.create("spacy", "models", entry_points=True)
|
||||||
cli = catalogue.create("spacy", "cli", entry_points=True)
|
cli = catalogue.create("spacy", "cli", entry_points=True)
|
||||||
|
|
||||||
|
@classmethod
def get_registry_names(cls) -> List[str]:
    """Return the sorted names of all registries available on this class.

    RETURNS (List[str]): Names of the public (non-underscore) class members
        that are Registry instances.
    """
    return sorted(
        attr_name
        for attr_name, member in inspect.getmembers(cls)
        if not attr_name.startswith("_") and isinstance(member, Registry)
    )
|
||||||
|
|
||||||
|
@classmethod
def get(cls, registry_name: str, func_name: str) -> Callable:
    """Get a registered function from the registry.

    registry_name (str): Name of the registry, i.e. of the attribute on this
        class that holds it.
    func_name (str): Name the function was registered under.
    RETURNS (Callable): The registered function. If a "spacy."-prefixed name
        is not found, the same name under the "spacy-legacy." prefix is
        tried before giving up.
    RAISES (RegistryError): E892 if this class has no attribute called
        registry_name, E893 if the function is found under neither name.
    """
    # We're overwriting this classmethod so we're able to provide more
    # specific error messages and implement a fallback to spacy-legacy.
    if not hasattr(cls, registry_name):
        names = ", ".join(cls.get_registry_names()) or "none"
        raise RegistryError(Errors.E892.format(name=registry_name, available=names))
    reg = getattr(cls, registry_name)
    try:
        func = reg.get(func_name)
    except RegistryError:
        if func_name.startswith("spacy."):
            # count=1: swap only the leading namespace. Without it, a name
            # that contains "spacy." again further on would be mangled and
            # the legacy lookup would miss.
            legacy_name = func_name.replace("spacy.", "spacy-legacy.", 1)
            try:
                return reg.get(legacy_name)
            except RegistryError:
                # Not in spacy-legacy either: fall through to the E893 error.
                pass
        available = ", ".join(sorted(reg.get_all().keys())) or "none"
        # "from None" hides the lower-level lookup error so the user only
        # sees the more specific message.
        raise RegistryError(
            Errors.E893.format(
                name=func_name, reg_name=registry_name, available=available
            )
        ) from None
    return func
|
||||||
|
|
||||||
|
@classmethod
def has(cls, registry_name: str, func_name: str) -> bool:
    """Check whether a function is available in a registry.

    registry_name (str): Name of the registry, i.e. of the attribute on this
        class that holds it.
    func_name (str): Name the function was registered under.
    RETURNS (bool): True if func_name is in the registry, or, for names
        starting with "spacy.", if the same name under the "spacy-legacy."
        prefix is. False if this class has no attribute called
        registry_name.
    """
    if not hasattr(cls, registry_name):
        return False
    reg = getattr(cls, registry_name)
    if func_name.startswith("spacy."):
        # count=1: swap only the leading namespace, so a later "spacy."
        # inside the name is left untouched.
        legacy_name = func_name.replace("spacy.", "spacy-legacy.", 1)
        return func_name in reg or legacy_name in reg
    return func_name in reg
|
||||||
|
|
||||||
|
|
||||||
class SimpleFrozenDict(dict):
|
class SimpleFrozenDict(dict):
|
||||||
"""Simplified implementation of a frozen dict, mainly used as default
|
"""Simplified implementation of a frozen dict, mainly used as default
|
||||||
|
|
Loading…
Reference in New Issue
Block a user