mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
Put Tok2Vec refactor behind feature flag (#4563)
* Add back pre-2.2.2 tok2vec * Add simple tok2vec tests * Add simple tok2vec tests * Reformat * Fix CharacterEmbed in new tok2vec * Fix legacy tok2vec * Resolve circular imports * Fix test for Python 2
This commit is contained in:
parent
828108a57f
commit
e82306937e
|
@ -25,9 +25,12 @@ from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
from .errors import Errors, user_warning, Warnings
|
from .errors import Errors, user_warning, Warnings
|
||||||
from . import util
|
from . import util
|
||||||
from . import ml as new_ml
|
from . import ml as new_ml
|
||||||
|
from .ml import _legacy_tok2vec
|
||||||
|
|
||||||
|
|
||||||
VECTORS_KEY = "spacy_pretrained_vectors"
|
VECTORS_KEY = "spacy_pretrained_vectors"
|
||||||
|
# Backwards compatibility with <2.2.2
|
||||||
|
USE_MODEL_REGISTRY_TOK2VEC = False
|
||||||
|
|
||||||
|
|
||||||
def cosine(vec1, vec2):
|
def cosine(vec1, vec2):
|
||||||
|
@ -315,6 +318,9 @@ def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
|
||||||
|
|
||||||
|
|
||||||
def Tok2Vec(width, embed_size, **kwargs):
|
def Tok2Vec(width, embed_size, **kwargs):
|
||||||
|
if not USE_MODEL_REGISTRY_TOK2VEC:
|
||||||
|
# Preserve prior tok2vec for backwards compat, in v2.2.2
|
||||||
|
return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
|
||||||
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||||
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
|
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
|
||||||
subword_features = kwargs.get("subword_features", True)
|
subword_features = kwargs.get("subword_features", True)
|
||||||
|
|
131
spacy/ml/_legacy_tok2vec.py
Normal file
131
spacy/ml/_legacy_tok2vec.py
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
from thinc.v2v import Model, Maxout
|
||||||
|
from thinc.i2v import HashEmbed, StaticVectors
|
||||||
|
from thinc.t2t import ExtractWindow
|
||||||
|
from thinc.misc import Residual
|
||||||
|
from thinc.misc import LayerNorm as LN
|
||||||
|
from thinc.misc import FeatureExtracter
|
||||||
|
from thinc.api import layerize, chain, clone, concatenate, with_flatten
|
||||||
|
from thinc.api import uniqued, wrap, noop
|
||||||
|
|
||||||
|
from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
|
||||||
|
|
||||||
|
|
||||||
|
def Tok2Vec(width, embed_size, **kwargs):
    """Construct the tok2vec model the way it was built before v2.2.2.

    width: Output width per token. Also used as the width of every embedding
        table and of the convolution layers.
    embed_size: Number of rows in the NORM hash-embedding table. The PREFIX,
        SUFFIX and SHAPE tables each get `embed_size // 2` rows.
    **kwargs: Optional settings, read with these defaults:
        pretrained_vectors (None): handed to StaticVectors when not None.
        cnn_maxout_pieces (3): maxout pieces for the convolution layers and
            for the character-embedding reduction layer.
        subword_features (True): add PREFIX/SUFFIX/SHAPE tables. Forced to
            False whenever `char_embed` is truthy.
        char_embed (False): combine CharacterEmbed(nM=64, nC=8) with NORM.
        conv_depth (4): number of cloned residual convolution layers; also
            passed as `pad` to `with_flatten`.
        bilstm_depth (0): append a PyTorchBiLSTM when >= 1.
    RETURNS: The composed model, with `nO` set to `width` and the embedding
        layer attached as `.embed`.

    NOTE(review): when `char_embed` is truthy and `pretrained_vectors` is
    also given, the static-vectors branch is taken, `reduce_dimensions` is
    never assigned, and the later `char_embed` branch fails on it. This looks
    unintended; confirm before relying on that combination.
    """
    # Deferred to call time because of circular imports with `.._ml`.
    from .._ml import CharacterEmbed
    from .._ml import PyTorchBiLSTM

    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    char_embed = kwargs.get("char_embed", False)
    # Character embeddings switch the subword hash tables off.
    subword_features = False if char_embed else kwargs.get("subword_features", True)
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    orth_col = cols.index(ORTH)
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        prefix = suffix = shape = None
        if subword_features:
            half_size = embed_size // 2
            prefix = HashEmbed(
                width, half_size, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, half_size, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, half_size, column=cols.index(SHAPE), name="embed_shape"
            )

        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
            if subword_features:
                tables = glove | norm | prefix | suffix | shape
                n_tables = 5
            else:
                tables = glove | norm
                n_tables = 2
            # The mixing layer takes one `width`-sized slice per table.
            mix = LN(Maxout(width, width * n_tables, pieces=3))
            embed = uniqued(tables >> mix, column=orth_col)
        elif subword_features:
            tables = norm | prefix | suffix | shape
            mix = LN(Maxout(width, width * 4, pieces=3))
            embed = uniqued(tables >> mix, column=orth_col)
        elif char_embed:
            char_width, n_chars = 64, 8
            chars = CharacterEmbed(nM=char_width, nC=n_chars)
            norm_per_doc = FeatureExtracter(cols) >> with_flatten(norm)
            embed = concatenate_lists(chars, norm_per_doc)
            reduce_dimensions = LN(
                Maxout(width, char_width * n_chars + width, pieces=cnn_maxout_pieces)
            )
        else:
            embed = norm

        window_mix = LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        convolution = Residual(ExtractWindow(nW=1) >> window_mix)
        if char_embed:
            # The character path consumes docs directly, so there is no
            # FeatureExtracter in front of it.
            per_token = reduce_dimensions >> convolution ** conv_depth
            tok2vec = embed >> with_flatten(per_token, pad=conv_depth)
        else:
            extract = FeatureExtracter(cols)
            per_token = embed >> convolution ** conv_depth
            tok2vec = extract >> with_flatten(per_token, pad=conv_depth)

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec
|
||||||
|
|
||||||
|
|
||||||
|
@layerize
def flatten(seqs, drop=0.0):
    """Join a batch of sequences into one flat array.

    seqs: The sequences to join. The `len()` of each is recorded so that a
        gradient can later be split back into per-sequence pieces.
    drop: Accepted as part of the signature; not used in this function.
    RETURNS: A `(flat, backprop)` pair. `flat` is the result of
        `Model.ops.flatten(seqs, pad=0)`; `backprop` splits its argument with
        `Model.ops.unflatten` using the recorded lengths.
    """
    backend = Model.ops
    seq_lengths = backend.asarray(list(map(len, seqs)), dtype="i")
    flat = backend.flatten(seqs, pad=0)

    def backprop_flatten(d_flat, sgd=None):
        # Nothing to update here: just undo the concatenation.
        return backend.unflatten(d_flat, seq_lengths, pad=0)

    return flat, backprop_flatten
|
||||||
|
|
||||||
|
|
||||||
|
def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Concatenate the outputs of several layers that each return lists.

    Every layer is chained with `flatten`, the flattened outputs are joined
    with `concatenate`, and the joined array is split back up using the
    lengths of the input items.

    *layers: The layers to combine. With no layers, `noop()` is returned.
    drop_factor (keyword, default 1.0): Multiplier applied to the dropout
        rate before it is passed on, unless the rate is None.
    RETURNS: A model built with `wrap` around the forward function and the
        concatenated layer.
    """
    if len(layers) == 0:
        return noop()
    scale = kwargs.get("drop_factor", 1.0)
    backend = layers[0].ops
    concat = concatenate(*[chain(layer, flatten) for layer in layers])

    def concatenate_lists_fwd(Xs, drop=0.0):
        rate = drop if drop is None else drop * scale
        # Lengths come from the inputs, recorded before the forward pass.
        sizes = backend.asarray([len(X) for X in Xs], dtype="i")
        flat_out, backprop_flat = concat.begin_update(Xs, drop=rate)
        outputs = backend.unflatten(flat_out, sizes)

        def concatenate_lists_bwd(d_ys, sgd=None):
            flat_grad = backend.flatten(d_ys)
            return backprop_flat(flat_grad, sgd=sgd)

        return outputs, concatenate_lists_bwd

    return wrap(concatenate_lists_fwd, concat)
|
|
@ -6,7 +6,6 @@ from thinc.v2v import Maxout, Model
|
||||||
from thinc.i2v import HashEmbed, StaticVectors
|
from thinc.i2v import HashEmbed, StaticVectors
|
||||||
from thinc.t2t import ExtractWindow
|
from thinc.t2t import ExtractWindow
|
||||||
from thinc.misc import Residual, LayerNorm, FeatureExtracter
|
from thinc.misc import Residual, LayerNorm, FeatureExtracter
|
||||||
|
|
||||||
from ..util import make_layer, register_architecture
|
from ..util import make_layer, register_architecture
|
||||||
from ._wire import concatenate_lists
|
from ._wire import concatenate_lists
|
||||||
|
|
||||||
|
@ -72,19 +71,20 @@ def MultiHashEmbed(config):
|
||||||
)
|
)
|
||||||
elif config["@pretrained_vectors"]:
|
elif config["@pretrained_vectors"]:
|
||||||
mix._layers[0].nI = width * 2
|
mix._layers[0].nI = width * 2
|
||||||
embed = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
|
layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
|
||||||
else:
|
else:
|
||||||
embed = norm
|
layer = norm
|
||||||
layer.cfg = config
|
layer.cfg = config
|
||||||
return layer
|
return layer
|
||||||
|
|
||||||
|
|
||||||
@register_architecture("spacy.CharacterEmbed.v1")
|
@register_architecture("spacy.CharacterEmbed.v1")
|
||||||
def CharacterEmbed(config):
|
def CharacterEmbed(config):
|
||||||
|
from .. import _ml
|
||||||
width = config["width"]
|
width = config["width"]
|
||||||
chars = config["chars"]
|
chars = config["chars"]
|
||||||
|
|
||||||
chr_embed = CharacterEmbed(nM=width, nC=chars)
|
chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars)
|
||||||
other_tables = make_layer(config["@embed_features"])
|
other_tables = make_layer(config["@embed_features"])
|
||||||
mix = make_layer(config["@mix"])
|
mix = make_layer(config["@mix"])
|
||||||
|
|
||||||
|
@ -128,6 +128,7 @@ def PretrainedVectors(config):
|
||||||
return StaticVectors(config["vectors_name"], config["width"], config["column"])
|
return StaticVectors(config["vectors_name"], config["width"], config["column"])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@register_architecture("spacy.TorchBiLSTMEncoder.v1")
|
@register_architecture("spacy.TorchBiLSTMEncoder.v1")
|
||||||
def TorchBiLSTMEncoder(config):
|
def TorchBiLSTMEncoder(config):
|
||||||
import torch.nn
|
import torch.nn
|
||||||
|
@ -142,6 +143,9 @@ def TorchBiLSTMEncoder(config):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
_EXAMPLE_CONFIG = {
|
_EXAMPLE_CONFIG = {
|
||||||
"@doc2feats": {
|
"@doc2feats": {
|
||||||
"arch": "Doc2Feats",
|
"arch": "Doc2Feats",
|
||||||
|
|
66
spacy/tests/test_tok2vec.py
Normal file
66
spacy/tests/test_tok2vec.py
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
# coding: utf-8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from spacy._ml import Tok2Vec
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tokens import Doc
|
||||||
|
from spacy.compat import unicode_
|
||||||
|
|
||||||
|
|
||||||
|
def get_batch(batch_size):
    """Create `batch_size` docs of lengths 1..batch_size on one shared Vocab.

    The words are consecutive integers rendered as text, so every token is
    distinct across the batch and easy to track.
    """
    vocab = Vocab()
    batch = []
    offset = 0
    for n_words in range(1, batch_size + 1):
        words = [unicode_(number) for number in range(offset, offset + n_words)]
        batch.append(Doc(vocab, words=words))
        offset += n_words
    return batch
|
||||||
|
|
||||||
|
|
||||||
|
# This fails in Thinc v7.3.1. Need to push patch
@pytest.mark.xfail
def test_empty_doc():
    """A single doc with no words should yield one output of shape (0, width)."""
    width = 128
    embed_size = 2000
    empty_doc = Doc(Vocab(), words=[])
    model = Tok2Vec(width, embed_size)
    outputs, _ = model.begin_update([empty_doc])
    assert len(outputs) == 1
    assert outputs[0].shape == (0, width)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "batch_size,width,embed_size", [(1, 128, 2000), (2, 128, 2000), (3, 8, 63)]
)
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    """Each doc in the batch gets one output array of shape (len(doc), width)."""
    docs = get_batch(batch_size)
    model = Tok2Vec(width, embed_size)
    outputs, _ = model.begin_update(docs)
    assert len(outputs) == len(docs)
    for output, doc in zip(outputs, docs):
        assert output.shape == (len(doc), width)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "tok2vec_config",
    [
        {"width": 8, "embed_size": 100, "char_embed": False},
        {"width": 8, "embed_size": 100, "char_embed": True},
        # This case was previously listed twice; one copy is enough.
        {"width": 8, "embed_size": 100, "conv_depth": 6},
        {"width": 8, "embed_size": 100, "subword_features": False},
    ],
)
def test_tok2vec_configs(tok2vec_config):
    """Run a forward and backward pass of Tok2Vec for each configuration.

    Checks that one output is produced per doc and that the first output has
    shape (len(doc), width), then calls the backprop callback with the
    outputs to make sure the backward pass runs.
    """
    docs = get_batch(3)
    tok2vec = Tok2Vec(**tok2vec_config)
    vectors, backprop = tok2vec.begin_update(docs)
    assert len(vectors) == len(docs)
    assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
    backprop(vectors)
|
Loading…
Reference in New Issue
Block a user