Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 01:16:28 +03:00

Put Tok2Vec refactor behind feature flag (#4563)

* Add back pre-2.2.2 tok2vec
* Add simple tok2vec tests
* Reformat
* Fix CharacterEmbed in new tok2vec
* Fix legacy tok2vec
* Resolve circular imports
* Fix test for Python 2

This commit is contained in:
parent 828108a57f
commit e82306937e
spacy/_ml.py:

@@ -25,9 +25,12 @@ from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors, user_warning, Warnings
from . import util
from . import ml as new_ml
from .ml import _legacy_tok2vec


VECTORS_KEY = "spacy_pretrained_vectors"
# Backwards compatibility with <2.2.2
USE_MODEL_REGISTRY_TOK2VEC = False


def cosine(vec1, vec2):
@@ -315,6 +318,9 @@ def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):


def Tok2Vec(width, embed_size, **kwargs):
    if not USE_MODEL_REGISTRY_TOK2VEC:
        # Preserve the pre-2.2.2 tok2vec for backwards compatibility
        return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
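Since USE_MODEL_REGISTRY_TOK2VEC is plain module-level state, the refactored path can be exercised without editing the source. A minimal sketch, assuming this patch on spaCy ~2.2.2 (only names from the hunks above are used; the widths are arbitrary):

from spacy import _ml
from spacy.vocab import Vocab
from spacy.tokens import Doc

docs = [Doc(Vocab(), words=["hello", "world"])]

# Default path: the legacy, pre-2.2.2 tok2vec layer.
legacy = _ml.Tok2Vec(8, 100)
vectors, backprop = legacy.begin_update(docs)
assert vectors[0].shape == (2, 8)  # one row per token, `width` columns

# Opt in to the registry-based refactor by flipping the flag, then
# restore the default so other callers still get the legacy layer.
_ml.USE_MODEL_REGISTRY_TOK2VEC = True
try:
    refactored = _ml.Tok2Vec(8, 100)
finally:
    _ml.USE_MODEL_REGISTRY_TOK2VEC = False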
spacy/ml/_legacy_tok2vec.py (new file, 131 lines):

@@ -0,0 +1,131 @@
# coding: utf8
from __future__ import unicode_literals

from thinc.v2v import Model, Maxout
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual
from thinc.misc import LayerNorm as LN
from thinc.misc import FeatureExtracter
from thinc.api import layerize, chain, clone, concatenate, with_flatten
from thinc.api import uniqued, wrap, noop

from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE


def Tok2Vec(width, embed_size, **kwargs):
    # Circular imports :(
    from .._ml import CharacterEmbed
    from .._ml import PyTorchBiLSTM

    pretrained_vectors = kwargs.get("pretrained_vectors", None)
    cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
    subword_features = kwargs.get("subword_features", True)
    char_embed = kwargs.get("char_embed", False)
    if char_embed:
        subword_features = False
    conv_depth = kwargs.get("conv_depth", 4)
    bilstm_depth = kwargs.get("bilstm_depth", 0)
    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
    with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
        if subword_features:
            prefix = HashEmbed(
                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
            )
            suffix = HashEmbed(
                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
            )
            shape = HashEmbed(
                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
            )
        else:
            prefix, suffix, shape = (None, None, None)
        if pretrained_vectors is not None:
            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))

            if subword_features:
                embed = uniqued(
                    (glove | norm | prefix | suffix | shape)
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
                    column=cols.index(ORTH),
                )
        elif subword_features:
            embed = uniqued(
                (norm | prefix | suffix | shape)
                >> LN(Maxout(width, width * 4, pieces=3)),
                column=cols.index(ORTH),
            )
        elif char_embed:
            embed = concatenate_lists(
                CharacterEmbed(nM=64, nC=8),
                FeatureExtracter(cols) >> with_flatten(norm),
            )
            reduce_dimensions = LN(
                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
            )
        else:
            embed = norm

        convolution = Residual(
            ExtractWindow(nW=1)
            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
        )
        if char_embed:
            tok2vec = embed >> with_flatten(
                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
            )
        else:
            tok2vec = FeatureExtracter(cols) >> with_flatten(
                embed >> convolution ** conv_depth, pad=conv_depth
            )

        if bilstm_depth >= 1:
            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
        tok2vec.nO = width
        tok2vec.embed = embed
    return tok2vec


@layerize
def flatten(seqs, drop=0.0):
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths, pad=0)

    X = ops.flatten(seqs, pad=0)
    return X, finish_update


def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return noop()
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model
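As a quick usage sketch of the file above (the shapes mirror the new tests added below, so nothing here goes beyond what the diff itself asserts):

from spacy.ml._legacy_tok2vec import Tok2Vec
from spacy.vocab import Vocab
from spacy.tokens import Doc

# char_embed=True takes the CharacterEmbed + reduce_dimensions branch
# above; it also disables the hashed prefix/suffix/shape subword features.
model = Tok2Vec(8, 100, char_embed=True)
docs = [Doc(Vocab(), words=["a", "b", "c"])]
vectors, backprop = model.begin_update(docs)
assert vectors[0].shape == (3, 8)  # (len(doc), width)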
spacy/ml/tok2vec.py:

@@ -6,7 +6,6 @@ from thinc.v2v import Maxout, Model
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual, LayerNorm, FeatureExtracter

from ..util import make_layer, register_architecture
from ._wire import concatenate_lists
@@ -72,19 +71,20 @@ def MultiHashEmbed(config):
        )
    elif config["@pretrained_vectors"]:
        mix._layers[0].nI = width * 2
-       embed = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
+       layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
    else:
-       embed = norm
+       layer = norm
    layer.cfg = config
    return layer


@register_architecture("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
    from .. import _ml
    width = config["width"]
    chars = config["chars"]

-   chr_embed = CharacterEmbed(nM=width, nC=chars)
+   chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars)
    other_tables = make_layer(config["@embed_features"])
    mix = make_layer(config["@mix"])
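Two fixes in this hunk are easy to miss in the rename noise. In MultiHashEmbed, renaming embed to layer makes the @pretrained_vectors and fallback branches consistent with the trailing layer.cfg = config / return layer. In CharacterEmbed, the bare name CharacterEmbed resolves to the registry function itself, so the old chr_embed = CharacterEmbed(nM=width, nC=chars) would have called the architecture function recursively instead of building the model; delegating to _ml.CharacterEmbedModel (imported locally to dodge the circular import) avoids that.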
||||
|
@ -128,6 +128,7 @@ def PretrainedVectors(config):
|
|||
return StaticVectors(config["vectors_name"], config["width"], config["column"])
|
||||
|
||||
|
||||
|
||||
@register_architecture("spacy.TorchBiLSTMEncoder.v1")
|
||||
def TorchBiLSTMEncoder(config):
|
||||
import torch.nn
|
||||
|
@ -142,6 +143,9 @@ def TorchBiLSTMEncoder(config):
|
|||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
_EXAMPLE_CONFIG = {
|
||||
"@doc2feats": {
|
||||
"arch": "Doc2Feats",
|
||||
|
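For orientation, the registered architectures above all consume plain dict configs via make_layer. A minimal sketch of such a config follows; "width", "chars", "@embed_features" and "@mix" are keys actually read in the hunks above, while the nested arch names and their empty placeholder configs are illustrative assumptions following the _EXAMPLE_CONFIG pattern, not something this diff specifies:

from spacy.util import make_layer  # same import as in the diff

# Hypothetical config for the registered CharacterEmbed architecture.
char_embed_config = {
    "arch": "spacy.CharacterEmbed.v1",
    "config": {
        "width": 96,
        "chars": 4,
        # Sub-layers are themselves registry configs, built via make_layer:
        "@embed_features": {"arch": "spacy.MultiHashEmbed.v1", "config": {}},
        "@mix": {"arch": "spacy.LayerNormalizedMaxout.v1", "config": {}},
    },
}
# layer = make_layer(char_embed_config)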
spacy/tests/test_tok2vec.py (new file, 66 lines):

@@ -0,0 +1,66 @@
# coding: utf-8
from __future__ import unicode_literals

import pytest

from spacy._ml import Tok2Vec
from spacy.vocab import Vocab
from spacy.tokens import Doc
from spacy.compat import unicode_


def get_batch(batch_size):
    vocab = Vocab()
    docs = []
    start = 0
    for size in range(1, batch_size + 1):
        # Make the words numbers, so that they're distinct
        # across the batch, and easy to track.
        numbers = [unicode_(i) for i in range(start, start + size)]
        docs.append(Doc(vocab, words=numbers))
        start += size
    return docs


# This fails in Thinc v7.3.1. Need to push patch
@pytest.mark.xfail
def test_empty_doc():
    width = 128
    embed_size = 2000
    vocab = Vocab()
    doc = Doc(vocab, words=[])
    tok2vec = Tok2Vec(width, embed_size)
    vectors, backprop = tok2vec.begin_update([doc])
    assert len(vectors) == 1
    assert vectors[0].shape == (0, width)


@pytest.mark.parametrize(
    "batch_size,width,embed_size", [[1, 128, 2000], [2, 128, 2000], [3, 8, 63]]
)
def test_tok2vec_batch_sizes(batch_size, width, embed_size):
    batch = get_batch(batch_size)
    tok2vec = Tok2Vec(width, embed_size)
    vectors, backprop = tok2vec.begin_update(batch)
    assert len(vectors) == len(batch)
    for doc_vec, doc in zip(vectors, batch):
        assert doc_vec.shape == (len(doc), width)


@pytest.mark.parametrize(
    "tok2vec_config",
    [
        {"width": 8, "embed_size": 100, "char_embed": False},
        {"width": 8, "embed_size": 100, "char_embed": True},
        {"width": 8, "embed_size": 100, "conv_depth": 6},
        {"width": 8, "embed_size": 100, "conv_depth": 6},
        {"width": 8, "embed_size": 100, "subword_features": False},
    ],
)
def test_tok2vec_configs(tok2vec_config):
    docs = get_batch(3)
    tok2vec = Tok2Vec(**tok2vec_config)
    vectors, backprop = tok2vec.begin_update(docs)
    assert len(vectors) == len(docs)
    assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
    backprop(vectors)
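With the patch applied, the new tests run as usual via python -m pytest spacy/tests/test_tok2vec.py; note that test_empty_doc is expected to fail (hence the xfail marker) until the Thinc fix mentioned in its comment is released.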