Refactor Tok2Vec to use architecture registry (#4518)

* Add refactored tok2vec, using register_architecture

* Refactor Tok2Vec

* Fix ml

* Fix new tok2vec

* Move make_layer to util

* Add wire

* Fix missing import
Matthew Honnibal 2019-10-25 22:28:20 +02:00 committed by GitHub
parent 99e309bb19
commit 406eb95a47
6 changed files with 298 additions and 76 deletions
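At its core, this PR swaps hard-coded model construction for a small registry: architectures are registered under a versioned string name with register_architecture and rebuilt from plain config dicts with make_layer (both live in spacy/util.py, see the last hunk below). A minimal sketch of the pattern; the "example.Affine.v1" name and its config keys are hypothetical and only for illustration:

from thinc.v2v import Affine
from spacy.util import register_architecture, make_layer

@register_architecture("example.Affine.v1")  # hypothetical name, not part of this PR
def affine_from_config(config):
    # An architecture function takes a plain config dict and returns a thinc model.
    return Affine(config["nO"], config["nI"])

# make_layer() looks up config["arch"] in the registry and calls it with config["config"].
layer = make_layer({"arch": "example.Affine.v1", "config": {"nO": 16, "nI": 8}})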

spacy/_ml.py

@@ -3,16 +3,14 @@ from __future__ import unicode_literals
import numpy
from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool, mean_pool
from thinc.misc import Residual
from thinc.i2v import HashEmbed
from thinc.misc import Residual, FeatureExtracter
from thinc.misc import LayerNorm as LN
from thinc.misc import FeatureExtracter
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module, copy_array
@@ -26,12 +24,8 @@ import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors, user_warning, Warnings
from . import util
from . import ml as new_ml
try:
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
except ImportError:
torch = None
VECTORS_KEY = "spacy_pretrained_vectors"
@@ -310,6 +304,9 @@ def link_vectors_to_models(vocab):
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
import torch.nn
from thinc.api import with_square_sequences
from thinc.extra.wrappers import PyTorchWrapperRNN
if depth == 0:
return layerize(noop())
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
@@ -321,77 +318,91 @@ def Tok2Vec(width, embed_size, **kwargs):
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True)
char_embed = kwargs.get("char_embed", False)
if char_embed:
subword_features = False
conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
with Model.define_operators(
{">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
):
norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
if subword_features:
prefix = HashEmbed(
width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
)
suffix = HashEmbed(
width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
)
shape = HashEmbed(
width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
)
else:
prefix, suffix, shape = (None, None, None)
if pretrained_vectors is not None:
glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
if subword_features:
embed = uniqued(
(glove | norm | prefix | suffix | shape)
>> LN(Maxout(width, width * 5, pieces=3)),
column=cols.index(ORTH),
)
else:
embed = uniqued(
(glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
column=cols.index(ORTH),
)
elif subword_features:
embed = uniqued(
(norm | prefix | suffix | shape)
>> LN(Maxout(width, width * 4, pieces=3)),
column=cols.index(ORTH),
)
elif char_embed:
embed = concatenate_lists(
CharacterEmbed(nM=64, nC=8),
FeatureExtracter(cols) >> with_flatten(norm),
)
reduce_dimensions = LN(
Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
)
else:
embed = norm
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
convolution = Residual(
ExtractWindow(nW=1)
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
)
doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
if char_embed:
tok2vec = embed >> with_flatten(
reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
)
embed_cfg = {
"arch": "spacy.CharacterEmbed.v1",
"config": {
"width": 64,
"chars": 6,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {
"width": width,
"pieces": 3
}
},
"@embed_features": None
}
}
else:
tok2vec = FeatureExtracter(cols) >> with_flatten(
embed >> convolution ** conv_depth, pad=conv_depth
)
embed_cfg = {
"arch": "spacy.MultiHashEmbed.v1",
"config": {
"width": width,
"rows": embed_size,
"columns": cols,
"use_subwords": subword_features,
"@pretrained_vectors": None,
"@mix": {
"arch": "spacy.LayerNormalizedMaxout.v1",
"config": {
"width": width,
"pieces": 3
}
},
}
}
if pretrained_vectors:
embed_cfg["config"]["@pretrained_vectors"] = {
"arch": "spacy.PretrainedVectors.v1",
"config": {
"vectors_name": pretrained_vectors,
"width": width,
"column": cols.index(ID)
}
}
cnn_cfg = {
"arch": "spacy.MaxoutWindowEncoder.v1",
"config": {
"width": width,
"window_size": 1,
"pieces": cnn_maxout_pieces,
"depth": conv_depth
}
}
if bilstm_depth >= 1:
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
tok2vec.nO = width
tok2vec.embed = embed
return tok2vec
bilstm_cfg = {
"arch": "spacy.TorchBiLSTMEncoder.v1",
"config": {
"width": width,
"depth": bilstm_depth,
}
}
if conv_depth == 0 and bilstm_depth == 0:
encode_cfg = {}
elif conv_depth >= 1 and bilstm_depth >= 1:
encode_cfg = {
"arch": "thinc.FeedForward.v1",
"config": {
"children": [cnn_cfg, bilstm_cfg]
}
}
elif conv_depth >= 1:
encode_cfg = cnn_cfg
else:
encode_cfg = bilstm_cfg
config = {
"@doc2feats": doc2feats_cfg,
"@embed": embed_cfg,
"@encode": encode_cfg
}
return new_ml.Tok2Vec(config)
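The signature of the public Tok2Vec(width, embed_size, **kwargs) helper is unchanged; it now just translates its keyword arguments into the nested config above and delegates to the registered architectures. A hedged usage sketch with illustrative values:

from spacy._ml import Tok2Vec

# Same call as before this refactor; internally it builds the config dict shown
# above and returns new_ml.Tok2Vec(config).
tok2vec = Tok2Vec(96, 2000, conv_depth=4, bilstm_depth=0, subword_features=True)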
def reapply(layer, n_times):

spacy/ml/__init__.py (new file, +1)

@@ -0,0 +1 @@
from .tok2vec import Tok2Vec

spacy/ml/_wire.py (new file, +42)

@@ -0,0 +1,42 @@
from __future__ import unicode_literals
from thinc.api import layerize, wrap, noop, chain, concatenate
from thinc.v2v import Model
def concatenate_lists(*layers, **kwargs): # pragma: no cover
"""Compose two or more models `f`, `g`, etc, such that their outputs are
concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
"""
if not layers:
return layerize(noop())
drop_factor = kwargs.get("drop_factor", 1.0)
ops = layers[0].ops
layers = [chain(layer, flatten) for layer in layers]
concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
ys = ops.unflatten(flat_y, lengths)
def concatenate_lists_bwd(d_ys, sgd=None):
return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
return ys, concatenate_lists_bwd
model = wrap(concatenate_lists_fwd, concat)
return model
@layerize
def flatten(seqs, drop=0.0):
ops = Model.ops
lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
def finish_update(d_X, sgd=None):
return ops.unflatten(d_X, lengths, pad=0)
X = ops.flatten(seqs, pad=0)
return X, finish_update
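concatenate_lists is the sequence-level counterpart of thinc's concatenate: each wrapped layer maps a list of arrays to a list of arrays, the per-sequence outputs are flattened, hstacked feature-wise, and unflattened back to one array per input sequence. A small sketch, assuming thinc 7's with_flatten and Affine behave as used elsewhere in this diff:

import numpy
from thinc.api import with_flatten
from thinc.v2v import Affine
from spacy.ml._wire import concatenate_lists

# Two list-to-list layers with output widths 4 and 2; concatenating them should
# give one (n_tokens, 6) array per input sequence.
model = concatenate_lists(with_flatten(Affine(4, 3)), with_flatten(Affine(2, 3)))
Xs = [numpy.zeros((5, 3), dtype="f"), numpy.zeros((2, 3), dtype="f")]
Ys, backprop = model.begin_update(Xs)
assert [Y.shape for Y in Ys] == [(5, 6), (2, 6)]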

spacy/ml/common.py (new file, +22)

@@ -0,0 +1,22 @@
from __future__ import unicode_literals
from thinc.api import chain
from thinc.v2v import Maxout
from thinc.misc import LayerNorm
from ..util import register_architecture, make_layer
@register_architecture("thinc.FeedForward.v1")
def FeedForward(config):
layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
model = chain(*layers)
model.cfg = config
return model
@register_architecture("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(config):
width = config["width"]
pieces = config["pieces"]
layer = chain(Maxout(width, pieces=pieces), LayerNorm(nO=width))
layer.nO = width
return layer
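LayerNormalizedMaxout intentionally leaves the input width unset so the caller can fill in nI once the concatenated embedding width is known (as MultiHashEmbed does in tok2vec.py), while FeedForward simply chains whatever child configs appear under its "layers" key. A construction-only sketch, assuming the registrations in this module have been imported:

import spacy.ml  # importing runs the register_architecture decorators above
from spacy.util import make_layer

maxout_cfg = {"arch": "spacy.LayerNormalizedMaxout.v1", "config": {"width": 96, "pieces": 3}}
mix = make_layer(maxout_cfg)  # mix.nO == 96; nI is wired up later by the caller
ff = make_layer({
    "arch": "thinc.FeedForward.v1",  # chains its children in order
    "config": {"layers": [maxout_cfg, maxout_cfg]},
})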

spacy/ml/tok2vec.py (new file, +141)

@@ -0,0 +1,141 @@
from __future__ import unicode_literals
from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
from thinc.api import noop, with_square_sequences
from thinc.v2v import Maxout
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual, LayerNorm, FeatureExtracter
from ..util import make_layer, register_architecture
from ._wire import concatenate_lists
from .common import *
@register_architecture("spacy.Tok2Vec.v1")
def Tok2Vec(config):
doc2feats = make_layer(config["@doc2feats"])
embed = make_layer(config["@embed"])
encode = make_layer(config["@encode"])
tok2vec = chain(doc2feats, with_flatten(chain(embed, encode)))
tok2vec.cfg = config
tok2vec.nO = encode.nO
tok2vec.embed = embed
tok2vec.encode = encode
return tok2vec
@register_architecture("spacy.Doc2Feats.v1")
def Doc2Feats(config):
columns = config["columns"]
return FeatureExtracter(columns)
@register_architecture("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(config):
cols = config["columns"]
width = config["width"]
rows = config["rows"]
tables = [HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")]
if config["use_subwords"]:
for feature in ["PREFIX", "SUFFIX", "SHAPE"]:
tables.append(
HashEmbed(
width,
rows // 2,
column=cols.index(feature),
name="embed_%s" % feature.lower(),
)
)
if config.get("@pretrained_vectors"):
tables.append(make_layer(config["@pretrained_vectors"]))
mix = make_layer(config["@mix"])
# This is a pretty ugly hack. Not sure what the best solution should be.
mix._layers[0].nI = sum(table.nO for table in tables)
layer = uniqued(chain(concatenate(*tables), mix), column=cols.index("ORTH"))
layer.cfg = config
return layer
@register_architecture("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
width = config["width"]
chars = config["chars"]
chr_embed = CharacterEmbed(nM=width, nC=chars)
other_tables = make_layer(config["@embed_features"])
mix = make_layer(config["@mix"])
model = chain(concatenate_lists(chr_embed, other_tables), mix)
model.cfg = config
return model
@register_architecture("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(config):
nO = config["width"]
nW = config["window_size"]
nP = config["pieces"]
depth = config["depth"]
cnn = chain(
ExtractWindow(nW=nW),
Maxout(nO, nO * ((nW * 2) + 1), pieces=nP),
LayerNorm(nO=nO),
)
model = clone(Residual(cnn), depth)
model.nO = nO
return model
@register_architecture("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
return StaticVectors(config["vectors_name"], config["width"], config["column"])
@register_architecture("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
import torch.nn
from thinc.extra.wrappers import PyTorchWrapperRNN
width = config["width"]
depth = config["depth"]
if depth == 0:
return layerize(noop())
return with_square_sequences(
PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
)
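TorchBiLSTMEncoder wraps a torch.nn.LSTM with thinc's PyTorchWrapperRNN and with_square_sequences, so it slots into the same config system as the CNN encoder. A construction-only sketch, assuming PyTorch is installed:

import spacy.ml  # runs the register_architecture decorators in this module
from spacy.util import make_layer

# A 2-layer BiLSTM over 96-dim vectors (each direction gets width // 2 units).
bilstm = make_layer({
    "arch": "spacy.TorchBiLSTMEncoder.v1",
    "config": {"width": 96, "depth": 2},
})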
_EXAMPLE_CONFIG = {
"@doc2feats": {
"arch": "Doc2Feats",
"config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
},
"@embed": {
"arch": "spacy.MultiHashEmbed.v1",
"config": {
"width": 96,
"rows": 2000,
"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
"use_subwords": True,
"@pretrained_vectors": {
"arch": "TransformedStaticVectors",
"config": {
"vectors_name": "en_vectors_web_lg.vectors",
"width": 96,
"column": 0,
},
},
"@mix": {
"arch": "LayerNormalizedMaxout",
"config": {"width": 96, "pieces": 3},
},
},
},
"@encode": {
"arch": "MaxoutWindowEncode",
"config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
},
}
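Putting it together: the dict below mirrors the config that spacy/_ml.py now builds, but written against the versioned names registered in this module so it can be fed straight to make_layer. A construction-only sketch (no pretrained vectors, CNN encoder only):

import spacy.ml.tok2vec  # importing registers the architectures defined above
from spacy.util import make_layer

width = 96
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
config = {
    "arch": "spacy.Tok2Vec.v1",
    "config": {
        "@doc2feats": {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}},
        "@embed": {
            "arch": "spacy.MultiHashEmbed.v1",
            "config": {
                "width": width,
                "rows": 2000,
                "columns": cols,
                "use_subwords": True,
                "@pretrained_vectors": None,
                "@mix": {
                    "arch": "spacy.LayerNormalizedMaxout.v1",
                    "config": {"width": width, "pieces": 3},
                },
            },
        },
        "@encode": {
            # window_size=1 means each maxout block sees a 3-token window, so nI = 3 * width
            "arch": "spacy.MaxoutWindowEncoder.v1",
            "config": {"width": width, "window_size": 1, "pieces": 3, "depth": 4},
        },
    },
}
tok2vec = make_layer(config)  # same as calling Tok2Vec(config["config"]) directly
# The resulting model maps a list of Doc objects to one (n_tokens, width) array per doc.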

spacy/util.py

@@ -142,6 +142,11 @@ def register_architecture(name, arch=None):
return do_registration
def make_layer(arch_config):
arch_func = get_architecture(arch_config["arch"])
return arch_func(arch_config["config"])
def get_architecture(name):
"""Get a model architecture function by name. Raises a KeyError if the
architecture is not found.