Refactor Tok2Vec to use architecture registry (#4518)
* Add refactored tok2vec, using register_architecture
* Refactor Tok2Vec
* Fix ml
* Fix new tok2vec
* Move make_layer to util
* Add wire
* Fix missing import

parent 99e309bb19
commit 406eb95a47

spacy/_ml.py (157 lines changed)
```diff
@@ -3,16 +3,14 @@ from __future__ import unicode_literals
 import numpy
 from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
-from thinc.i2v import HashEmbed, StaticVectors
 from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool, mean_pool
-from thinc.misc import Residual
+from thinc.i2v import HashEmbed
+from thinc.misc import Residual, FeatureExtracter
 from thinc.misc import LayerNorm as LN
-from thinc.misc import FeatureExtracter
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.api import with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
-from thinc.api import with_square_sequences
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module, copy_array
```
```diff
@@ -26,12 +24,8 @@ import thinc.extra.load_nlp
 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
 from .errors import Errors, user_warning, Warnings
 from . import util
+from . import ml as new_ml
 
-try:
-    import torch.nn
-    from thinc.extra.wrappers import PyTorchWrapperRNN
-except ImportError:
-    torch = None
 
 VECTORS_KEY = "spacy_pretrained_vectors"
 
```
```diff
@@ -310,6 +304,9 @@ def link_vectors_to_models(vocab):
 
 
 def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
+    import torch.nn
+    from thinc.api import with_square_sequences
+    from thinc.extra.wrappers import PyTorchWrapperRNN
     if depth == 0:
         return layerize(noop())
     model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
```
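Editor's note: the module-level `try: import torch.nn` guard is gone; the imports now live inside `PyTorchBiLSTM`, so torch stays an optional dependency that is only needed if a BiLSTM layer is actually built. A minimal sketch of the same lazy-import pattern (illustrative, not part of the diff):

```python
def build_torch_bilstm(width, depth):
    # Deferring the import means "torch not installed" only fails here,
    # at the moment a BiLSTM is requested, not at spaCy import time.
    import torch.nn
    return torch.nn.LSTM(width, width // 2, depth, bidirectional=True)
```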
```diff
@@ -321,77 +318,91 @@ def Tok2Vec(width, embed_size, **kwargs):
     cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
     subword_features = kwargs.get("subword_features", True)
     char_embed = kwargs.get("char_embed", False)
-    if char_embed:
-        subword_features = False
     conv_depth = kwargs.get("conv_depth", 4)
     bilstm_depth = kwargs.get("bilstm_depth", 0)
-    cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
-    with Model.define_operators(
-        {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
-    ):
-        norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
-        if subword_features:
-            prefix = HashEmbed(
-                width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
-            )
-            suffix = HashEmbed(
-                width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
-            )
-            shape = HashEmbed(
-                width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
-            )
-        else:
-            prefix, suffix, shape = (None, None, None)
-        if pretrained_vectors is not None:
-            glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
-
-            if subword_features:
-                embed = uniqued(
-                    (glove | norm | prefix | suffix | shape)
-                    >> LN(Maxout(width, width * 5, pieces=3)),
-                    column=cols.index(ORTH),
-                )
-            else:
-                embed = uniqued(
-                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
-                    column=cols.index(ORTH),
-                )
-        elif subword_features:
-            embed = uniqued(
-                (norm | prefix | suffix | shape)
-                >> LN(Maxout(width, width * 4, pieces=3)),
-                column=cols.index(ORTH),
-            )
-        elif char_embed:
-            embed = concatenate_lists(
-                CharacterEmbed(nM=64, nC=8),
-                FeatureExtracter(cols) >> with_flatten(norm),
-            )
-            reduce_dimensions = LN(
-                Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
-            )
-        else:
-            embed = norm
-
-        convolution = Residual(
-            ExtractWindow(nW=1)
-            >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
-        )
-        if char_embed:
-            tok2vec = embed >> with_flatten(
-                reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
-            )
-        else:
-            tok2vec = FeatureExtracter(cols) >> with_flatten(
-                embed >> convolution ** conv_depth, pad=conv_depth
-            )
-
-        if bilstm_depth >= 1:
-            tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
-        # Work around thinc API limitations :(. TODO: Revise in Thinc 7
-        tok2vec.nO = width
-        tok2vec.embed = embed
-    return tok2vec
+    cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+
+    doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
+    if char_embed:
+        embed_cfg = {
+            "arch": "spacy.CharacterEmbed.v1",
+            "config": {
+                "width": 64,
+                "chars": 6,
+                "@mix": {
+                    "arch": "spacy.LayerNormalizedMaxout.v1",
+                    "config": {
+                        "width": width,
+                        "pieces": 3
+                    }
+                },
+                "@embed_features": None
+            }
+        }
+    else:
+        embed_cfg = {
+            "arch": "spacy.MultiHashEmbed.v1",
+            "config": {
+                "width": width,
+                "rows": embed_size,
+                "columns": cols,
+                "use_subwords": subword_features,
+                "@pretrained_vectors": None,
+                "@mix": {
+                    "arch": "spacy.LayerNormalizedMaxout.v1",
+                    "config": {
+                        "width": width,
+                        "pieces": 3
+                    }
+                },
+            }
+        }
+    if pretrained_vectors:
+        embed_cfg["config"]["@pretrained_vectors"] = {
+            "arch": "spacy.PretrainedVectors.v1",
+            "config": {
+                "vectors_name": pretrained_vectors,
+                "width": width,
+                "column": cols.index(ID)
+            }
+        }
+    cnn_cfg = {
+        "arch": "spacy.MaxoutWindowEncoder.v1",
+        "config": {
+            "width": width,
+            "window_size": 1,
+            "pieces": cnn_maxout_pieces,
+            "depth": conv_depth
+        }
+    }
+
+    bilstm_cfg = {
+        "arch": "spacy.TorchBiLSTMEncoder.v1",
+        "config": {
+            "width": width,
+            "depth": bilstm_depth,
+        }
+    }
+    if conv_depth == 0 and bilstm_depth == 0:
+        encode_cfg = {}
+    elif conv_depth >= 1 and bilstm_depth >= 1:
+        encode_cfg = {
+            "arch": "thinc.FeedForward.v1",
+            "config": {
+                "children": [cnn_cfg, bilstm_cfg]
+            }
+        }
+    elif conv_depth >= 1:
+        encode_cfg = cnn_cfg
+    else:
+        encode_cfg = bilstm_cfg
+    config = {
+        "@doc2feats": doc2feats_cfg,
+        "@embed": embed_cfg,
+        "@encode": encode_cfg
+    }
+    return new_ml.Tok2Vec(config)
 
 
 def reapply(layer, n_times):
```
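Editor's note: after this hunk, `Tok2Vec` in `_ml.py` no longer wires thinc layers together; it only assembles nested config dicts and hands them to the registry-driven `new_ml.Tok2Vec`. One hedged observation: `cols` is now a list of strings, so `cols.index(ID)` in the `@pretrained_vectors` branch indexes it with the integer attribute symbol and looks like it would raise ValueError; presumably `cols.index("ID")` was intended. For orientation, a sketch of the config a default call such as `Tok2Vec(96, 2000)` would produce (the concrete numbers are illustrative):

```python
# Sketch of the dict new_ml.Tok2Vec receives for Tok2Vec(96, 2000); keys and
# arch names are from the diff above, the widths are illustrative.
cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
config = {
    "@doc2feats": {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}},
    "@embed": {
        "arch": "spacy.MultiHashEmbed.v1",
        "config": {
            "width": 96,
            "rows": 2000,
            "columns": cols,
            "use_subwords": True,
            "@pretrained_vectors": None,
            "@mix": {
                "arch": "spacy.LayerNormalizedMaxout.v1",
                "config": {"width": 96, "pieces": 3},
            },
        },
    },
    # conv_depth defaults to 4 and bilstm_depth to 0, so @encode is the CNN
    "@encode": {
        "arch": "spacy.MaxoutWindowEncoder.v1",
        "config": {"width": 96, "window_size": 1, "pieces": 3, "depth": 4},
    },
}
```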

spacy/ml/__init__.py (new file, 1 line)
```python
from .tok2vec import Tok2Vec
```

spacy/ml/_wire.py (new file, 42 lines)
```python
from __future__ import unicode_literals
from thinc.api import layerize, wrap, noop, chain, concatenate
from thinc.v2v import Model


def concatenate_lists(*layers, **kwargs):  # pragma: no cover
    """Compose two or more models `f`, `g`, etc, such that their outputs are
    concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
    """
    if not layers:
        return layerize(noop())
    drop_factor = kwargs.get("drop_factor", 1.0)
    ops = layers[0].ops
    layers = [chain(layer, flatten) for layer in layers]
    concat = concatenate(*layers)

    def concatenate_lists_fwd(Xs, drop=0.0):
        if drop is not None:
            drop *= drop_factor
        lengths = ops.asarray([len(X) for X in Xs], dtype="i")
        flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
        ys = ops.unflatten(flat_y, lengths)

        def concatenate_lists_bwd(d_ys, sgd=None):
            return bp_flat_y(ops.flatten(d_ys), sgd=sgd)

        return ys, concatenate_lists_bwd

    model = wrap(concatenate_lists_fwd, concat)
    return model


@layerize
def flatten(seqs, drop=0.0):
    ops = Model.ops
    lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")

    def finish_update(d_X, sgd=None):
        return ops.unflatten(d_X, lengths, pad=0)

    X = ops.flatten(seqs, pad=0)
    return X, finish_update
```
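Editor's note: `concatenate_lists` deals with ragged per-document inputs by flattening them into one contiguous array, running the concatenated layers, and splitting the result back by the saved lengths. A minimal numpy sketch of that forward-pass bookkeeping (plain Python, not thinc; `fake_layer` is a hypothetical stand-in for a real layer):

```python
import numpy

def fake_layer(width):
    # Hypothetical stand-in for a thinc layer: maps (n, d) -> (n, width)
    def forward(X):
        return numpy.ones((X.shape[0], width), dtype="f")
    return forward

def concatenate_lists_fwd(layers, Xs):
    lengths = [len(X) for X in Xs]          # remember per-document lengths
    flat_x = numpy.vstack(Xs)               # ops.flatten(seqs)
    flat_y = numpy.hstack([f(flat_x) for f in layers])  # concatenate(*layers)
    ys, start = [], 0                       # ops.unflatten(flat_y, lengths)
    for length in lengths:
        ys.append(flat_y[start:start + length])
        start += length
    return ys

docs = [numpy.zeros((3, 8), dtype="f"), numpy.zeros((5, 8), dtype="f")]
ys = concatenate_lists_fwd([fake_layer(4), fake_layer(6)], docs)
print([y.shape for y in ys])  # [(3, 10), (5, 10)]
```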

spacy/ml/common.py (new file, 22 lines)
```python
from __future__ import unicode_literals

from thinc.api import chain
from thinc.v2v import Maxout
from thinc.misc import LayerNorm
from ..util import register_architecture, make_layer


@register_architecture("thinc.FeedForward.v1")
def FeedForward(config):
    layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
    model = chain(*layers)
    model.cfg = config
    return model

@register_architecture("spacy.LayerNormalizedMaxout.v1")
def LayerNormalizedMaxout(config):
    width = config["width"]
    pieces = config["pieces"]
    layer = chain(Maxout(width, pieces=pieces), LayerNorm(nO=width))
    layer.nO = width
    return layer
```
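Editor's note: these helpers are resolved by name through `util.make_layer` (see the `spacy/util.py` hunk below). One hedged observation: `FeedForward` reads `config["layers"]`, while the `thinc.FeedForward.v1` config built in `_ml.py` puts the sub-configs under `"children"`, so the combined CNN + BiLSTM path looks like it would raise KeyError until one of the two is renamed. A usage sketch, under the assumption that importing `spacy.ml` runs the registrations:

```python
import spacy.ml  # noqa: F401 - importing the package runs @register_architecture
from spacy.util import make_layer

cfg = {
    "arch": "spacy.LayerNormalizedMaxout.v1",
    "config": {"width": 96, "pieces": 3},
}
layer = make_layer(cfg)  # looks up cfg["arch"], calls it on cfg["config"]
print(layer.nO)  # 96
```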

spacy/ml/tok2vec.py (new file, 141 lines)
```python
from __future__ import unicode_literals

from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
from thinc.api import noop, with_square_sequences
from thinc.v2v import Maxout
from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow
from thinc.misc import Residual, LayerNorm, FeatureExtracter

from ..util import make_layer, register_architecture
from ._wire import concatenate_lists
from .common import *


@register_architecture("spacy.Tok2Vec.v1")
def Tok2Vec(config):
    doc2feats = make_layer(config["@doc2feats"])
    embed = make_layer(config["@embed"])
    encode = make_layer(config["@encode"])
    tok2vec = chain(doc2feats, with_flatten(chain(embed, encode)))
    tok2vec.cfg = config
    tok2vec.nO = encode.nO
    tok2vec.embed = embed
    tok2vec.encode = encode
    return tok2vec


@register_architecture("spacy.Doc2Feats.v1")
def Doc2Feats(config):
    columns = config["columns"]
    return FeatureExtracter(columns)


@register_architecture("spacy.MultiHashEmbed.v1")
def MultiHashEmbed(config):
    cols = config["columns"]
    width = config["width"]
    rows = config["rows"]

    tables = [HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")]
    if config["use_subwords"]:
        for feature in ["PREFIX", "SUFFIX", "SHAPE"]:
            tables.append(
                HashEmbed(
                    width,
                    rows // 2,
                    column=cols.index(feature),
                    name="embed_%s" % feature.lower(),
                )
            )
    if config.get("@pretrained_vectors"):
        tables.append(make_layer(config["@pretrained_vectors"]))
    mix = make_layer(config["@mix"])
    # This is a pretty ugly hack. Not sure what the best solution should be.
    mix._layers[0].nI = sum(table.nO for table in tables)
    layer = uniqued(chain(concatenate(*tables), mix), column=cols.index("ORTH"))
    layer.cfg = config
    return layer


@register_architecture("spacy.CharacterEmbed.v1")
def CharacterEmbed(config):
    width = config["width"]
    chars = config["chars"]

    chr_embed = CharacterEmbed(nM=width, nC=chars)
    other_tables = make_layer(config["@embed_features"])
    mix = make_layer(config["@mix"])

    model = chain(concatenate_lists(chr_embed, other_tables), mix)
    model.cfg = config
    return model


@register_architecture("spacy.MaxoutWindowEncoder.v1")
def MaxoutWindowEncoder(config):
    nO = config["width"]
    nW = config["window_size"]
    nP = config["pieces"]
    depth = config["depth"]

    cnn = chain(
        ExtractWindow(nW=nW),
        Maxout(nO, nO * ((nW * 2) + 1), pieces=nP),
        LayerNorm(nO=nO),
    )
    model = clone(Residual(cnn), depth)
    model.nO = nO
    return model


@register_architecture("spacy.PretrainedVectors.v1")
def PretrainedVectors(config):
    return StaticVectors(config["vectors_name"], config["width"], config["column"])


@register_architecture("spacy.TorchBiLSTMEncoder.v1")
def TorchBiLSTMEncoder(config):
    import torch.nn
    from thinc.extra.wrappers import PyTorchWrapperRNN

    width = config["width"]
    depth = config["depth"]
    if depth == 0:
        return layerize(noop())
    return with_square_sequences(
        PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
    )


_EXAMPLE_CONFIG = {
    "@doc2feats": {
        "arch": "Doc2Feats",
        "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
    },
    "@embed": {
        "arch": "spacy.MultiHashEmbed.v1",
        "config": {
            "width": 96,
            "rows": 2000,
            "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
            "use_subwords": True,
            "@pretrained_vectors": {
                "arch": "TransformedStaticVectors",
                "config": {
                    "vectors_name": "en_vectors_web_lg.vectors",
                    "width": 96,
                    "column": 0,
                },
            },
            "@mix": {
                "arch": "LayerNormalizedMaxout",
                "config": {"width": 96, "pieces": 3},
            },
        },
    },
    "@encode": {
        "arch": "MaxoutWindowEncode",
        "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
    },
}
```
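Editor's note, two hedged observations on this file: `_EXAMPLE_CONFIG` uses unversioned arch names (`"Doc2Feats"`, `"LayerNormalizedMaxout"`, `"MaxoutWindowEncode"`, `"TransformedStaticVectors"`) that don't match the names registered above, so feeding it to the registry as-is would raise KeyError. And inside `CharacterEmbed`, the call `CharacterEmbed(nM=width, nC=chars)` resolves to this same config-taking function rather than a character-embedding layer, so the `spacy.CharacterEmbed.v1` path looks unusable as written. A sketch of resolving one of the registered names (assuming spaCy at this commit):

```python
import spacy.ml  # noqa: F401 - triggers the @register_architecture calls
from spacy.util import make_layer

doc2feats = make_layer({
    "arch": "spacy.Doc2Feats.v1",
    "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
})
```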

spacy/util.py

```diff
@@ -142,6 +142,11 @@ def register_architecture(name, arch=None):
     return do_registration
 
 
+def make_layer(arch_config):
+    arch_func = get_architecture(arch_config["arch"])
+    return arch_func(arch_config["config"])
+
+
 def get_architecture(name):
     """Get a model architecture function by name. Raises a KeyError if the
     architecture is not found.
```
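Editor's note: together with `register_architecture` (already in this file) and `get_architecture`, `make_layer` completes the registry contract: any callable that takes a config dict can be registered under a versioned name and resolved from data. A hedged sketch with a hypothetical custom architecture:

```python
from thinc.v2v import Maxout
from spacy.util import register_architecture, make_layer

@register_architecture("custom.MyMaxout.v1")  # hypothetical name
def MyMaxout(config):
    # Any thinc model works; the registry only cares about the config dict.
    return Maxout(config["width"], config["width"], pieces=config["pieces"])

layer = make_layer(
    {"arch": "custom.MyMaxout.v1", "config": {"width": 64, "pieces": 3}}
)
```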