Tok2Vec: extract-embed-encode (#5102)

* avoid changing original config * fix elif structure, batch with just int crashes otherwise * tok2vec example with doc2feats, encode and embed architectures * further clean up MultiHashEmbed * further generalize Tok2Vec to work with extract-embed-encode parts * avoid initializing the charembed layer with Docs (for now ?) * small fixes for bilstm config (still does not run) * rename to core layer * move new configs * walk model to set nI instead of using core ref * fix senter overfitting test to be more similar to the training data (avoid flakey behaviour)
2025-11-08 03:47:39 +03:00 · 2020-03-08 13:23:18 +01:00 · 2020-03-08 13:23:18 +01:00 · 5847be6022
commit 5847be6022
parent c95ce96c44
10 changed files with 227 additions and 141 deletions
--- a/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
+++ b/examples/experiments/ptb-joint-pos-dep/bilstm_tok2vec.cfg
@ -62,4 +62,4 @@ width = 96
 depth = 4
 embed_size = 2000
 subword_features = true
-char_embed = false
+maxout_pieces = 3
--- a/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/charembed_tok2vec.cfg
@ -0,0 +1,65 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+batch_size = 25
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[nlp.pipeline.tok2vec.model.extract]
+@architectures = "spacy.CharacterEmbed.v1"
+width = 96
+nM = 64
+nC = 8
+rows = 2000
+columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+
+[nlp.pipeline.tok2vec.model.extract.features]
+@architectures = "spacy.Doc2Feats.v1"
+columns = ${nlp.pipeline.tok2vec.model.extract:columns}
+
+[nlp.pipeline.tok2vec.model.embed]
+@architectures = "spacy.LayerNormalizedMaxout.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+maxout_pieces = 4
+
+[nlp.pipeline.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
+window_size = 1
+maxout_pieces = 2
+depth = 2
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model.extract:width}
--- a/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
+++ b/examples/experiments/tok2vec-ner/multihashembed_tok2vec.cfg
@ -0,0 +1,65 @@
+[training]
+use_gpu = -1
+limit = 0
+dropout = 0.2
+patience = 10000
+eval_frequency = 200
+scores = ["ents_f"]
+score_weights = {"ents_f": 1}
+orth_variant_level = 0.0
+gold_preproc = true
+max_length = 0
+batch_size = 25
+
+[optimizer]
+@optimizers = "Adam.v1"
+learn_rate = 0.001
+beta1 = 0.9
+beta2 = 0.999
+
+[nlp]
+lang = "en"
+vectors = null
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.Tok2Vec.v1"
+
+[nlp.pipeline.tok2vec.model.extract]
+@architectures = "spacy.Doc2Feats.v1"
+columns = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
+
+[nlp.pipeline.tok2vec.model.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+columns = ${nlp.pipeline.tok2vec.model.extract:columns}
+width = 96
+rows = 2000
+use_subwords = true
+pretrained_vectors = null
+
+[nlp.pipeline.tok2vec.model.embed.mix]
+@architectures = "spacy.LayerNormalizedMaxout.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
+maxout_pieces = 3
+
+[nlp.pipeline.tok2vec.model.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model.embed:width}
--- a/spacy/language.py
+++ b/spacy/language.py
@ -337,13 +337,14 @@ class Language(object):
        default_config = self.defaults.get(name, None)

        # transform the model's config to an actual Model
+        factory_cfg = dict(config)
        model_cfg = None
-        if "model" in config:
-            model_cfg = config["model"]
+        if "model" in factory_cfg:
+            model_cfg = factory_cfg["model"]
            if not isinstance(model_cfg, dict):
                warnings.warn(Warnings.W099.format(type=type(model_cfg), pipe=name))
                model_cfg = None
-            del config["model"]
+            del factory_cfg["model"]
        if model_cfg is None and default_config is not None:
            warnings.warn(Warnings.W098.format(name=name))
            model_cfg = default_config["model"]
@ -353,7 +354,7 @@ class Language(object):
            model = registry.make_from_config({"model": model_cfg}, validate=True)[
                "model"
            ]
-        return factory(self, model, **config)
+        return factory(self, model, **factory_cfg)

    def add_pipe(
        self, component, name=None, before=None, after=None, first=None, last=None
--- a/spacy/ml/_character_embed.py
+++ b/spacy/ml/_character_embed.py
@ -21,7 +21,7 @@ def init(model, X=None, Y=None):


 def forward(model, docs, is_train):
-    if not docs:
+    if docs is None:
        return []
    ids = []
    output = []
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@ -4,7 +4,7 @@ from thinc.api import HashEmbed, StaticVectors, PyTorchLSTM
 from thinc.api import residual, LayerNorm, FeatureExtractor, Mish

 from ... import util
-from ...util import registry, make_layer
+from ...util import registry
 from ...ml import _character_embed
 from ...pipeline.tok2vec import Tok2VecListener
 from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
@ -23,15 +23,14 @@ def get_vocab_vectors(name):


@registry.architectures.register("spacy.Tok2Vec.v1")
-def Tok2Vec(config):
-    doc2feats = make_layer(config["@doc2feats"])
-    embed = make_layer(config["@embed"])
-    encode = make_layer(config["@encode"])
+def Tok2Vec(extract, embed, encode):
    field_size = 0
-    if encode.has_attr("receptive_field"):
+    if encode.attrs.get("receptive_field", None):
        field_size = encode.attrs["receptive_field"]
-    tok2vec = chain(doc2feats, with_array(chain(embed, encode), pad=field_size))
-    tok2vec.attrs["cfg"] = config
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        if extract.has_dim("nO"):
+            _set_dims(embed, "nI", extract.get_dim("nO"))
+        tok2vec = extract >> with_array(embed >> encode, pad=field_size)
    tok2vec.set_dim("nO", encode.get_dim("nO"))
    tok2vec.set_ref("embed", embed)
    tok2vec.set_ref("encode", encode)
@ -39,8 +38,7 @@ def Tok2Vec(config):


@registry.architectures.register("spacy.Doc2Feats.v1")
-def Doc2Feats(config):
-    columns = config["columns"]
+def Doc2Feats(columns):
    return FeatureExtractor(columns)


@ -79,8 +77,8 @@ def hash_charembed_cnn(
    maxout_pieces,
    window_size,
    subword_features,
-    nM=0,
-    nC=0,
+    nM,
+    nC,
 ):
    # Allows using character embeddings by setting nC, nM and char_embed=True
    return build_Tok2Vec_model(
@ -100,7 +98,7 @@ def hash_charembed_cnn(

@registry.architectures.register("spacy.HashEmbedBiLSTM.v1")
 def hash_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features
+    pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces
 ):
    # Does not use character embeddings: set to False by default
    return build_Tok2Vec_model(
@ -109,7 +107,7 @@ def hash_embed_bilstm_v1(
        pretrained_vectors=pretrained_vectors,
        bilstm_depth=depth,
        conv_depth=0,
-        maxout_pieces=0,
+        maxout_pieces=maxout_pieces,
        window_size=1,
        subword_features=subword_features,
        char_embed=False,
@ -120,7 +118,7 @@ def hash_embed_bilstm_v1(

@registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1")
 def hash_char_embed_bilstm_v1(
-    pretrained_vectors, width, depth, embed_size, subword_features, nM=0, nC=0
+    pretrained_vectors, width, depth, embed_size, subword_features, nM, nC, maxout_pieces
 ):
    # Allows using character embeddings by setting nC, nM and char_embed=True
    return build_Tok2Vec_model(
@ -129,7 +127,7 @@ def hash_char_embed_bilstm_v1(
        pretrained_vectors=pretrained_vectors,
        bilstm_depth=depth,
        conv_depth=0,
-        maxout_pieces=0,
+        maxout_pieces=maxout_pieces,
        window_size=1,
        subword_features=subword_features,
        char_embed=True,
@ -138,104 +136,99 @@ def hash_char_embed_bilstm_v1(
    )


-@registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(config):
-    # For backwards compatibility with models before the architecture registry,
-    # we have to be careful to get exactly the same model structure. One subtle
-    # trick is that when we define concatenation with the operator, the operator
-    # is actually binary associative. So when we write (a | b | c), we're actually
-    # getting concatenate(concatenate(a, b), c). That's why the implementation
-    # is a bit ugly here.
-    cols = config["columns"]
-    width = config["width"]
-    rows = config["rows"]
+@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
+def LayerNormalizedMaxout(width, maxout_pieces):
+    return Maxout(
+        nO=width,
+        nP=maxout_pieces,
+        dropout=0.0,
+        normalize=True,
+    )

-    norm = HashEmbed(width, rows, column=cols.index("NORM"))
-    if config["use_subwords"]:
-        prefix = HashEmbed(width, rows // 2, column=cols.index("PREFIX"))
-        suffix = HashEmbed(width, rows // 2, column=cols.index("SUFFIX"))
-        shape = HashEmbed(width, rows // 2, column=cols.index("SHAPE"))
-    if config.get("@pretrained_vectors"):
-        glove = make_layer(config["@pretrained_vectors"])
-    mix = make_layer(config["@mix"])
+
+@registry.architectures.register("spacy.MultiHashEmbed.v1")
+def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix):
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
+    if use_subwords:
+        prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"))
+        suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"))
+        shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"))
+
+    if pretrained_vectors:
+        glove = StaticVectors(
+            vectors=pretrained_vectors.data,
+            nO=width,
+            column=columns.index(ID),
+            dropout=0.0,
+        )

    with Model.define_operators({">>": chain, "|": concatenate}):
-        if config["use_subwords"] and config["@pretrained_vectors"]:
-            mix._layers[0].set_dim("nI", width * 5)
-            layer = uniqued(
-                (glove | norm | prefix | suffix | shape) >> mix,
-                column=cols.index("ORTH"),
-            )
-        elif config["use_subwords"]:
-            mix._layers[0].set_dim("nI", width * 4)
-            layer = uniqued(
-                (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
-            )
-        elif config["@pretrained_vectors"]:
-            mix._layers[0].set_dim("nI", width * 2)
-            layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"))
+        if not use_subwords and not pretrained_vectors:
+            embed_layer = norm
        else:
-            layer = norm
-    layer.attrs["cfg"] = config
-    return layer
+            if use_subwords and pretrained_vectors:
+                nr_columns = 5
+                concat_columns = glove | norm | prefix | suffix | shape
+            elif use_subwords:
+                nr_columns = 4
+                concat_columns = norm | prefix | suffix | shape
+            else:
+                nr_columns = 2
+                concat_columns = glove | norm

+            _set_dims(mix, "nI", width * nr_columns)
+            embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH"))
+
+    return embed_layer
+
+
+def _set_dims(model, name, value):
+    # Loop through the model to set a specific dimension if its unset on any layer.
+    for node in model.walk():
+        if node.has_dim(name) is None:
+            node.set_dim(name, value)

@registry.architectures.register("spacy.CharacterEmbed.v1")
-def CharacterEmbed(config):
-    width = config["width"]
-    chars = config["chars"]
-
-    chr_embed = _character_embed.CharacterEmbed(nM=width, nC=chars)
-    other_tables = make_layer(config["@embed_features"])
-    mix = make_layer(config["@mix"])
-
-    model = chain(concatenate(chr_embed, other_tables), mix)
-    model.attrs["cfg"] = config
-    return model
+def CharacterEmbed(columns, width, rows, nM, nC, features):
+    norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"))
+    chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC)
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        embed_layer = chr_embed | features >> with_array(norm)
+    embed_layer.set_dim("nO", nM * nC + width)
+    return embed_layer


@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
-def MaxoutWindowEncoder(config):
-    nO = config["width"]
-    nW = config["window_size"]
-    nP = config["pieces"]
-    depth = config["depth"]
-
-    cnn = (
-        expand_window(window_size=nW),
-        Maxout(nO=nO, nI=nO * ((nW * 2) + 1), nP=nP, dropout=0.0, normalize=True),
+def MaxoutWindowEncoder(width, window_size, maxout_pieces, depth):
+    cnn = chain(
+        expand_window(window_size=window_size),
+        Maxout(nO=width, nI=width * ((window_size * 2) + 1), nP=maxout_pieces, dropout=0.0, normalize=True),
    )
    model = clone(residual(cnn), depth)
-    model.set_dim("nO", nO)
-    model.attrs["receptive_field"] = nW * depth
+    model.set_dim("nO", width)
+    model.attrs["receptive_field"] = window_size * depth
    return model


@registry.architectures.register("spacy.MishWindowEncoder.v1")
-def MishWindowEncoder(config):
-    nO = config["width"]
-    nW = config["window_size"]
-    depth = config["depth"]
-
+def MishWindowEncoder(width, window_size, depth):
    cnn = chain(
-        expand_window(window_size=nW),
-        Mish(nO=nO, nI=nO * ((nW * 2) + 1)),
-        LayerNorm(nO),
+        expand_window(window_size=window_size),
+        Mish(nO=width, nI=width * ((window_size * 2) + 1)),
+        LayerNorm(width),
    )
    model = clone(residual(cnn), depth)
-    model.set_dim("nO", nO)
+    model.set_dim("nO", width)
    return model


@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
-def TorchBiLSTMEncoder(config):
+def TorchBiLSTMEncoder(width, depth):
    import torch.nn

    # TODO FIX
    from thinc.api import PyTorchRNNWrapper

-    width = config["width"]
-    depth = config["depth"]
    if depth == 0:
        return noop()
    return with_padded(
@ -243,40 +236,6 @@ def TorchBiLSTMEncoder(config):
    )


-# TODO: update
-_EXAMPLE_CONFIG = {
-    "@doc2feats": {
-        "arch": "Doc2Feats",
-        "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
-    },
-    "@embed": {
-        "arch": "spacy.MultiHashEmbed.v1",
-        "config": {
-            "width": 96,
-            "rows": 2000,
-            "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
-            "use_subwords": True,
-            "@pretrained_vectors": {
-                "arch": "TransformedStaticVectors",
-                "config": {
-                    "vectors_name": "en_vectors_web_lg.vectors",
-                    "width": 96,
-                    "column": 0,
-                },
-            },
-            "@mix": {
-                "arch": "LayerNormalizedMaxout",
-                "config": {"width": 96, "pieces": 3},
-            },
-        },
-    },
-    "@encode": {
-        "arch": "MaxoutWindowEncode",
-        "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
-    },
-}
-
-
 def build_Tok2Vec_model(
    width,
    embed_size,
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
--- a/spacy/pipeline/tok2vec.py
+++ b/spacy/pipeline/tok2vec.py
@ -131,9 +131,10 @@ class Tok2Vec(Pipe):
        get_examples (function): Function returning example training data.
        pipeline (list): The pipeline the model is part of.
        """
-        # TODO: use examples instead ?
-        docs = [Doc(Vocab(), words=["hello"])]
-        self.model.initialize(X=docs)
+        # TODO: charembed does not play nicely with dim inference yet
+        # docs = [Doc(Vocab(), words=["hello"])]
+        # self.model.initialize(X=docs)
+        self.model.initialize()
        link_vectors_to_models(self.vocab)


--- a/spacy/tests/pipeline/test_senter.py
+++ b/spacy/tests/pipeline/test_senter.py
@ -36,17 +36,17 @@ def test_overfitting_IO():
    assert losses["senter"] < 0.0001

    # test the trained model
-    test_text = "I like eggs. There is ham. She likes ham."
+    test_text = "I like purple eggs. They eat ham. You like yellow eggs."
    doc = nlp(test_text)
-    gold_sent_starts = [0] * 12
+    gold_sent_starts = [0] * 14
    gold_sent_starts[0] = 1
-    gold_sent_starts[4] = 1
-    gold_sent_starts[8] = 1
-    assert gold_sent_starts == [int(t.is_sent_start) for t in doc]
+    gold_sent_starts[5] = 1
+    gold_sent_starts[9] = 1
+    assert [int(t.is_sent_start) for t in doc] == gold_sent_starts

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        doc2 = nlp2(test_text)
-        assert gold_sent_starts == [int(t.is_sent_start) for t in doc2]
+        assert [int(t.is_sent_start) for t in doc2] == gold_sent_starts
--- a/spacy/util.py
+++ b/spacy/util.py
@ -79,11 +79,6 @@ def set_lang_class(name, cls):
    registry.languages.register(name, func=cls)


-def make_layer(arch_config):
-    arch_func = registry.architectures.get(arch_config["arch"])
-    return arch_func(arch_config["config"])
-
-
 def ensure_path(path):
    """Ensure string is converted to a Path.

@ -563,7 +558,7 @@ def minibatch_by_words(examples, size, tuples=True, count_words=len):
    """Create minibatches of a given number of words."""
    if isinstance(size, int):
        size_ = itertools.repeat(size)
-    if isinstance(size, List):
+    elif isinstance(size, List):
        size_ = iter(size)
    else:
        size_ = size