From 9e210fa7fdb8e376655e7a7ab7debd3ffd718a63 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 28 Oct 2019 23:59:03 +0100
Subject: [PATCH] Fix tok2vec structure after model registry refactor (#4549)

The model registry refactor of the Tok2Vec function broke loading models
trained with the previous function, because the model tree was slightly
different. Specifically, the new function wrote:

    concatenate(norm, prefix, suffix, shape)

to build the embedding layer. In the previous implementation, I had used
the operator overloading shortcut:

    (norm | prefix | suffix | shape)

Because the | operator is binary and left-associative, this gets mapped to
nested pairwise calls:

    concatenate(concatenate(concatenate(norm, prefix), suffix), shape)

This is a different tree, so the layers iterate in a different order and
the weights were loaded incorrectly.
---
 spacy/ml/tok2vec.py | 52 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
index 0e57cfb73..d24b9d0c7 100644
--- a/spacy/ml/tok2vec.py
+++ b/spacy/ml/tok2vec.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 
 from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
 from thinc.api import noop, with_square_sequences
-from thinc.v2v import Maxout
+from thinc.v2v import Maxout, Model
 from thinc.i2v import HashEmbed, StaticVectors
 from thinc.t2t import ExtractWindow
 from thinc.misc import Residual, LayerNorm, FeatureExtracter
@@ -33,27 +33,49 @@ def Doc2Feats(config):
 
 @register_architecture("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(config):
+    # For backwards compatibility with models trained before the architecture
+    # registry, we have to be careful to get exactly the same model structure.
+    # One subtle trick is that the concatenation operator is binary and
+    # left-associative, so when we write (a | b | c) we actually get
+    # concatenate(concatenate(a, b), c). That's why the implementation
+    # is a bit ugly here.
     cols = config["columns"]
     width = config["width"]
     rows = config["rows"]
 
-    tables = [HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")]
+    norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
     if config["use_subwords"]:
-        for feature in ["PREFIX", "SUFFIX", "SHAPE"]:
-            tables.append(
-                HashEmbed(
-                    width,
-                    rows // 2,
-                    column=cols.index(feature),
-                    name="embed_%s" % feature.lower(),
-                )
-            )
+        prefix = HashEmbed(width, rows // 2,
+                           column=cols.index("PREFIX"), name="embed_prefix")
+        suffix = HashEmbed(width, rows // 2,
+                           column=cols.index("SUFFIX"), name="embed_suffix")
+        shape = HashEmbed(width, rows // 2,
+                          column=cols.index("SHAPE"), name="embed_shape")
     if config.get("@pretrained_vectors"):
-        tables.append(make_layer(config["@pretrained_vectors"]))
+        glove = make_layer(config["@pretrained_vectors"])
     mix = make_layer(config["@mix"])
-    # This is a pretty ugly hack. Not sure what the best solution should be.
-    mix._layers[0].nI = sum(table.nO for table in tables)
-    layer = uniqued(chain(concatenate(*tables), mix), column=cols.index("ORTH"))
+
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        if config["use_subwords"] and config.get("@pretrained_vectors"):
+            mix._layers[0].nI = width * 5
+            layer = uniqued(
+                (glove | norm | prefix | suffix | shape) >> mix,
+                column=cols.index("ORTH")
+            )
+        elif config["use_subwords"]:
+            mix._layers[0].nI = width * 4
+            layer = uniqued(
+                (norm | prefix | suffix | shape) >> mix,
+                column=cols.index("ORTH")
+            )
+        elif config.get("@pretrained_vectors"):
+            mix._layers[0].nI = width * 2
+            layer = uniqued(
+                (glove | norm) >> mix,
+                column=cols.index("ORTH"),
+            )
+        else:
+            layer = norm
     layer.cfg = config
     return layer
 
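
The sketch below is not part of the patch. It is a minimal, self-contained illustration of why the two spellings build different trees: the Layer and Concat classes and the walk helper are hypothetical stand-ins for thinc layers, its n-ary concatenate(...) call, and the binary | operator, so the iteration order of the two shapes can be compared directly.

class Layer(object):
    def __init__(self, name):
        self.name = name
        self._layers = []

    def __or__(self, other):
        # Binary and left-associative: a | b | c builds (a | b) | c.
        return Concat(self, other)

    def __repr__(self):
        return self.name


class Concat(Layer):
    def __init__(self, *layers):
        Layer.__init__(self, "concatenate")
        self._layers = list(layers)


def walk(layer):
    # Depth-first traversal, standing in for iterating a model's sublayers.
    yield layer
    for child in layer._layers:
        for node in walk(child):
            yield node


norm, prefix, suffix, shape = (Layer(n) for n in ("norm", "prefix", "suffix", "shape"))

flat = Concat(norm, prefix, suffix, shape)   # registry version: one n-ary call
nested = norm | prefix | suffix | shape      # old version: binary | operator

print([repr(node) for node in walk(flat)])
# ['concatenate', 'norm', 'prefix', 'suffix', 'shape']
print([repr(node) for node in walk(nested)])
# ['concatenate', 'concatenate', 'concatenate', 'norm', 'prefix', 'suffix', 'shape']

Because the two trees yield their sublayers in different orders, matching saved weights to layers by iterating the model tree (as the commit message describes) pairs them up differently, which is why the patch reproduces the old, nested structure instead of the flat one.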