Fix tok2vec structure after model registry refactor (#4549)

The model registry refactor of the Tok2Vec function broke loading models
trained with the previous function, because the model tree was slightly
different. Specifically, the new function wrote:

    concatenate(norm, prefix, suffix, shape)

to build the embedding layer. In the previous implementation, I had used
the operator-overloading shortcut:

    ( norm | prefix | suffix | shape )

The | operator is binary and left-associative, so this actually gets
mapped to nested concatenations, giving something like:

    concatenate(concatenate(concatenate(norm, prefix), suffix), shape)

This is a different tree, so the layers iterate in a different order and
the weights were loaded incorrectly.
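
As an illustration, here is a toy sketch (not thinc's actual implementation;
the Layer class is made up for the example) of why the two spellings build
different trees: the | operator is overloaded to call a binary concatenate,
so each use of it adds one level of nesting.

    # Toy sketch: concatenate() just records its children, and | is overloaded
    # to call the binary form, the way the operator shortcut does.
    class Layer:
        def __init__(self, name, children=()):
            self.name = name
            self.children = list(children)

        def __or__(self, other):
            # a | b  ->  concatenate(a, b): one extra level of nesting per |
            return concatenate(self, other)

    def concatenate(*layers):
        return Layer("concatenate", layers)

    norm, prefix, suffix, shape = (Layer(n) for n in ("norm", "prefix", "suffix", "shape"))

    flat = concatenate(norm, prefix, suffix, shape)   # one node with four children
    nested = norm | prefix | suffix | shape           # ((norm | prefix) | suffix) | shape

    print([child.name for child in flat.children])    # ['norm', 'prefix', 'suffix', 'shape']
    print([child.name for child in nested.children])  # ['concatenate', 'shape']

Walking the two trees therefore visits the embedding tables in a different
order and through different intermediate nodes, which is why weights saved
from one structure end up in the wrong places when loaded into the other.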
Matthew Honnibal, 2019-10-28 23:59:03 +01:00 (committed by GitHub)
parent bade60fe64
commit 9e210fa7fd


@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
 from thinc.api import noop, with_square_sequences
-from thinc.v2v import Maxout
+from thinc.v2v import Maxout, Model
 from thinc.i2v import HashEmbed, StaticVectors
 from thinc.t2t import ExtractWindow
 from thinc.misc import Residual, LayerNorm, FeatureExtracter
@@ -33,27 +33,49 @@ def Doc2Feats(config):
 @register_architecture("spacy.MultiHashEmbed.v1")
 def MultiHashEmbed(config):
+    # For backwards compatibility with models before the architecture registry,
+    # we have to be careful to get exactly the same model structure. One subtle
+    # trick is that when we define concatenation with the operator, the operator
+    # is actually binary associative. So when we write (a | b | c), we're actually
+    # getting concatenate(concatenate(a, b), c). That's why the implementation
+    # is a bit ugly here.
     cols = config["columns"]
     width = config["width"]
     rows = config["rows"]
-    tables = [HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")]
+    norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
     if config["use_subwords"]:
-        for feature in ["PREFIX", "SUFFIX", "SHAPE"]:
-            tables.append(
-                HashEmbed(
-                    width,
-                    rows // 2,
-                    column=cols.index(feature),
-                    name="embed_%s" % feature.lower(),
-                )
-            )
+        prefix = HashEmbed(width, rows // 2,
+            column=cols.index("PREFIX"), name="embed_prefix")
+        suffix = HashEmbed(width, rows // 2,
+            column=cols.index("SUFFIX"), name="embed_suffix")
+        shape = HashEmbed(width, rows // 2,
+            column=cols.index("SHAPE"), name="embed_shape")
     if config.get("@pretrained_vectors"):
-        tables.append(make_layer(config["@pretrained_vectors"]))
+        glove = make_layer(config["@pretrained_vectors"])
     mix = make_layer(config["@mix"])
-    # This is a pretty ugly hack. Not sure what the best solution should be.
-    mix._layers[0].nI = sum(table.nO for table in tables)
-    layer = uniqued(chain(concatenate(*tables), mix), column=cols.index("ORTH"))
+    with Model.define_operators({">>": chain, "|": concatenate}):
+        if config["use_subwords"] and config["@pretrained_vectors"]:
+            mix._layers[0].nI = width * 5
+            layer = uniqued(
+                (glove | norm | prefix | suffix | shape) >> mix,
+                column=cols.index("ORTH")
+            )
+        elif config["use_subwords"]:
+            mix._layers[0].nI = width * 4
+            layer = uniqued(
+                (norm | prefix | suffix | shape) >> mix,
+                column=cols.index("ORTH")
+            )
+        elif config["@pretrained_vectors"]:
+            mix._layers[0].nI = width * 2
+            embed = uniqued(
+                (glove | norm) >> mix,
+                column=cols.index("ORTH"),
+            )
+        else:
+            embed = norm
     layer.cfg = config
     return layer
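
A note on the mix._layers[0].nI bookkeeping in the branches above: the first
layer of the mix network receives the concatenated embedding as input, so its
input width has to be the table width times the number of tables being joined.
A minimal sketch of that arithmetic, using a hypothetical helper name that is
not part of spaCy:

    # Hypothetical helper (illustration only): input width of the mix network's
    # first layer is width times the number of concatenated embedding tables.
    def mix_input_width(width, use_subwords, use_pretrained_vectors):
        n_tables = 1          # NORM is always embedded
        if use_subwords:
            n_tables += 3     # PREFIX, SUFFIX, SHAPE
        if use_pretrained_vectors:
            n_tables += 1     # static vectors ("glove")
        return width * n_tables

    assert mix_input_width(96, True, True) == 96 * 5    # subwords + vectors
    assert mix_input_width(96, True, False) == 96 * 4   # subwords only
    assert mix_input_width(96, False, True) == 96 * 2   # vectors only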