From 8ec79ad3fadd97f39b220c874e0df46921646fd0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 5 Oct 2020 15:22:00 +0200
Subject: [PATCH] Allow configuration of MultiHashEmbed features

Update the arguments to the MultiHashEmbed layer so that the embedded
attributes can be controlled. A somewhat tricky scheme is used to allow
optional specification of the number of rows per attribute. I think it's an
okay balance between flexibility and convenience.
---
 spacy/ml/models/tok2vec.py | 100 +++++++++++++++++++++++++------------
 spacy/tests/test_models.py |  32 +++++++++++-
 2 files changed, 98 insertions(+), 34 deletions(-)

diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py
index 1a0979cab..4abc1bee6 100644
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@@ -1,4 +1,4 @@
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict
 from thinc.types import Floats2d
 from thinc.api import chain, clone, concatenate, with_array, with_padded
 from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed
@@ -11,7 +11,7 @@ from ...ml import _character_embed
 from ..staticvectors import StaticVectors
 from ..featureextractor import FeatureExtractor
 from ...pipeline.tok2vec import Tok2VecListener
-from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
+from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr
 
 
 @registry.architectures.register("spacy.Tok2VecListener.v1")
@@ -54,12 +54,16 @@ def build_hash_embed_cnn_tok2vec(
         a language such as Chinese.
     pretrained_vectors (bool): Whether to also use static vectors.
     """
+    if subword_features:
+        attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5}
+    else:
+        attrs = {"NORM": 1.0}
     return build_Tok2Vec_model(
         embed=MultiHashEmbed(
             width=width,
             rows=embed_size,
-            also_embed_subwords=subword_features,
-            also_use_static_vectors=bool(pretrained_vectors),
+            attrs=attrs,
+            include_static_vectors=bool(pretrained_vectors),
         ),
         encode=MaxoutWindowEncoder(
             width=width,
@@ -92,59 +96,89 @@ def build_Tok2Vec_model(
 
 
 @registry.architectures.register("spacy.MultiHashEmbed.v1")
-def MultiHashEmbed(
+def MultiHashEmbed_v1(
     width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool
+) -> Model[List[Doc], List[Floats2d]]:
+    """Previous interface for MultiHashEmbed. This should be removed; it's only
+    here for temporary backwards compatibility."""
+    return MultiHashEmbed(
+        width=width,
+        rows=rows,
+        attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM],
+        include_static_vectors=also_use_static_vectors
+    )
+
+@registry.architectures.register("spacy.MultiHashEmbed.v2")
+def MultiHashEmbed(
+    width: int,
+    rows: int,
+    attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]],
+    include_static_vectors: bool
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedding layer that separately embeds a number of lexical
     attributes using hash embedding, concatenates the results, and passes it
     through a feed-forward subnetwork to build a mixed representations.
 
-    The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have
-    varying definitions depending on the Vocab of the Doc object passed in.
-    Vectors from pretrained static vectors can also be incorporated into the
-    concatenated representation.
+    The features used can be configured with the 'attrs' argument. The suggested
+    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+    account some subword information, without constructing a fully character-based
+    representation. If pretrained vectors are available, they can be included in
+    the representation as well; the vectors table will be kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifies the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. You can vary the number of rows per
+    attribute by specifying `attrs` as a dict, mapping the keys to float
+    values which are interpreted as factors of `rows`. For instance,
+    attrs={"NORM": 1.0, "PREFIX": 0.2} will use rows*1 for the NORM table and
+    rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are
+    assumed for all attributes.
 
     width (int): The output width. Also used as the width of the embedding tables.
         Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    rows (int): The base number of rows for the embedding tables. Can be low, due
+        to the hashing trick. The rows can be varied per attribute by providing
+        a dictionary as the value of `attrs`.
+    attrs (dict or list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute. Attributes
+        can be specified as a list or as a dictionary, which lets you control
+        the number of rows used for each table.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+    if isinstance(attrs, dict):
+        # Exclude tables that would have 0 rows.
+        attrs = {key: value for key, value in attrs.items() if value > 0.0}
+    indices = {attr: i for i, attr in enumerate(attrs)}
     seed = 7
 
     def make_hash_embed(feature):
         nonlocal seed
+        row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0
         seed += 1
         return HashEmbed(
             width,
-            rows if feature == LOWER else rows // 2,
-            column=cols.index(feature),
+            int(rows * row_factor),
+            column=indices[feature],
             seed=seed,
             dropout=0.0,
         )
 
-    if also_embed_subwords:
-        embeddings = [
-            make_hash_embed(LOWER),
-            make_hash_embed(PREFIX),
-            make_hash_embed(SUFFIX),
-            make_hash_embed(SHAPE),
-        ]
-    else:
-        embeddings = [make_hash_embed(LOWER)]
-    concat_size = width * (len(embeddings) + also_use_static_vectors)
-    if also_use_static_vectors:
+    embeddings = [make_hash_embed(attr) for attr in attrs]
+    concat_size = width * (len(embeddings) + include_static_vectors)
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(
-                    FeatureExtractor(cols),
+                    FeatureExtractor(list(attrs)),
                     list2ragged(),
                     with_array(concatenate(*embeddings)),
                 ),
@@ -155,7 +189,7 @@ def MultiHashEmbed(
         )
     else:
         model = chain(
-            FeatureExtractor(cols),
+            FeatureExtractor(list(attrs)),
             list2ragged(),
             with_array(concatenate(*embeddings)),
             with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)),
diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py
index a123f459d..3bd3b903d 100644
--- a/spacy/tests/test_models.py
+++ b/spacy/tests/test_models.py
@@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal
 import numpy
 
 from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder
+from spacy.ml.models import MultiHashEmbed_v1
 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier
 from spacy.lang.en import English
 from spacy.lang.en.examples import sentences as EN_SENTENCES
@@ -61,7 +62,10 @@ def get_tok2vec_kwargs():
     # This actually creates models, so seems best to put it in a function.
     return {
         "embed": MultiHashEmbed(
-            width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False
+            width=32,
+            rows=500,
+            attrs=["NORM", "PREFIX", "SHAPE"],
+            include_static_vectors=False
         ),
         "encode": MaxoutWindowEncoder(
             width=32, depth=2, maxout_pieces=2, window_size=1
@@ -73,6 +77,32 @@ def test_tok2vec():
     return build_Tok2Vec_model(**get_tok2vec_kwargs())
 
 
+def test_multi_hash_embed():
+    embed = MultiHashEmbed(
+        width=32,
+        rows=500,
+        attrs=["NORM", "PREFIX", "SHAPE"],
+        include_static_vectors=False
+    )
+    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+    assert len(hash_embeds) == 3
+    # Check they look at different columns.
+    assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2]
+    # Check they use different seeds
+    assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3
+    # Check they all have the same number of rows
+    assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500]
+    # Now try with different row factors
+    embed = MultiHashEmbed(
+        width=32,
+        rows=500,
+        attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5},
+        include_static_vectors=False
+    )
+    hash_embeds = [node for node in embed.walk() if node.name == "hashembed"]
+    assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250]
+
+
 @pytest.mark.parametrize(
     "seed,model_func,kwargs",
     [