From 8ec79ad3fadd97f39b220c874e0df46921646fd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:22:00 +0200 Subject: [PATCH 01/16] Allow configuration of MultiHashEmbed features Update arguments to MultiHashEmbed layer so that the attributes can be controlled. A kind of tricky scheme is used to allow optional specification of the rows. I think it's an okay balance between flexibility and convenience. --- spacy/ml/models/tok2vec.py | 100 +++++++++++++++++++++++++------------ spacy/tests/test_models.py | 32 +++++++++++- 2 files changed, 98 insertions(+), 34 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 1a0979cab..4abc1bee6 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -1,4 +1,4 @@ -from typing import Optional, List, Union +from typing import Optional, List, Union, Dict from thinc.types import Floats2d from thinc.api import chain, clone, concatenate, with_array, with_padded from thinc.api import Model, noop, list2ragged, ragged2list, HashEmbed @@ -11,7 +11,7 @@ from ...ml import _character_embed from ..staticvectors import StaticVectors from ..featureextractor import FeatureExtractor from ...pipeline.tok2vec import Tok2VecListener -from ...attrs import ORTH, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr +from ...attrs import ORTH, NORM, LOWER, PREFIX, SUFFIX, SHAPE, intify_attr @registry.architectures.register("spacy.Tok2VecListener.v1") @@ -54,12 +54,16 @@ def build_hash_embed_cnn_tok2vec( a language such as Chinese. pretrained_vectors (bool): Whether to also use static vectors. """ + if subword_features: + attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5} + else: + attrs = {"NORM": 1.0} return build_Tok2Vec_model( embed=MultiHashEmbed( width=width, rows=embed_size, - also_embed_subwords=subword_features, - also_use_static_vectors=bool(pretrained_vectors), + attrs=attrs, + include_static_vectors=bool(pretrained_vectors), ), encode=MaxoutWindowEncoder( width=width, @@ -92,59 +96,89 @@ def build_Tok2Vec_model( @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed( +def MultiHashEmbed_v1( width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool +) -> Model[List[Doc], List[Floats2d]]: + """Previous interface for MultiHashEmbed. This should be removed, it's only + here as a temporary compatibility.""" + return MultiHashEmbed( + width=width, + rows=rows, + attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], + include_static_vectors=also_use_static_vectors + ) + +@registry.architectures.register("spacy.MultiHashEmbed.v2") +def MultiHashEmbed( + width: int, + rows: int, + attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], + include_static_vectors: bool ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it through a feed-forward subnetwork to build a mixed representations. - The features used are the LOWER, PREFIX, SUFFIX and SHAPE, which can have - varying definitions depending on the Vocab of the Doc object passed in. - Vectors from pretrained static vectors can also be incorporated into the - concatenated representation. + The features used can be configured with the 'attrs' argument. The suggested + attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into + account some subword information, without contruction a fully character-based + representation. 
If pretrained vectors are available, they can be included in
+    the representation as well, though the vectors table is kept static
+    (i.e. it's not updated).
+
+    The `width` parameter specifies the output width of the layer and the widths
+    of all embedding tables. If static vectors are included, a learned linear
+    layer is used to map the vectors to the specified width before concatenating
+    it with the other embedding outputs. A single Maxout layer is then used to
+    reduce the concatenated vectors to the final width.
+
+    The `rows` parameter controls the number of rows used by the `HashEmbed`
+    tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+    the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+    even for very large vocabularies. You can vary the number of rows per
+    attribute by specifying the attrs as a dict, mapping the keys to float
+    values which are interpreted as factors of `rows`. For instance,
+    attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and
+    rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are
+    assumed for all attributes.

    width (int): The output width. Also used as the width of the embedding
        tables. Recommended values are between 64 and 300.
-    rows (int): The number of rows for the embedding tables. Can be low, due
-        to the hashing trick. Embeddings for prefix, suffix and word shape
-        use half as many rows. Recommended values are between 2000 and 10000.
-    also_embed_subwords (bool): Whether to use the PREFIX, SUFFIX and SHAPE
-        features in the embeddings. If not using these, you may need more
-        rows in your hash embeddings, as there will be increased chance of
-        collisions.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    rows (int): The base number of rows for the embedding tables. Can be low, due
+        to the hashing trick. The rows can be varied per attribute by providing
+        a dictionary as the value of `attrs`.
+    attrs (dict or list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute. Attributes
+        can be specified as a list or as a dictionary, which lets you control
+        the number of rows used for each table.
+    include_static_vectors (bool): Whether to also use static word vectors.
        Requires a vectors table to be loaded in the Doc objects' vocab.
    """
-    cols = [LOWER, PREFIX, SUFFIX, SHAPE, ORTH]
+    if isinstance(attrs, dict):
+        # Exclude tables that would have 0 rows.
+ attrs = {key: value for key, value in attrs.items() if value > 0.0} + indices = {attr: i for i, attr in enumerate(attrs)} seed = 7 def make_hash_embed(feature): nonlocal seed + row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0 seed += 1 return HashEmbed( width, - rows if feature == LOWER else rows // 2, - column=cols.index(feature), + int(rows * row_factor), + column=indices[feature], seed=seed, dropout=0.0, ) - if also_embed_subwords: - embeddings = [ - make_hash_embed(LOWER), - make_hash_embed(PREFIX), - make_hash_embed(SUFFIX), - make_hash_embed(SHAPE), - ] - else: - embeddings = [make_hash_embed(LOWER)] - concat_size = width * (len(embeddings) + also_use_static_vectors) - if also_use_static_vectors: + embeddings = [make_hash_embed(attr) for attr in attrs] + concat_size = width * (len(embeddings) + include_static_vectors) + if include_static_vectors: model = chain( concatenate( chain( - FeatureExtractor(cols), + FeatureExtractor(list(attrs)), list2ragged(), with_array(concatenate(*embeddings)), ), @@ -155,7 +189,7 @@ def MultiHashEmbed( ) else: model = chain( - FeatureExtractor(cols), + FeatureExtractor(list(attrs)), list2ragged(), with_array(concatenate(*embeddings)), with_array(Maxout(width, concat_size, nP=3, dropout=0.0, normalize=True)), diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index a123f459d..3bd3b903d 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -6,6 +6,7 @@ from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder +from spacy.ml.models import MultiHashEmbed_v1 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES @@ -61,7 +62,10 @@ def get_tok2vec_kwargs(): # This actually creates models, so seems best to put it in a function. return { "embed": MultiHashEmbed( - width=32, rows=500, also_embed_subwords=True, also_use_static_vectors=False + width=32, + rows=500, + attrs=["NORM", "PREFIX", "SHAPE"], + include_static_vectors=False ), "encode": MaxoutWindowEncoder( width=32, depth=2, maxout_pieces=2, window_size=1 @@ -73,6 +77,32 @@ def test_tok2vec(): return build_Tok2Vec_model(**get_tok2vec_kwargs()) +def test_multi_hash_embed(): + embed = MultiHashEmbed( + width=32, + rows=500, + attrs=["NORM", "PREFIX", "SHAPE"], + include_static_vectors=False + ) + hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] + assert len(hash_embeds) == 3 + # Check they look at different columns. 
+ assert list(sorted(he.attrs["column"] for he in hash_embeds)) == [0, 1, 2] + # Check they use different seeds + assert len(set(he.attrs["seed"] for he in hash_embeds)) == 3 + # Check they all have the same number of rows + assert [he.get_dim("nV") for he in hash_embeds] == [500, 500, 500] + # Now try with different row factors + embed = MultiHashEmbed( + width=32, + rows=500, + attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5}, + include_static_vectors=False + ) + hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] + assert [he.get_dim("nV") for he in hash_embeds] == [1000, 50, 250] + + @pytest.mark.parametrize( "seed,model_func,kwargs", [ From f2f1deca662a197c8e605e32238bfa015851f2ad Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:24:33 +0200 Subject: [PATCH 02/16] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 06212e351..78a677acf 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -25,8 +25,8 @@ def test_empty_doc(): MultiHashEmbed( width=width, rows=embed_size, - also_use_static_vectors=False, - also_embed_subwords=True, + include_static_vectors=False, + attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3), ) From f4ca9a39cb5245da78f01d39f95efa53924ae15a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:27:06 +0200 Subject: [PATCH 03/16] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 78a677acf..df844365b 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -45,8 +45,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): MultiHashEmbed( width=width, rows=embed_size, - also_use_static_vectors=False, - also_embed_subwords=True, + include_static_vectors=False, + attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), MaxoutWindowEncoder(width=width, depth=4, window_size=1, maxout_pieces=3), ) @@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "width,embed_arch,embed_config,encode_arch,encode_config", [ - (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "also_embed_subwords": True, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], From 7d93575f35a7fb8484096b772ce71834bfd1914a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:28:12 +0200 Subject: [PATCH 
04/16] spacy/tests/ --- spacy/tests/pipeline/test_tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index df844365b..aa60faf5b 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -62,7 +62,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): "width,embed_arch,embed_config,encode_arch,encode_config", [ (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], From eb9ba61517e4e7f39b5521313e797bdbbf6740af Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 15:29:49 +0200 Subject: [PATCH 05/16] Format --- spacy/ml/models/tok2vec.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 4abc1bee6..6e5aed77b 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -103,17 +103,18 @@ def MultiHashEmbed_v1( here as a temporary compatibility.""" return MultiHashEmbed( width=width, - rows=rows, + rows=rows, attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], - include_static_vectors=also_use_static_vectors + include_static_vectors=also_use_static_vectors, ) + @registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, rows: int, attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], - include_static_vectors: bool + include_static_vectors: bool, ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical attributes using hash embedding, concatenates the results, and passes it From 90040aacec90f18d7e5a0c5f051352316f9e5cd0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:12:01 +0200 Subject: [PATCH 06/16] Fix merge --- spacy/training/augment.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index bbe164aed..685016b62 100644 --- a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -119,7 +119,6 @@ def make_orth_variants( orig_token_dict = copy.deepcopy(token_dict) ndsv = orth_variants.get("single", []) ndpv = orth_variants.get("paired", []) - logger.debug(f"Data augmentation: {len(ndsv)} single / {len(ndpv)} paired variants") words = token_dict.get("ORTH", []) tags = token_dict.get("TAG", []) # keep unmodified if words or tags are not defined From 9f1bc3f24c6c9f0412f815abe044274d3840fa23 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 16:40:23 +0200 Subject: [PATCH 07/16] Fix augment --- spacy/training/augment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/training/augment.py b/spacy/training/augment.py index 685016b62..c538f02d0 100644 --- 
a/spacy/training/augment.py +++ b/spacy/training/augment.py @@ -130,7 +130,7 @@ def make_orth_variants( for word_idx in range(len(words)): for punct_idx in range(len(ndsv)): if ( - tags[word_idx] in ndsv[punct_idx]["TAG"] + tags[word_idx] in ndsv[punct_idx]["tags"] and words[word_idx] in ndsv[punct_idx]["variants"] ): words[word_idx] = punct_choices[punct_idx] From 6dcc4a0ba63370f2b27713b5f7e86e6a8de6c825 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:57:45 +0200 Subject: [PATCH 08/16] Simplify MultiHashEmbed signature --- spacy/ml/models/tok2vec.py | 48 +++++++++++----------------- spacy/tests/pipeline/test_tok2vec.py | 16 +++++----- spacy/tests/test_models.py | 8 ++--- 3 files changed, 31 insertions(+), 41 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 6e5aed77b..f0e846bac 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -55,13 +55,15 @@ def build_hash_embed_cnn_tok2vec( pretrained_vectors (bool): Whether to also use static vectors. """ if subword_features: - attrs = {"NORM": 1.0, "PREFIX": 0.5, "SUFFIX": 0.5, "SHAPE": 0.5} + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + row_sizes = [embed_size, embed_size//2, embed_size//2, embed_size//2] else: - attrs = {"NORM": 1.0} + attrs = ["NORM"] + row_sizes = [embed_size] return build_Tok2Vec_model( embed=MultiHashEmbed( width=width, - rows=embed_size, + rows=row_sizes, attrs=attrs, include_static_vectors=bool(pretrained_vectors), ), @@ -103,7 +105,7 @@ def MultiHashEmbed_v1( here as a temporary compatibility.""" return MultiHashEmbed( width=width, - rows=rows, + rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows], attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], include_static_vectors=also_use_static_vectors, ) @@ -112,8 +114,8 @@ def MultiHashEmbed_v1( @registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, - rows: int, - attrs: Union[List[Union[str, int]], Dict[Union[str, int], float]], + attrs: List[Union[str, int]], + rows: List[int], include_static_vectors: bool, ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedding layer that separately embeds a number of lexical @@ -136,50 +138,38 @@ def MultiHashEmbed( The `rows` parameter controls the number of rows used by the `HashEmbed` tables. The HashEmbed layer needs surprisingly few rows, due to its use of the hashing trick. Generally between 2000 and 10000 rows is sufficient, - even for very large vocabularies. You can vary the number of rows per - attribute by specifying the attrs as a dict, mapping the keys to float - values which are interpreted as factors of `rows`. For instance, - attrs={"NORM": 1.0, PREFIX: 0.2} will use rows*1 for the NORM table and - rows*0.2 for the PREFIX table. If `attrs` is a list, factors of 1.0 are - assumed for all attributes. + even for very large vocabularies. A number of rows must be specified for each + table, so the `rows` list must be of the same length as the `attrs` parameter. width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. - rows (int): The base number of rows for the embedding tables. Can be low, due - to the hashing trick. The rows can be varied per attribute by providing - a dictionary as the value of `attrs`. - attrs (dict or list of attr IDs): The token attributes to embed. A separate - embedding table will be constructed for each attribute. 
Attributes - can be specified as a list or as a dictionary, which lets you control - the number of rows used for each table. + attrs (list of attr IDs): The token attributes to embed. A separate + embedding table will be constructed for each attribute. + rows (List[int]): The number of rows in the embedding tables. Must have the + same length as attrs. include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ - if isinstance(attrs, dict): - # Exclude tables that would have 0 rows. - attrs = {key: value for key, value in attrs.items() if value > 0.0} - indices = {attr: i for i, attr in enumerate(attrs)} seed = 7 - def make_hash_embed(feature): + def make_hash_embed(index): nonlocal seed - row_factor = attrs[feature] if isinstance(attrs, dict) else 1.0 seed += 1 return HashEmbed( width, - int(rows * row_factor), - column=indices[feature], + rows[index], + column=index, seed=seed, dropout=0.0, ) - embeddings = [make_hash_embed(attr) for attr in attrs] + embeddings = [make_hash_embed(i) for i in range(len(attrs))] concat_size = width * (len(embeddings) + include_static_vectors) if include_static_vectors: model = chain( concatenate( chain( - FeatureExtractor(list(attrs)), + FeatureExtractor(attrs), list2ragged(), with_array(concatenate(*embeddings)), ), diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index aa60faf5b..e86d97a54 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -24,7 +24,7 @@ def test_empty_doc(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=width, - rows=embed_size, + rows=[embed_size, embed_size, embed_size, embed_size], include_static_vectors=False, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), @@ -44,7 +44,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=width, - rows=embed_size, + rows=[embed_size] * 4, include_static_vectors=False, attrs=["NORM", "PREFIX", "SUFFIX", "SHAPE"], ), @@ -61,8 +61,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "width,embed_arch,embed_config,encode_arch,encode_config", [ - (8, MultiHashEmbed, {"rows": 100, "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), - (8, MultiHashEmbed, {"rows": 100, "attrs": {"ORTH": 1.0, "PREFIX": 0.2}, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), + (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), + (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], @@ -116,11 +116,11 @@ cfg_string = """ @architectures = "spacy.Tok2Vec.v1" [components.tok2vec.model.embed] - @architectures = "spacy.MultiHashEmbed.v1" + @architectures = "spacy.MultiHashEmbed.v2" width = ${components.tok2vec.model.encode.width} - rows = 2000 - also_embed_subwords = true - also_use_static_vectors = false + rows = 
[2000, 1000, 1000, 1000] + attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] + include_static_vectors = false [components.tok2vec.model.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index 3bd3b903d..d621be0ba 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -63,7 +63,7 @@ def get_tok2vec_kwargs(): return { "embed": MultiHashEmbed( width=32, - rows=500, + rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ), @@ -80,7 +80,7 @@ def test_tok2vec(): def test_multi_hash_embed(): embed = MultiHashEmbed( width=32, - rows=500, + rows=[500, 500, 500], attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ) @@ -95,8 +95,8 @@ def test_multi_hash_embed(): # Now try with different row factors embed = MultiHashEmbed( width=32, - rows=500, - attrs={"NORM": 2.0, "PREFIX": 0.1, "SHAPE": 0.5}, + rows=[1000, 50, 250], + attrs=["NORM", "PREFIX", "SHAPE"], include_static_vectors=False ) hash_embeds = [node for node in embed.walk() if node.name == "hashembed"] From cdd2b79b6d2a87db04f59d478dfa0fd8c2d3abdb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:58:18 +0200 Subject: [PATCH 09/16] Remove deprecated MultiHashEmbed --- spacy/ml/models/tok2vec.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index f0e846bac..3a7da4a8e 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -98,20 +98,6 @@ def build_Tok2Vec_model( @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed_v1( - width: int, rows: int, also_embed_subwords: bool, also_use_static_vectors: bool -) -> Model[List[Doc], List[Floats2d]]: - """Previous interface for MultiHashEmbed. 
This should be removed, it's only - here as a temporary compatibility.""" - return MultiHashEmbed( - width=width, - rows=[rows, rows//2, rows//2, rows//2] if also_embed_subwords else [rows], - attrs=[NORM, PREFIX, SUFFIX, SHAPE] if also_embed_subwords else [NORM], - include_static_vectors=also_use_static_vectors, - ) - - -@registry.architectures.register("spacy.MultiHashEmbed.v2") def MultiHashEmbed( width: int, attrs: List[Union[str, int]], From db84d175c3e5d661f9358b6d8b85cd2fe9316392 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 19:59:30 +0200 Subject: [PATCH 10/16] Fix test --- spacy/tests/pipeline/test_tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index e86d97a54..90882ae3f 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -116,7 +116,7 @@ cfg_string = """ @architectures = "spacy.Tok2Vec.v1" [components.tok2vec.model.embed] - @architectures = "spacy.MultiHashEmbed.v2" + @architectures = "spacy.MultiHashEmbed.v1" width = ${components.tok2vec.model.encode.width} rows = [2000, 1000, 1000, 1000] attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] From e50047f1c5e9949894bbba0a3183295fc79f2f2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:02:45 +0200 Subject: [PATCH 11/16] Check lengths match --- spacy/ml/models/tok2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 3a7da4a8e..65d2bffbb 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -136,6 +136,8 @@ def MultiHashEmbed( include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ + if len(rows) != len(attrs): + raise ValueError(f"Mismatched lengths: {len(rows)} vs {len(attrs)}") seed = 7 def make_hash_embed(index): From b392d48e7667b95d820bf120dae4ab4a719af497 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:17:07 +0200 Subject: [PATCH 12/16] Fix test --- spacy/tests/test_models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/test_models.py b/spacy/tests/test_models.py index d621be0ba..bad964786 100644 --- a/spacy/tests/test_models.py +++ b/spacy/tests/test_models.py @@ -6,7 +6,6 @@ from numpy.testing import assert_array_equal import numpy from spacy.ml.models import build_Tok2Vec_model, MultiHashEmbed, MaxoutWindowEncoder -from spacy.ml.models import MultiHashEmbed_v1 from spacy.ml.models import build_text_classifier, build_simple_cnn_text_classifier from spacy.lang.en import English from spacy.lang.en.examples import sentences as EN_SENTENCES From 919790cb47b408c827e4cb40a1c6d3343fe0a28f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 20:28:21 +0200 Subject: [PATCH 13/16] Upd MultiHashEmbed docs --- website/docs/api/architectures.md | 51 +++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 5cee45ba5..cea390bb1 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -136,25 +136,50 @@ argument that connects to the shared `tok2vec` component in the pipeline. 
> [model]
> @architectures = "spacy.MultiHashEmbed.v1"
> width = 64
-> rows = 2000
-> also_embed_subwords = false
-> also_use_static_vectors = false
+> attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+> rows = [2000, 1000, 1000, 1000]
+> include_static_vectors = true
> ```

Construct an embedding layer that separately embeds a number of lexical
-attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build mixed representations. The features used are
-the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, and they are extracted with a
-[FeatureExtractor](/api/architectures#FeatureExtractor) layer. Vectors from pretrained static
-vectors can also be incorporated into the concatenated representation.
+attributes using hash embedding, concatenates the results, and passes it
+through a feed-forward subnetwork to build a mixed representation.
+
+The features used can be configured with the `attrs` argument. The suggested
+attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
+account some subword information, without constructing a fully character-based
+representation. If pretrained vectors are available, they can be included in
+the representation as well, though the vectors table is kept static
+(i.e. it's not updated).
+
+The `width` parameter specifies the output width of the layer and the widths
+of all embedding tables. If static vectors are included, a learned linear
+layer is used to map the vectors to the specified width before concatenating
+it with the other embedding outputs. A single Maxout layer is then used to
+reduce the concatenated vectors to the final width.
+
+The `rows` parameter controls the number of rows used by the `HashEmbed`
+tables. The HashEmbed layer needs surprisingly few rows, due to its use of
+the hashing trick. Generally between 2000 and 10000 rows is sufficient,
+even for very large vocabularies. A number of rows must be specified for each
+table, so the `rows` list must be of the same length as the `attrs` parameter.
+
+    attrs (list of attr IDs): The token attributes to embed. A separate
+        embedding table will be constructed for each attribute.
+    rows (List[int]): The number of rows in the embedding tables. Must have the
+        same length as attrs.
+    include_static_vectors (bool): Whether to also use static word vectors.
+        Requires a vectors table to be loaded in the Doc objects' vocab.
+

| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
| **CREATES** | The model using the architecture. 
~~Model[List[Doc], List[Floats2d]]~~ | +| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ | +| `attrs` | The token attributes to embed. A separate | +embedding table will be constructed for each attribute. ~~List[Union[int, str]]~~ | +| `rows` | The number of rows for each embedding tables. Can be low, due to the hashing trick. Recommended values are between `1000` and `10000`. ~~List[int]~~ | +| `include_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ | +| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ | ### spacy.CharacterEmbed.v1 {#CharacterEmbed} From 91d0fbb58821fcecf4b4af3d2bb32d12b490c565 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:13:53 +0200 Subject: [PATCH 14/16] Fix test --- spacy/tests/serialize/test_serialize_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index da048f3d6..8b3f5c2b8 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -89,9 +89,9 @@ def my_parser(): tok2vec = build_Tok2Vec_model( MultiHashEmbed( width=321, - rows=5432, - also_embed_subwords=True, - also_use_static_vectors=False, + attrs=["LOWER", "SHAPE"], + rows=[5432, 5432], + include_static_vectors=False, ), MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2), ) From ff8b9807750e045f40c9a40208eba8c575c714cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:19:41 +0200 Subject: [PATCH 15/16] Upd quickstart template --- spacy/cli/templates/quickstart_training.jinja | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 3bd237b0a..c3419e67d 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -171,8 +171,13 @@ factory = "tok2vec" [components.tok2vec.model.embed] @architectures = "spacy.MultiHashEmbed.v1" width = ${components.tok2vec.model.encode.width} -rows = {{ 2000 if optimize == "efficiency" else 7000 }} -also_embed_subwords = {{ "true" if has_letters else "false" }} +{% if has_letters -%} +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +rows = [5000, 2500, 2500, 2500] +{% else -%} +attrs = ["ORTH", "SHAPE"] +rows = [5000, 2500] +{% endif -%} also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] From b7e01d20246efbeeb1c6f9babbb08ac965a45582 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 5 Oct 2020 21:21:30 +0200 Subject: [PATCH 16/16] Fix quickstart --- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index c3419e67d..d92de9c15 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -178,7 +178,7 @@ rows = [5000, 2500, 2500, 2500] attrs = ["ORTH", "SHAPE"] rows = [5000, 2500] {% endif -%} -also_use_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} +include_static_vectors = {{ "true" if optimize == "accuracy" else "false" }} [components.tok2vec.model.encode] 
@architectures = "spacy.MaxoutWindowEncoder.v1"