From 39aabf50ab23f4cadef5d5b459436a988f9fe677 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 11:54:48 +0200 Subject: [PATCH 1/2] Also rename to include_static_vectors in CharEmbed --- spacy/ml/models/tok2vec.py | 6 +++--- spacy/pipeline/morphologizer.pyx | 2 +- spacy/tests/pipeline/test_tok2vec.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 23cfe883b..6ef7b2325 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -177,7 +177,7 @@ def CharacterEmbed( rows: int, nM: int, nC: int, - also_use_static_vectors: bool, + include_static_vectors: bool, feature: Union[int, str] = "LOWER", ) -> Model[List[Doc], List[Floats2d]]: """Construct an embedded representation based on character embeddings, using @@ -204,13 +204,13 @@ def CharacterEmbed( nC (int): The number of UTF-8 bytes to embed per word. Recommended values are between 3 and 8, although it may depend on the length of words in the language. - also_use_static_vectors (bool): Whether to also use static word vectors. + include_static_vectors (bool): Whether to also use static word vectors. Requires a vectors table to be loaded in the Doc objects' vocab. """ feature = intify_attr(feature) if feature is None: raise ValueError(Errors.E911(feat=feature)) - if also_use_static_vectors: + if include_static_vectors: model = chain( concatenate( chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()), diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index a456b7a0f..00188a762 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -32,7 +32,7 @@ width = 128 rows = 7000 nM = 64 nC = 8 -also_use_static_vectors = false +include_static_vectors = false [model.tok2vec.encode] @architectures = "spacy.MaxoutWindowEncoder.v1" diff --git a/spacy/tests/pipeline/test_tok2vec.py b/spacy/tests/pipeline/test_tok2vec.py index 90882ae3f..ec4ed17dd 100644 --- a/spacy/tests/pipeline/test_tok2vec.py +++ b/spacy/tests/pipeline/test_tok2vec.py @@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): [ (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}), (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}), - (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), - (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}), + (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}), ], ) # fmt: on From 2dd79454af73cb07d07ac1b9ad12644736e96bd5 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 9 Oct 2020 14:42:07 +0200 Subject: [PATCH 2/2] Update docs --- website/docs/usage/embeddings-transformers.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/website/docs/usage/embeddings-transformers.md b/website/docs/usage/embeddings-transformers.md index 549c3bcc4..942fc4e7b 100644 --- a/website/docs/usage/embeddings-transformers.md +++ b/website/docs/usage/embeddings-transformers.md @@ -514,7 +514,7 @@ Many neural network models are able to use word vector tables as additional features, which sometimes results in significant improvements in accuracy. spaCy's built-in embedding layer, [MultiHashEmbed](/api/architectures#MultiHashEmbed), can be configured to use -word vector tables using the `also_use_static_vectors` flag. This setting is +word vector tables using the `include_static_vectors` flag. This setting is also available on the [MultiHashEmbedCNN](/api/architectures#MultiHashEmbedCNN) layer, which builds the default token-to-vector encoding architecture. @@ -522,9 +522,9 @@ layer, which builds the default token-to-vector encoding architecture. [tagger.model.tok2vec.embed] @architectures = "spacy.MultiHashEmbed.v1" width = 128 -rows = 7000 -also_embed_subwords = true -also_use_static_vectors = true +attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = true ```