From eac12cbb773912d274a2e4eb5090b8fe89992ef4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jun 2020 11:50:16 +0200 Subject: [PATCH] make dropout in embed layers configurable --- spacy/ml/models/textcat.py | 17 ++++---- spacy/ml/models/tok2vec.py | 40 ++++++++++--------- .../defaults/entity_linker_defaults.cfg | 1 + .../defaults/morphologizer_defaults.cfg | 1 + spacy/pipeline/defaults/ner_defaults.cfg | 1 + spacy/pipeline/defaults/parser_defaults.cfg | 1 + spacy/pipeline/defaults/senter_defaults.cfg | 1 + .../pipeline/defaults/simple_ner_defaults.cfg | 1 + spacy/pipeline/defaults/tagger_defaults.cfg | 1 + .../defaults/textcat_cnn_defaults.cfg | 1 + spacy/pipeline/defaults/textcat_defaults.cfg | 1 + spacy/pipeline/defaults/tok2vec_defaults.cfg | 1 + spacy/tests/pipeline/test_textcat.py | 6 +-- .../tests/serialize/test_serialize_config.py | 3 ++ spacy/tests/test_tok2vec.py | 19 ++++----- 15 files changed, 57 insertions(+), 38 deletions(-) diff --git a/spacy/ml/models/textcat.py b/spacy/ml/models/textcat.py index ce31d058c..141c66f79 100644 --- a/spacy/ml/models/textcat.py +++ b/spacy/ml/models/textcat.py @@ -49,13 +49,13 @@ def build_bow_text_classifier(exclusive_classes, ngram_size, no_output_layer, nO @registry.architectures.register("spacy.TextCat.v1") def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_classes, ngram_size, - window_size, conv_depth, nO=None): + window_size, conv_depth, dropout, nO=None): cols = [ORTH, LOWER, PREFIX, SUFFIX, SHAPE, ID] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER)) - prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE)) + lower = HashEmbed(nO=width, nV=embed_size, column=cols.index(LOWER), dropout=dropout) + prefix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width // 2, nV=embed_size, column=cols.index(SHAPE), dropout=dropout) width_nI = sum(layer.get_dim("nO") for layer in [lower, prefix, suffix, shape]) trained_vectors = FeatureExtractor(cols) >> with_array( @@ -114,7 +114,7 @@ def build_text_classifier(width, embed_size, pretrained_vectors, exclusive_class @registry.architectures.register("spacy.TextCatLowData.v1") -def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): +def build_text_classifier_lowdata(width, pretrained_vectors, dropout, nO=None): nlp = util.load_model(pretrained_vectors) vectors = nlp.vocab.vectors vector_dim = vectors.data.shape[1] @@ -129,7 +129,8 @@ def build_text_classifier_lowdata(width, pretrained_vectors, nO=None): >> reduce_sum() >> residual(Relu(width, width)) ** 2 >> Linear(nO, width) - >> Dropout(0.0) - >> Logistic() ) + if dropout: + model = model >> Dropout(dropout) + model = model >> Logistic() return model diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index a2e8f589a..53798e57c 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -49,6 +49,7 @@ def hash_embed_cnn( maxout_pieces, window_size, subword_features, + dropout, ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -63,6 +64,7 @@ def hash_embed_cnn( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @@ -76,6 +78,7 @@ def hash_charembed_cnn( window_size, nM, nC, + dropout, ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -90,12 +93,13 @@ def hash_charembed_cnn( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @registry.architectures.register("spacy.HashEmbedBiLSTM.v1") def hash_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces + pretrained_vectors, width, depth, embed_size, subword_features, maxout_pieces, dropout ): # Does not use character embeddings: set to False by default return build_Tok2Vec_model( @@ -110,12 +114,13 @@ def hash_embed_bilstm_v1( char_embed=False, nM=0, nC=0, + dropout=dropout, ) @registry.architectures.register("spacy.HashCharEmbedBiLSTM.v1") def hash_char_embed_bilstm_v1( - pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC + pretrained_vectors, width, depth, embed_size, maxout_pieces, nM, nC, dropout ): # Allows using character embeddings by setting nC, nM and char_embed=True return build_Tok2Vec_model( @@ -130,6 +135,7 @@ def hash_char_embed_bilstm_v1( char_embed=True, nM=nM, nC=nC, + dropout=dropout, ) @@ -144,19 +150,19 @@ def LayerNormalizedMaxout(width, maxout_pieces): @registry.architectures.register("spacy.MultiHashEmbed.v1") -def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) if use_subwords: - prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX")) - suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX")) - shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE")) + prefix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("PREFIX"), dropout=dropout) + suffix = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SUFFIX"), dropout=dropout) + shape = HashEmbed(nO=width, nV=rows // 2, column=columns.index("SHAPE"), dropout=dropout) if pretrained_vectors: glove = StaticVectors( vectors=pretrained_vectors.data, nO=width, column=columns.index(ID), - dropout=0.0, + dropout=dropout, ) with Model.define_operators({">>": chain, "|": concatenate}): @@ -164,13 +170,10 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): embed_layer = norm else: if use_subwords and pretrained_vectors: - nr_columns = 5 concat_columns = glove | norm | prefix | suffix | shape elif use_subwords: - nr_columns = 4 concat_columns = norm | prefix | suffix | shape else: - nr_columns = 2 concat_columns = glove | norm embed_layer = uniqued(concat_columns >> mix, column=columns.index("ORTH")) @@ -179,8 +182,8 @@ def MultiHashEmbed(columns, width, rows, use_subwords, pretrained_vectors, mix): @registry.architectures.register("spacy.CharacterEmbed.v1") -def CharacterEmbed(columns, width, rows, nM, nC, features): - norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM")) +def CharacterEmbed(columns, width, rows, nM, nC, features, dropout): + norm = HashEmbed(nO=width, nV=rows, column=columns.index("NORM"), dropout=dropout) chr_embed = _character_embed.CharacterEmbed(nM=nM, nC=nC) with Model.define_operators({">>": chain, "|": concatenate}): embed_layer = chr_embed | features >> with_array(norm) @@ -238,16 +241,17 @@ def build_Tok2Vec_model( nC, conv_depth, bilstm_depth, + dropout, ) -> Model: if char_embed: subword_features = False cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM)) + norm = HashEmbed(nO=width, nV=embed_size, column=cols.index(NORM), dropout=dropout) if subword_features: - prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX)) - suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX)) - shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE)) + prefix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(PREFIX), dropout=dropout) + suffix = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SUFFIX), dropout=dropout) + shape = HashEmbed(nO=width, nV=embed_size // 2, column=cols.index(SHAPE), dropout=dropout) else: prefix, suffix, shape = (None, None, None) if pretrained_vectors is not None: @@ -255,7 +259,7 @@ def build_Tok2Vec_model( vectors=pretrained_vectors.data, nO=width, column=cols.index(ID), - dropout=0.0, + dropout=dropout, ) if subword_features: diff --git a/spacy/pipeline/defaults/entity_linker_defaults.cfg b/spacy/pipeline/defaults/entity_linker_defaults.cfg index 6a591ec3e..26a294f37 100644 --- a/spacy/pipeline/defaults/entity_linker_defaults.cfg +++ b/spacy/pipeline/defaults/entity_linker_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 300 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/morphologizer_defaults.cfg b/spacy/pipeline/defaults/morphologizer_defaults.cfg index 150eca507..c4452c689 100644 --- a/spacy/pipeline/defaults/morphologizer_defaults.cfg +++ b/spacy/pipeline/defaults/morphologizer_defaults.cfg @@ -11,3 +11,4 @@ window_size = 1 maxout_pieces = 3 nM = 64 nC = 8 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/ner_defaults.cfg b/spacy/pipeline/defaults/ner_defaults.cfg index db2c131f5..eb926c43b 100644 --- a/spacy/pipeline/defaults/ner_defaults.cfg +++ b/spacy/pipeline/defaults/ner_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/parser_defaults.cfg b/spacy/pipeline/defaults/parser_defaults.cfg index 9cbb6eadb..6fe0fd7cb 100644 --- a/spacy/pipeline/defaults/parser_defaults.cfg +++ b/spacy/pipeline/defaults/parser_defaults.cfg @@ -13,3 +13,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/senter_defaults.cfg b/spacy/pipeline/defaults/senter_defaults.cfg index ffa2c6ce2..304e42b01 100644 --- a/spacy/pipeline/defaults/senter_defaults.cfg +++ b/spacy/pipeline/defaults/senter_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 2 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/simple_ner_defaults.cfg b/spacy/pipeline/defaults/simple_ner_defaults.cfg index 4e3b640df..7f206a636 100644 --- a/spacy/pipeline/defaults/simple_ner_defaults.cfg +++ b/spacy/pipeline/defaults/simple_ner_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 7000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/tagger_defaults.cfg b/spacy/pipeline/defaults/tagger_defaults.cfg index 5aea80a32..f26c5f099 100644 --- a/spacy/pipeline/defaults/tagger_defaults.cfg +++ b/spacy/pipeline/defaults/tagger_defaults.cfg @@ -10,3 +10,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg index cea1bfe54..91f3a1742 100644 --- a/spacy/pipeline/defaults/textcat_cnn_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_cnn_defaults.cfg @@ -11,3 +11,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null diff --git a/spacy/pipeline/defaults/textcat_defaults.cfg b/spacy/pipeline/defaults/textcat_defaults.cfg index 9477b2995..e5817de4a 100644 --- a/spacy/pipeline/defaults/textcat_defaults.cfg +++ b/spacy/pipeline/defaults/textcat_defaults.cfg @@ -7,3 +7,4 @@ conv_depth = 2 embed_size = 2000 window_size = 1 ngram_size = 1 +dropout = null \ No newline at end of file diff --git a/spacy/pipeline/defaults/tok2vec_defaults.cfg b/spacy/pipeline/defaults/tok2vec_defaults.cfg index 9475d4aab..36bf0c3da 100644 --- a/spacy/pipeline/defaults/tok2vec_defaults.cfg +++ b/spacy/pipeline/defaults/tok2vec_defaults.cfg @@ -7,3 +7,4 @@ embed_size = 2000 window_size = 1 maxout_pieces = 3 subword_features = true +dropout = null \ No newline at end of file diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 725a4fd69..179659597 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -123,9 +123,9 @@ def test_overfitting_IO(): {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 4, "no_output_layer": False}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": False, "ngram_size": 3, "no_output_layer": True}, {"@architectures": "spacy.TextCatBOW.v1", "exclusive_classes": True, "ngram_size": 2, "no_output_layer": True}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1}, - {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": False, "ngram_size": 1, "pretrained_vectors": False, "width": 64, "conv_depth": 2, "embed_size": 2000, "window_size": 2, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 5, "pretrained_vectors": False, "width": 128, "conv_depth": 2, "embed_size": 2000, "window_size": 1, "dropout": None}, + {"@architectures": "spacy.TextCat.v1", "exclusive_classes": True, "ngram_size": 2, "pretrained_vectors": False, "width": 32, "conv_depth": 3, "embed_size": 500, "window_size": 3, "dropout": None}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": True}, {"@architectures": "spacy.TextCatCNN.v1", "tok2vec": default_tok2vec(), "exclusive_classes": False}, ], diff --git a/spacy/tests/serialize/test_serialize_config.py b/spacy/tests/serialize/test_serialize_config.py index ba63adfa4..870a980f2 100644 --- a/spacy/tests/serialize/test_serialize_config.py +++ b/spacy/tests/serialize/test_serialize_config.py @@ -24,6 +24,7 @@ window_size = 1 embed_size = 2000 maxout_pieces = 3 subword_features = true +dropout = null [nlp.pipeline.tagger] factory = "tagger" @@ -53,6 +54,7 @@ embed_size = 5555 window_size = 1 maxout_pieces = 7 subword_features = false +dropout = null """ @@ -70,6 +72,7 @@ def my_parser(): nC=8, conv_depth=2, bilstm_depth=0, + dropout=None, ) parser = build_tb_parser_model( tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5 diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index 9c2e9004b..ee1f9dead 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -15,7 +15,7 @@ def test_empty_doc(): vocab = Vocab() doc = Doc(vocab, words=[]) # TODO: fix tok2vec arguments - tok2vec = build_Tok2Vec_model(width, embed_size) + tok2vec = build_Tok2Vec_model(width, embed_size, dropout=None) vectors, backprop = tok2vec.begin_update([doc]) assert len(vectors) == 1 assert vectors[0].shape == (0, width) @@ -38,6 +38,7 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): char_embed=False, nM=64, nC=8, + dropout=None, ) tok2vec.initialize() vectors, backprop = tok2vec.begin_update(batch) @@ -50,14 +51,14 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size): @pytest.mark.parametrize( "tok2vec_config", [ - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, - {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 6, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": True, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 1, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": False, "nM": 64, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 8, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, + {"width": 8, "embed_size": 100, "char_embed": True, "nM": 81, "nC": 9, "pretrained_vectors": None, "window_size": 3, "conv_depth": 2, "bilstm_depth": 0, "maxout_pieces": 3, "subword_features": False, "dropout": None}, ], ) # fmt: on