From a52d466bfcb13d4e15cd0cba945b7862209c2cee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:50:55 +0100 Subject: [PATCH 1/6] any instead of all --- spacy/pipeline/tok2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 61ba498c9..4a396eaeb 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -291,7 +291,7 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): # of data. # When the components batch differently, we don't receive a matching # prediction from the upstream, so we can't predict. - if not all(doc.tensor.size for doc in inputs): + if not any(doc.tensor.size for doc in inputs): # But we do need to do *something* if the tensor hasn't been set. # The compromise is to at least return data of the right shape, # so the output is valid. From ebeedfc70ba3f50cae9ecf98224de125cc6fa51a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:56:48 +0100 Subject: [PATCH 2/6] regression test for 7029 --- pretrain.cfg | 218 +++++++++++++++++++++++ pretrain_gpu.cfg | 217 ++++++++++++++++++++++ spacy/tests/regression/test_issue7029.py | 71 ++++++++ 3 files changed, 506 insertions(+) create mode 100644 pretrain.cfg create mode 100644 pretrain_gpu.cfg create mode 100644 spacy/tests/regression/test_issue7029.py diff --git a/pretrain.cfg b/pretrain.cfg new file mode 100644 index 000000000..50bd72350 --- /dev/null +++ b/pretrain.cfg @@ -0,0 +1,218 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = null +seed = 0 + +[nlp] +lang = "en" +pipeline = ["tok2vec","tagger","parser","ner"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + 
+[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 2000 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/pretrain_gpu.cfg b/pretrain_gpu.cfg new file mode 100644 index 000000000..6f9c9195d --- /dev/null +++ b/pretrain_gpu.cfg @@ -0,0 +1,217 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["transformer","tagger","parser","ner"] +batch_size = 128 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = false +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 
3 +use_upper = false +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "roberta-base" + +[components.transformer.model.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[components.transformer.model.tokenizer_config] +use_fast = true + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 500 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +accumulate_gradient = 3 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +size = 2000 +buffer = 256 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py new file mode 100644 index 000000000..4033bb725 --- /dev/null +++ b/spacy/tests/regression/test_issue7029.py @@ -0,0 +1,71 @@ +from spacy.lang.en import English +from spacy.training import Example +from 
spacy.util import load_config_from_str + + +CONFIG = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch. + """ + nlp = English.from_config(load_config_from_str(CONFIG)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "thrid", "fourth", "and", "then", "some", ""] + nlp.select_pipes(enable=["tok2vec", "tagger"]) + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]] From 967df5901d573835cbd5a341a1e30eeee5b8121d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 20:57:43 +0100 Subject: [PATCH 3/6] cleanup --- pretrain.cfg | 218 ----------------------------------------------- pretrain_gpu.cfg | 217 ---------------------------------------------- 2 files changed, 435 deletions(-) delete mode 100644 pretrain.cfg delete mode 100644 pretrain_gpu.cfg diff --git a/pretrain.cfg b/pretrain.cfg deleted file mode 100644 index 50bd72350..000000000 --- a/pretrain.cfg +++ /dev/null @@ -1,218 +0,0 @@ -[paths] -train = null -dev = null -vectors = null -init_tok2vec = null -raw_text = null - -[system] -gpu_allocator = null -seed = 0 - -[nlp] -lang = "en" -pipeline = ["tok2vec","tagger","parser","ner"] -batch_size = 1000 -disabled = [] -before_creation = null -after_creation = null -after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} - -[components] - -[components.ner] -factory = "ner" -moves = null -update_with_oracle_cut_size = 100 - -[components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "ner" -extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 -use_upper = true -nO = null - -[components.ner.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 30 -moves = null -update_with_oracle_cut_size = 100 - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 128 -maxout_pieces = 3 -use_upper = true -nO = null - -[components.parser.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" 
-width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy.Tok2VecListener.v1" -width = ${components.tok2vec.model.encode.width} -upstream = "*" - -[components.tok2vec] -factory = "tok2vec" - -[components.tok2vec.model] -@architectures = "spacy.Tok2Vec.v2" - -[components.tok2vec.model.embed] -@architectures = "spacy.MultiHashEmbed.v1" -width = ${components.tok2vec.model.encode.width} -attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] -rows = [5000,2500,2500,2500] -include_static_vectors = false - -[components.tok2vec.model.encode] -@architectures = "spacy.MaxoutWindowEncoder.v2" -width = 96 -depth = 4 -window_size = 1 -maxout_pieces = 3 - -[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[corpora.pretrain] -@readers = "spacy.JsonlCorpus.v1" -path = ${paths.raw_text} -min_length = 5 -max_length = 500 -limit = 0 - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = 2000 -gold_preproc = false -limit = 0 -augmenter = null - -[training] -dev_corpus = "corpora.dev" -train_corpus = "corpora.train" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 -accumulate_gradient = 1 -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 200 -frozen_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_words.v1" -discard_oversize = false -tolerance = 0.2 -get_length = null - -[training.batcher.size] -@schedules = "compounding.v1" -start = 100 -stop = 1000 -compound = 1.001 -t = 0.0 - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 0.00000001 -learn_rate = 0.001 - -[training.score_weights] -dep_las_per_type = null -sents_p = null -sents_r = null -ents_per_type = null -tag_acc = 0.33 -dep_uas = 0.17 -dep_las = 0.17 -sents_f = 0.0 -ents_f = 0.33 -ents_p = 0.0 -ents_r = 0.0 - -[pretraining] -max_epochs = 1000 -dropout = 0.2 -n_save_every = null -component = "tok2vec" -layer = "" -corpus = "corpora.pretrain" - -[pretraining.batcher] -@batchers = "spacy.batch_by_words.v1" -size = 3000 -discard_oversize = false -tolerance = 0.2 -get_length = null - -[pretraining.objective] -@architectures = "spacy.PretrainCharacters.v1" -maxout_pieces = 3 -hidden_size = 300 -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 0.00000001 -learn_rate = 0.001 - -[initialize] -vectors = null -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file diff --git a/pretrain_gpu.cfg b/pretrain_gpu.cfg deleted file mode 100644 index 6f9c9195d..000000000 --- a/pretrain_gpu.cfg +++ /dev/null @@ -1,217 +0,0 @@ -[paths] -train = null -dev = null -vectors = null -init_tok2vec = null -raw_text = null - -[system] -gpu_allocator = "pytorch" -seed = 0 - -[nlp] -lang = "en" -pipeline = ["transformer","tagger","parser","ner"] -batch_size = 128 -disabled = [] -before_creation = null -after_creation = null 
-after_pipeline_creation = null -tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} - -[components] - -[components.ner] -factory = "ner" -moves = null -update_with_oracle_cut_size = 100 - -[components.ner.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "ner" -extra_state_tokens = false -hidden_width = 64 -maxout_pieces = 2 -use_upper = false -nO = null - -[components.ner.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.parser] -factory = "parser" -learn_tokens = false -min_action_freq = 30 -moves = null -update_with_oracle_cut_size = 100 - -[components.parser.model] -@architectures = "spacy.TransitionBasedParser.v2" -state_type = "parser" -extra_state_tokens = false -hidden_width = 128 -maxout_pieces = 3 -use_upper = false -nO = null - -[components.parser.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.tagger] -factory = "tagger" - -[components.tagger.model] -@architectures = "spacy.Tagger.v1" -nO = null - -[components.tagger.model.tok2vec] -@architectures = "spacy-transformers.TransformerListener.v1" -grad_factor = 1.0 -pooling = {"@layers":"reduce_mean.v1"} -upstream = "*" - -[components.transformer] -factory = "transformer" -max_batch_items = 4096 -set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} - -[components.transformer.model] -@architectures = "spacy-transformers.TransformerModel.v1" -name = "roberta-base" - -[components.transformer.model.get_spans] -@span_getters = "spacy-transformers.strided_spans.v1" -window = 128 -stride = 96 - -[components.transformer.model.tokenizer_config] -use_fast = true - -[corpora] - -[corpora.dev] -@readers = "spacy.Corpus.v1" -path = ${paths.dev} -max_length = 0 -gold_preproc = false -limit = 0 -augmenter = null - -[corpora.pretrain] -@readers = "spacy.JsonlCorpus.v1" -path = ${paths.raw_text} -min_length = 5 -max_length = 500 -limit = 0 - -[corpora.train] -@readers = "spacy.Corpus.v1" -path = ${paths.train} -max_length = 500 -gold_preproc = false -limit = 0 -augmenter = null - -[training] -accumulate_gradient = 3 -dev_corpus = "corpora.dev" -train_corpus = "corpora.train" -seed = ${system.seed} -gpu_allocator = ${system.gpu_allocator} -dropout = 0.1 -patience = 1600 -max_epochs = 0 -max_steps = 20000 -eval_frequency = 200 -frozen_components = [] -before_to_disk = null - -[training.batcher] -@batchers = "spacy.batch_by_padded.v1" -discard_oversize = true -size = 2000 -buffer = 256 -get_length = null - -[training.logger] -@loggers = "spacy.ConsoleLogger.v1" -progress_bar = false - -[training.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = false -eps = 0.00000001 - -[training.optimizer.learn_rate] -@schedules = "warmup_linear.v1" -warmup_steps = 250 -total_steps = 20000 -initial_rate = 0.00005 - -[training.score_weights] -dep_las_per_type = null -sents_p = null -sents_r = null -ents_per_type = null -tag_acc = 0.33 -dep_uas = 0.17 -dep_las = 0.17 -sents_f = 0.0 -ents_f = 0.33 -ents_p = 0.0 -ents_r = 0.0 - -[pretraining] -max_epochs = 1000 -dropout = 0.2 -n_save_every = null -component = "tok2vec" -layer = "" -corpus = "corpora.pretrain" - -[pretraining.batcher] -@batchers = "spacy.batch_by_words.v1" -size = 3000 -discard_oversize = false -tolerance = 0.2 -get_length = 
null - -[pretraining.objective] -@architectures = "spacy.PretrainCharacters.v1" -maxout_pieces = 3 -hidden_size = 300 -n_characters = 4 - -[pretraining.optimizer] -@optimizers = "Adam.v1" -beta1 = 0.9 -beta2 = 0.999 -L2_is_weight_decay = true -L2 = 0.01 -grad_clip = 1.0 -use_averages = true -eps = 0.00000001 -learn_rate = 0.001 - -[initialize] -vectors = null -init_tok2vec = ${paths.init_tok2vec} -vocab_data = null -lookups = null -before_init = null -after_init = null - -[initialize.components] - -[initialize.tokenizer] \ No newline at end of file From 278e9eaa148799a3d92eaec8684267e3a10603f0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Feb 2021 21:08:04 +0100 Subject: [PATCH 4/6] remove ner --- spacy/tests/regression/test_issue7029.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index 4033bb725..2ff730e29 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -30,9 +30,6 @@ depth = 4 window_size = 1 maxout_pieces = 3 -[components.ner] -factory = "ner" - [components.tagger] factory = "tagger" From aa3ad8825d8da3bce01265de7b3f87064bb16dd5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Feb 2021 13:14:30 +0100 Subject: [PATCH 5/6] loop instead of any --- spacy/pipeline/tok2vec.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/spacy/pipeline/tok2vec.py b/spacy/pipeline/tok2vec.py index 4a396eaeb..26a4c998c 100644 --- a/spacy/pipeline/tok2vec.py +++ b/spacy/pipeline/tok2vec.py @@ -291,14 +291,16 @@ def forward(model: Tok2VecListener, inputs, is_train: bool): # of data. # When the components batch differently, we don't receive a matching # prediction from the upstream, so we can't predict. - if not any(doc.tensor.size for doc in inputs): - # But we do need to do *something* if the tensor hasn't been set. - # The compromise is to at least return data of the right shape, - # so the output is valid. - width = model.get_dim("nO") - outputs = [model.ops.alloc2f(len(doc), width) for doc in inputs] - else: - outputs = [doc.tensor for doc in inputs] + outputs = [] + width = model.get_dim("nO") + for doc in inputs: + if doc.tensor.size == 0: + # But we do need to do *something* if the tensor hasn't been set. + # The compromise is to at least return data of the right shape, + # so the output is valid. + outputs.append(model.ops.alloc2f(len(doc), width)) + else: + outputs.append(doc.tensor) return outputs, lambda dX: [] From 03b4ec7d7fac8d3c2e5360d50075eddf0478396b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 12 Feb 2021 14:30:16 +0100 Subject: [PATCH 6/6] fix typo --- spacy/tests/regression/test_issue7029.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py index 2ff730e29..dcfb8d9e7 100644 --- a/spacy/tests/regression/test_issue7029.py +++ b/spacy/tests/regression/test_issue7029.py @@ -61,7 +61,7 @@ def test_issue7029(): for i in range(50): losses = {} nlp.update(train_examples, sgd=optimizer, losses=losses) - texts = ["first", "second", "thrid", "fourth", "and", "then", "some", ""] + texts = ["first", "second", "third", "fourth", "and", "then", "some", ""] nlp.select_pipes(enable=["tok2vec", "tagger"]) docs1 = list(nlp.pipe(texts, batch_size=1)) docs2 = list(nlp.pipe(texts, batch_size=4))
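
A minimal standalone sketch of the guard logic this series converges on in
PATCH 5/6 (assumptions: numpy stands in for model.ops, WIDTH for
model.get_dim("nO"), and plain arrays for Doc.tensor; this is an
illustration of the control flow, not spaCy's actual Doc API):

import numpy

WIDTH = 96  # assumed tok2vec output width, i.e. model.get_dim("nO")

def forward_fallback(tensors, n_tokens):
    # Decide per document: one empty doc in a batch no longer zero-fills
    # the whole batch (the `all` bug from before PATCH 1/6), and a mixed
    # batch no longer passes an empty tensor through unchanged (the `any`
    # bug addressed by PATCH 5/6).
    outputs = []
    for tensor, n in zip(tensors, n_tokens):
        if tensor.size == 0:
            # No upstream prediction for this doc: return zeros of the
            # right shape so the output stays valid downstream.
            outputs.append(numpy.zeros((n, WIDTH), dtype="f"))
        else:
            outputs.append(tensor)
    return outputs

batch = [numpy.ones((3, WIDTH), dtype="f"), numpy.zeros((0, 0), dtype="f")]
outs = forward_fallback(batch, [3, 0])
assert outs[0].sum() == 3 * WIDTH    # real prediction preserved
assert outs[1].shape == (0, WIDTH)   # empty doc still gets a valid shape

In this sketch, the pre-series `all` guard would zero-fill outs[0] as soon
as the empty doc appears in the batch, while the intermediate `any` guard
from PATCH 1/6 would pass the (0, 0) tensor through unchanged for a mixed
batch; only the per-document loop handles both cases.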
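
The new regression test should be runnable on its own with something like
the following from the repository root (assuming a development install
with pytest available):

    pytest spacy/tests/regression/test_issue7029.py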