diff --git a/pretrain.cfg b/pretrain.cfg new file mode 100644 index 000000000..50bd72350 --- /dev/null +++ b/pretrain.cfg @@ -0,0 +1,218 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = null +seed = 0 + +[nlp] +lang = "en" +pipeline = ["tok2vec","tagger","parser","ner"] +batch_size = 1000 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = true +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = true +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} +upstream = "*" + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v2" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode.width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 2000 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +accumulate_gradient = 1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_words.v1" +discard_oversize = false +tolerance = 0.2 +get_length = null + +[training.batcher.size] +@schedules = "compounding.v1" +start = 100 +stop = 1000 +compound = 1.001 +t = 0.0 + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 +learn_rate = 0.001 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/pretrain_gpu.cfg b/pretrain_gpu.cfg new file mode 100644 index 000000000..6f9c9195d --- /dev/null +++ b/pretrain_gpu.cfg @@ -0,0 +1,217 @@ +[paths] +train = null +dev = null +vectors = null +init_tok2vec = null +raw_text = null + +[system] +gpu_allocator = "pytorch" +seed = 0 + +[nlp] +lang = "en" +pipeline = ["transformer","tagger","parser","ner"] +batch_size = 128 +disabled = [] +before_creation = null +after_creation = null +after_pipeline_creation = null +tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + +[components] + +[components.ner] +factory = "ner" +moves = null +update_with_oracle_cut_size = 100 + +[components.ner.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "ner" +extra_state_tokens = false +hidden_width = 64 +maxout_pieces = 2 +use_upper = false +nO = null + +[components.ner.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.parser] +factory = "parser" +learn_tokens = false +min_action_freq = 30 +moves = null +update_with_oracle_cut_size = 100 + +[components.parser.model] +@architectures = "spacy.TransitionBasedParser.v2" +state_type = "parser" +extra_state_tokens = false +hidden_width = 128 +maxout_pieces = 3 +use_upper = false +nO = null + +[components.parser.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 +pooling = {"@layers":"reduce_mean.v1"} +upstream = "*" + +[components.transformer] +factory = "transformer" +max_batch_items = 4096 +set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"} + +[components.transformer.model] +@architectures = "spacy-transformers.TransformerModel.v1" +name = "roberta-base" + +[components.transformer.model.get_spans] +@span_getters = "spacy-transformers.strided_spans.v1" +window = 128 +stride = 96 + +[components.transformer.model.tokenizer_config] +use_fast = true + +[corpora] + +[corpora.dev] +@readers = "spacy.Corpus.v1" +path = ${paths.dev} +max_length = 0 +gold_preproc = false +limit = 0 +augmenter = null + +[corpora.pretrain] +@readers = "spacy.JsonlCorpus.v1" +path = ${paths.raw_text} +min_length = 5 +max_length = 500 +limit = 0 + +[corpora.train] +@readers = "spacy.Corpus.v1" +path = ${paths.train} +max_length = 500 +gold_preproc = false +limit = 0 +augmenter = null + +[training] +accumulate_gradient = 3 +dev_corpus = "corpora.dev" +train_corpus = "corpora.train" +seed = ${system.seed} +gpu_allocator = ${system.gpu_allocator} +dropout = 0.1 +patience = 1600 +max_epochs = 0 +max_steps = 20000 +eval_frequency = 200 +frozen_components = [] +before_to_disk = null + +[training.batcher] +@batchers = "spacy.batch_by_padded.v1" +discard_oversize = true +size = 2000 +buffer = 256 +get_length = null + +[training.logger] +@loggers = "spacy.ConsoleLogger.v1" +progress_bar = false + +[training.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = false +eps = 0.00000001 + +[training.optimizer.learn_rate] +@schedules = "warmup_linear.v1" +warmup_steps = 250 +total_steps = 20000 +initial_rate = 0.00005 + +[training.score_weights] +dep_las_per_type = null +sents_p = null +sents_r = null +ents_per_type = null +tag_acc = 0.33 +dep_uas = 0.17 +dep_las = 0.17 +sents_f = 0.0 +ents_f = 0.33 +ents_p = 0.0 +ents_r = 0.0 + +[pretraining] +max_epochs = 1000 +dropout = 0.2 +n_save_every = null +component = "tok2vec" +layer = "" +corpus = "corpora.pretrain" + +[pretraining.batcher] +@batchers = "spacy.batch_by_words.v1" +size = 3000 +discard_oversize = false +tolerance = 0.2 +get_length = null + +[pretraining.objective] +@architectures = "spacy.PretrainCharacters.v1" +maxout_pieces = 3 +hidden_size = 300 +n_characters = 4 + +[pretraining.optimizer] +@optimizers = "Adam.v1" +beta1 = 0.9 +beta2 = 0.999 +L2_is_weight_decay = true +L2 = 0.01 +grad_clip = 1.0 +use_averages = true +eps = 0.00000001 +learn_rate = 0.001 + +[initialize] +vectors = null +init_tok2vec = ${paths.init_tok2vec} +vocab_data = null +lookups = null +before_init = null +after_init = null + +[initialize.components] + +[initialize.tokenizer] \ No newline at end of file diff --git a/spacy/tests/regression/test_issue7029.py b/spacy/tests/regression/test_issue7029.py new file mode 100644 index 000000000..4033bb725 --- /dev/null +++ b/spacy/tests/regression/test_issue7029.py @@ -0,0 +1,71 @@ +from spacy.lang.en import English +from spacy.training import Example +from spacy.util import load_config_from_str + + +CONFIG = """ +[nlp] +lang = "en" +pipeline = ["tok2vec", "tagger"] + +[components] + +[components.tok2vec] +factory = "tok2vec" + +[components.tok2vec.model] +@architectures = "spacy.Tok2Vec.v1" + +[components.tok2vec.model.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tok2vec.model.encode:width} +attrs = ["NORM","PREFIX","SUFFIX","SHAPE"] +rows = [5000,2500,2500,2500] +include_static_vectors = false + +[components.tok2vec.model.encode] +@architectures = "spacy.MaxoutWindowEncoder.v1" +width = 96 +depth = 4 +window_size = 1 +maxout_pieces = 3 + +[components.ner] +factory = "ner" + +[components.tagger] +factory = "tagger" + +[components.tagger.model] +@architectures = "spacy.Tagger.v1" +nO = null + +[components.tagger.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode:width} +upstream = "*" +""" + + +TRAIN_DATA = [ + ("I like green eggs", {"tags": ["N", "V", "J", "N"]}), + ("Eat blue ham", {"tags": ["V", "J", "N"]}), +] + + +def test_issue7029(): + """Test that an empty document doesn't mess up an entire batch. + """ + nlp = English.from_config(load_config_from_str(CONFIG)) + train_examples = [] + for t in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) + optimizer = nlp.initialize(get_examples=lambda: train_examples) + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + texts = ["first", "second", "thrid", "fourth", "and", "then", "some", ""] + nlp.select_pipes(enable=["tok2vec", "tagger"]) + docs1 = list(nlp.pipe(texts, batch_size=1)) + docs2 = list(nlp.pipe(texts, batch_size=4)) + assert [doc[0].tag_ for doc in docs1[:-1]] == [doc[0].tag_ for doc in docs2[:-1]]