Re-enable test

This commit is contained in:
Matthew Honnibal 2025-01-13 14:42:59 +01:00
parent 685a386106
commit 128f7256bb

View File

@@ -265,50 +265,50 @@ def test_pretraining_tagger():
# Try to debug segfault on windows # Try to debug segfault on windows
# def test_pretraining_training(): def test_pretraining_training():
# """Test that training can use a pretrained Tok2Vec model""" """Test that training can use a pretrained Tok2Vec model"""
# config = Config().from_str(pretrain_string_internal) config = Config().from_str(pretrain_string_internal)
# nlp = util.load_model_from_config(config, auto_fill=True, validate=False) nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
# filled = nlp.config filled = nlp.config
# pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH) pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
# filled = pretrain_config.merge(filled) filled = pretrain_config.merge(filled)
# train_config = util.load_config(DEFAULT_CONFIG_PATH) train_config = util.load_config(DEFAULT_CONFIG_PATH)
# filled = train_config.merge(filled) filled = train_config.merge(filled)
# with make_tempdir() as tmp_dir: with make_tempdir() as tmp_dir:
# pretrain_dir = tmp_dir / "pretrain" pretrain_dir = tmp_dir / "pretrain"
# pretrain_dir.mkdir() pretrain_dir.mkdir()
# file_path = write_sample_jsonl(pretrain_dir) file_path = write_sample_jsonl(pretrain_dir)
# filled["paths"]["raw_text"] = file_path filled["paths"]["raw_text"] = file_path
# filled["pretraining"]["component"] = "tagger" filled["pretraining"]["component"] = "tagger"
# filled["pretraining"]["layer"] = "tok2vec" filled["pretraining"]["layer"] = "tok2vec"
# train_dir = tmp_dir / "train" train_dir = tmp_dir / "train"
# train_dir.mkdir() train_dir.mkdir()
# train_path, dev_path = write_sample_training(train_dir) train_path, dev_path = write_sample_training(train_dir)
# filled["paths"]["train"] = train_path filled["paths"]["train"] = train_path
# filled["paths"]["dev"] = dev_path filled["paths"]["dev"] = dev_path
# filled = filled.interpolate() filled = filled.interpolate()
# P = filled["pretraining"] P = filled["pretraining"]
# nlp_base = init_nlp(filled) nlp_base = init_nlp(filled)
# model_base = ( model_base = (
# nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") nlp_base.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
# ) )
# embed_base = None embed_base = None
# for node in model_base.walk(): for node in model_base.walk():
# if node.name == "hashembed": if node.name == "hashembed":
# embed_base = node embed_base = node
# pretrain(filled, pretrain_dir) pretrain(filled, pretrain_dir)
# pretrained_model = Path(pretrain_dir / "model3.bin") pretrained_model = Path(pretrain_dir / "model3.bin")
# assert pretrained_model.exists() assert pretrained_model.exists()
# filled["initialize"]["init_tok2vec"] = str(pretrained_model) filled["initialize"]["init_tok2vec"] = str(pretrained_model)
# nlp = init_nlp(filled) nlp = init_nlp(filled)
# model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed") model = nlp.get_pipe(P["component"]).model.get_ref(P["layer"]).get_ref("embed")
# embed = None embed = None
# for node in model.walk(): for node in model.walk():
# if node.name == "hashembed": if node.name == "hashembed":
# embed = node embed = node
# # ensure that the tok2vec weights are actually changed by the pretraining # ensure that the tok2vec weights are actually changed by the pretraining
# assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E"))) assert np.any(np.not_equal(embed.get_param("E"), embed_base.get_param("E")))
# train(nlp, train_dir) train(nlp, train_dir)
def write_sample_jsonl(tmp_dir): def write_sample_jsonl(tmp_dir):