mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Fixing pretrain (#7342)
* initialize NLP with train corpus * add more pretraining tests * more tests * function to fetch tok2vec layer for pretraining * clarify parameter name * test different objectives * formatting * fix check for static vectors when using vectors objective * clarify docs * logger statement * fix init_tok2vec and proc.initialize order * test training after pretraining * add init_config tests for pretraining * pop pretraining block to avoid config validation errors * custom errors
This commit is contained in:
parent
97bcf2ae3a
commit
cd70c3cb79
|
@ -487,7 +487,10 @@ class Errors:
|
||||||
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
|
||||||
|
|
||||||
# New errors added in v3.x
|
# New errors added in v3.x
|
||||||
|
E874 = ("Could not initialize the tok2vec model from component "
|
||||||
|
"'{component}' and layer '{layer}'.")
|
||||||
|
E875 = ("To use the PretrainVectors objective, make sure that static vectors are loaded. "
|
||||||
|
"In the config, these are defined by the initialize.vectors setting.")
|
||||||
E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
|
E879 = ("Unexpected type for 'spans' data. Provide a dictionary mapping keys to "
|
||||||
"a list of spans, with each span represented by a tuple (start_char, end_char). "
|
"a list of spans, with each span represented by a tuple (start_char, end_char). "
|
||||||
"The tuple can be optionally extended with a label and a KB ID.")
|
"The tuple can be optionally extended with a label and a KB ID.")
|
||||||
|
|
|
@ -1222,10 +1222,6 @@ class Language:
|
||||||
init_vocab(
|
init_vocab(
|
||||||
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
self, data=I["vocab_data"], lookups=I["lookups"], vectors=I["vectors"]
|
||||||
)
|
)
|
||||||
pretrain_cfg = config.get("pretraining")
|
|
||||||
if pretrain_cfg:
|
|
||||||
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
|
||||||
init_tok2vec(self, P, I)
|
|
||||||
if self.vocab.vectors.data.shape[1] >= 1:
|
if self.vocab.vectors.data.shape[1] >= 1:
|
||||||
ops = get_current_ops()
|
ops = get_current_ops()
|
||||||
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
self.vocab.vectors.data = ops.asarray(self.vocab.vectors.data)
|
||||||
|
@ -1244,6 +1240,10 @@ class Language:
|
||||||
proc.initialize, p_settings, section="components", name=name
|
proc.initialize, p_settings, section="components", name=name
|
||||||
)
|
)
|
||||||
proc.initialize(get_examples, nlp=self, **p_settings)
|
proc.initialize(get_examples, nlp=self, **p_settings)
|
||||||
|
pretrain_cfg = config.get("pretraining")
|
||||||
|
if pretrain_cfg:
|
||||||
|
P = registry.resolve(pretrain_cfg, schema=ConfigSchemaPretrain)
|
||||||
|
init_tok2vec(self, P, I)
|
||||||
self._link_components()
|
self._link_components()
|
||||||
self._optimizer = sgd
|
self._optimizer = sgd
|
||||||
if sgd is not None:
|
if sgd is not None:
|
||||||
|
@ -1592,6 +1592,7 @@ class Language:
|
||||||
# using the nlp.config with all defaults.
|
# using the nlp.config with all defaults.
|
||||||
config = util.copy_config(config)
|
config = util.copy_config(config)
|
||||||
orig_pipeline = config.pop("components", {})
|
orig_pipeline = config.pop("components", {})
|
||||||
|
orig_pretraining = config.pop("pretraining", None)
|
||||||
config["components"] = {}
|
config["components"] = {}
|
||||||
if auto_fill:
|
if auto_fill:
|
||||||
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
|
filled = registry.fill(config, validate=validate, schema=ConfigSchema)
|
||||||
|
@ -1599,6 +1600,9 @@ class Language:
|
||||||
filled = config
|
filled = config
|
||||||
filled["components"] = orig_pipeline
|
filled["components"] = orig_pipeline
|
||||||
config["components"] = orig_pipeline
|
config["components"] = orig_pipeline
|
||||||
|
if orig_pretraining is not None:
|
||||||
|
filled["pretraining"] = orig_pretraining
|
||||||
|
config["pretraining"] = orig_pretraining
|
||||||
resolved_nlp = registry.resolve(
|
resolved_nlp = registry.resolve(
|
||||||
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
filled["nlp"], validate=validate, schema=ConfigSchemaNlp
|
||||||
)
|
)
|
||||||
|
|
|
@ -21,6 +21,8 @@ def create_pretrain_vectors(
|
||||||
maxout_pieces: int, hidden_size: int, loss: str
|
maxout_pieces: int, hidden_size: int, loss: str
|
||||||
) -> Callable[["Vocab", Model], Model]:
|
) -> Callable[["Vocab", Model], Model]:
|
||||||
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
|
||||||
|
if vocab.vectors.data.shape[1] == 0:
|
||||||
|
raise ValueError(Errors.E875)
|
||||||
model = build_cloze_multi_task_model(
|
model = build_cloze_multi_task_model(
|
||||||
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
||||||
)
|
)
|
||||||
|
@ -134,7 +136,7 @@ def build_cloze_characters_multi_task_model(
|
||||||
) -> Model:
|
) -> Model:
|
||||||
output_layer = chain(
|
output_layer = chain(
|
||||||
list2array(),
|
list2array(),
|
||||||
Maxout(hidden_size, nP=maxout_pieces),
|
Maxout(nO=hidden_size, nP=maxout_pieces),
|
||||||
LayerNorm(nI=hidden_size),
|
LayerNorm(nI=hidden_size),
|
||||||
MultiSoftmax([256] * nr_char, nI=hidden_size),
|
MultiSoftmax([256] * nr_char, nI=hidden_size),
|
||||||
)
|
)
|
||||||
|
|
|
@ -293,7 +293,7 @@ def test_serialize_parser(parser_config_string):
|
||||||
|
|
||||||
|
|
||||||
def test_config_nlp_roundtrip():
|
def test_config_nlp_roundtrip():
|
||||||
"""Test that a config prduced by the nlp object passes training config
|
"""Test that a config produced by the nlp object passes training config
|
||||||
validation."""
|
validation."""
|
||||||
nlp = English()
|
nlp = English()
|
||||||
nlp.add_pipe("entity_ruler")
|
nlp.add_pipe("entity_ruler")
|
||||||
|
|
|
@ -4,7 +4,7 @@ from spacy.training import docs_to_json, offsets_to_biluo_tags
|
||||||
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
|
||||||
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
|
||||||
from spacy.lang.nl import Dutch
|
from spacy.lang.nl import Dutch
|
||||||
from spacy.util import ENV_VARS
|
from spacy.util import ENV_VARS, load_model_from_config
|
||||||
from spacy.cli import info
|
from spacy.cli import info
|
||||||
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
from spacy.cli.init_config import init_config, RECOMMENDATIONS
|
||||||
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
from spacy.cli._util import validate_project_commands, parse_config_overrides
|
||||||
|
@ -397,10 +397,14 @@ def test_parse_cli_overrides():
|
||||||
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
"pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]]
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
@pytest.mark.parametrize("optimize", ["efficiency", "accuracy"])
|
||||||
def test_init_config(lang, pipeline, optimize):
|
@pytest.mark.parametrize("pretraining", [True, False])
|
||||||
|
def test_init_config(lang, pipeline, optimize, pretraining):
|
||||||
# TODO: add more tests and also check for GPU with transformers
|
# TODO: add more tests and also check for GPU with transformers
|
||||||
config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, gpu=False)
|
config = init_config(lang=lang, pipeline=pipeline, optimize=optimize, pretraining=pretraining, gpu=False)
|
||||||
assert isinstance(config, Config)
|
assert isinstance(config, Config)
|
||||||
|
if pretraining:
|
||||||
|
config["paths"]["raw_text"] = "my_data.jsonl"
|
||||||
|
nlp = load_model_from_config(config, auto_fill=True)
|
||||||
|
|
||||||
|
|
||||||
def test_model_recommendations():
|
def test_model_recommendations():
|
||||||
|
|
345
spacy/tests/training/test_pretraining.py
Normal file
345
spacy/tests/training/test_pretraining.py
Normal file
|
@ -0,0 +1,345 @@
|
||||||
|
from pathlib import Path
|
||||||
|
import numpy as np
|
||||||
|
import pytest
|
||||||
|
import srsly
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from thinc.api import Config
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
from ... import util
|
||||||
|
from ...lang.en import English
|
||||||
|
from ...training.initialize import init_nlp
|
||||||
|
from ...training.loop import train
|
||||||
|
from ...training.pretrain import pretrain
|
||||||
|
from ...tokens import Doc, DocBin
|
||||||
|
from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH
|
||||||
|
|
||||||
|
pretrain_string_listener = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 342
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
embed_size = 2000
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.width}
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
max_epochs = 5
|
||||||
|
|
||||||
|
[training]
|
||||||
|
max_epochs = 5
|
||||||
|
"""
|
||||||
|
|
||||||
|
pretrain_string_internal = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 342
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
embed_size = 2000
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
max_epochs = 5
|
||||||
|
|
||||||
|
[training]
|
||||||
|
max_epochs = 5
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
pretrain_string_vectors = """
|
||||||
|
[nlp]
|
||||||
|
lang = "en"
|
||||||
|
pipeline = ["tok2vec", "tagger"]
|
||||||
|
|
||||||
|
[components]
|
||||||
|
|
||||||
|
[components.tok2vec]
|
||||||
|
factory = "tok2vec"
|
||||||
|
|
||||||
|
[components.tok2vec.model]
|
||||||
|
@architectures = "spacy.HashEmbedCNN.v1"
|
||||||
|
pretrained_vectors = null
|
||||||
|
width = 342
|
||||||
|
depth = 4
|
||||||
|
window_size = 1
|
||||||
|
embed_size = 2000
|
||||||
|
maxout_pieces = 3
|
||||||
|
subword_features = true
|
||||||
|
|
||||||
|
[components.tagger]
|
||||||
|
factory = "tagger"
|
||||||
|
|
||||||
|
[components.tagger.model]
|
||||||
|
@architectures = "spacy.Tagger.v1"
|
||||||
|
|
||||||
|
[components.tagger.model.tok2vec]
|
||||||
|
@architectures = "spacy.Tok2VecListener.v1"
|
||||||
|
width = ${components.tok2vec.model.width}
|
||||||
|
|
||||||
|
[pretraining]
|
||||||
|
max_epochs = 5
|
||||||
|
|
||||||
|
[pretraining.objective]
|
||||||
|
@architectures = spacy.PretrainVectors.v1
|
||||||
|
maxout_pieces = 3
|
||||||
|
hidden_size = 300
|
||||||
|
loss = cosine
|
||||||
|
|
||||||
|
[training]
|
||||||
|
max_epochs = 5
|
||||||
|
"""
|
||||||
|
|
||||||
|
CHAR_OBJECTIVES = [
|
||||||
|
{},
|
||||||
|
{"@architectures": "spacy.PretrainCharacters.v1"},
|
||||||
|
{
|
||||||
|
"@architectures": "spacy.PretrainCharacters.v1",
|
||||||
|
"maxout_pieces": 5,
|
||||||
|
"hidden_size": 42,
|
||||||
|
"n_characters": 2,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
VECTOR_OBJECTIVES = [
|
||||||
|
{
|
||||||
|
"@architectures": "spacy.PretrainVectors.v1",
|
||||||
|
"maxout_pieces": 3,
|
||||||
|
"hidden_size": 300,
|
||||||
|
"loss": "cosine",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"@architectures": "spacy.PretrainVectors.v1",
|
||||||
|
"maxout_pieces": 2,
|
||||||
|
"hidden_size": 200,
|
||||||
|
"loss": "L2",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_pretraining_default():
    """Check that the default pretraining objective is the character objective."""
    base_config = Config().from_str(pretrain_string_internal)
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    pipeline_config = pipeline.config
    # Merge the pipeline's config into the default pretraining config.
    defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = defaults.merge(pipeline_config)
    architecture = merged["pretraining"]["objective"]["@architectures"]
    assert "PretrainCharacters" in architecture
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("objective", CHAR_OBJECTIVES)
def test_pretraining_tok2vec_characters(objective):
    """Check that pretraining runs with each character objective variant."""
    base_config = Config().from_str(pretrain_string_listener)
    base_config["pretraining"]["objective"] = objective
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    pipeline_config = pipeline.config
    defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = defaults.merge(pipeline_config)
    with make_tempdir() as tmp_dir:
        merged["paths"]["raw_text"] = write_sample_jsonl(tmp_dir)
        merged = merged.interpolate()
        assert merged["pretraining"]["component"] == "tok2vec"
        pretrain(merged, tmp_dir)
        # The config sets max_epochs = 5: the test expects outputs for
        # epochs 0 through 4 and nothing for a sixth epoch.
        for expected in ("model0.bin", "model4.bin"):
            assert Path(tmp_dir / expected).exists()
        assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
def test_pretraining_tok2vec_vectors_fail(objective):
    """Check that the vectors objective fails when no static vectors are configured."""
    base_config = Config().from_str(pretrain_string_listener)
    base_config["pretraining"]["objective"] = objective
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    pipeline_config = pipeline.config
    defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = defaults.merge(pipeline_config)
    with make_tempdir() as tmp_dir:
        merged["paths"]["raw_text"] = write_sample_jsonl(tmp_dir)
        merged = merged.interpolate()
        # Precondition for the failure: initialize.vectors is left unset.
        assert merged["initialize"]["vectors"] is None
        with pytest.raises(ValueError):
            pretrain(merged, tmp_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("objective", VECTOR_OBJECTIVES)
def test_pretraining_tok2vec_vectors(objective):
    """Check that the vectors objective runs once static vectors are provided."""
    base_config = Config().from_str(pretrain_string_listener)
    base_config["pretraining"]["objective"] = objective
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    pipeline_config = pipeline.config
    defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = defaults.merge(pipeline_config)
    with make_tempdir() as tmp_dir:
        merged["paths"]["raw_text"] = write_sample_jsonl(tmp_dir)
        # Point initialize.vectors at a pipeline saved with static vectors.
        merged["initialize"]["vectors"] = write_vectors_model(tmp_dir)
        merged = merged.interpolate()
        pretrain(merged, tmp_dir)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("config", [pretrain_string_internal, pretrain_string_listener])
def test_pretraining_tagger_tok2vec(config):
    """Test pretraining of the tagger's tok2vec layer.

    Runs once with the tok2vec layer defined inside the tagger's model and
    once with the tagger listening to a separate tok2vec component.
    """
    # Parse the parametrized config string so that both layouts are exercised.
    config = Config().from_str(config)
    nlp = util.load_model_from_config(config, auto_fill=True, validate=False)
    filled = nlp.config
    pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    filled = pretrain_config.merge(filled)
    with make_tempdir() as tmp_dir:
        file_path = write_sample_jsonl(tmp_dir)
        filled["paths"]["raw_text"] = file_path
        # Address the tok2vec layer through the tagger component.
        filled["pretraining"]["component"] = "tagger"
        filled["pretraining"]["layer"] = "tok2vec"
        filled = filled.interpolate()
        pretrain(filled, tmp_dir)
        # Both configs set max_epochs = 5: the test expects outputs for
        # epochs 0 through 4 and nothing for a sixth epoch.
        assert Path(tmp_dir / "model0.bin").exists()
        assert Path(tmp_dir / "model4.bin").exists()
        assert not Path(tmp_dir / "model5.bin").exists()
|
||||||
|
|
||||||
|
|
||||||
|
def test_pretraining_tagger():
    """Check that pretraining the tagger component itself (no tok2vec layer selected) raises."""
    base_config = Config().from_str(pretrain_string_internal)
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    pipeline_config = pipeline.config
    defaults = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = defaults.merge(pipeline_config)
    with make_tempdir() as tmp_dir:
        merged["paths"]["raw_text"] = write_sample_jsonl(tmp_dir)
        # Only the component is set; no layer is chosen within it.
        merged["pretraining"]["component"] = "tagger"
        merged = merged.interpolate()
        with pytest.raises(ValueError):
            pretrain(merged, tmp_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def test_pretraining_training():
    """Pretrain the tagger's tok2vec layer, load the result through
    initialize.init_tok2vec, and check that training runs on top of it."""
    base_config = Config().from_str(pretrain_string_internal)
    pipeline = util.load_model_from_config(base_config, auto_fill=True, validate=False)
    merged = pipeline.config
    # Merge into the pretraining defaults first, then into the training defaults.
    for defaults_path in (DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH):
        merged = util.load_config(defaults_path).merge(merged)
    with make_tempdir() as tmp_dir:
        pretrain_dir = tmp_dir / "pretrain"
        pretrain_dir.mkdir()
        merged["paths"]["raw_text"] = write_sample_jsonl(pretrain_dir)
        merged["pretraining"]["component"] = "tagger"
        merged["pretraining"]["layer"] = "tok2vec"
        train_dir = tmp_dir / "train"
        train_dir.mkdir()
        train_path, dev_path = write_sample_training(train_dir)
        merged["paths"]["train"] = train_path
        merged["paths"]["dev"] = dev_path
        merged = merged.interpolate()
        settings = merged["pretraining"]

        def find_hashembed(nlp_obj):
            # Walk the "embed" ref of the configured layer and keep the last
            # node named "hashembed" (None if there is no such node).
            component_model = nlp_obj.get_pipe(settings["component"]).model
            embed_model = component_model.get_ref(settings["layer"]).get_ref("embed")
            found = None
            for node in embed_model.walk():
                if node.name == "hashembed":
                    found = node
            return found

        nlp_base = init_nlp(merged)
        embed_base = find_hashembed(nlp_base)
        pretrain(merged, pretrain_dir)
        pretrained_model = Path(pretrain_dir / "model3.bin")
        assert pretrained_model.exists()
        merged["initialize"]["init_tok2vec"] = str(pretrained_model)
        nlp = init_nlp(merged)
        embed = find_hashembed(nlp)
        # ensure that the tok2vec weights are actually changed by the pretraining
        weights_differ = np.not_equal(embed.get_param("E"), embed_base.get_param("E"))
        assert np.any(weights_differ)
        train(nlp, train_dir)
|
||||||
|
|
||||||
|
|
||||||
|
def write_sample_jsonl(tmp_dir):
    """Write two sample records to `text.jsonl` inside tmp_dir.

    Returns the path of the written file as a string.
    """
    samples = [
        ("1", "This is the best TV you'll ever buy!", {"pos": 1, "neg": 0}),
        ("2", "I wouldn't buy this again.", {"pos": 0, "neg": 1}),
    ]
    records = [
        {"meta": {"id": record_id}, "text": text, "cats": cats}
        for record_id, text, cats in samples
    ]
    file_path = f"{tmp_dir}/text.jsonl"
    srsly.write_jsonl(file_path, records)
    return file_path
|
||||||
|
|
||||||
|
|
||||||
|
def write_sample_training(tmp_dir):
    """Save one tagged four-token Doc as both `train.spacy` and `dev.spacy`.

    Returns the two file paths (train, dev) as strings.
    """
    annotated = Doc(
        English().vocab,
        words=["The", "players", "start", "."],
        tags=["DT", "NN", "VBZ", "."],
    )
    corpus = DocBin()
    corpus.add(annotated)
    train_path = f"{tmp_dir}/train.spacy"
    dev_path = f"{tmp_dir}/dev.spacy"
    # The same single-doc corpus is written to both locations.
    for destination in (train_path, dev_path):
        corpus.to_disk(destination)
    return train_path, dev_path
|
||||||
|
|
||||||
|
|
||||||
|
def write_vectors_model(tmp_dir):
    """Save an English pipeline whose vocab holds three random 300-d vectors.

    tmp_dir: directory to write into; it must support the `/` operator
        (presumably a pathlib.Path, as yielded by make_tempdir).
    RETURNS (str): path of the saved `vectors_model` directory.
    """
    vocab = Vocab()
    # Uses the module-level numpy import (np) rather than a second local import.
    vector_data = {
        "dog": np.random.uniform(-1, 1, (300,)),
        "cat": np.random.uniform(-1, 1, (300,)),
        "orange": np.random.uniform(-1, 1, (300,)),
    }
    for word, vector in vector_data.items():
        vocab.set_vector(word, vector)
    nlp_path = tmp_dir / "vectors_model"
    nlp = English(vocab)
    nlp.to_disk(nlp_path)
    return str(nlp_path)
|
|
@ -9,6 +9,7 @@ import gzip
|
||||||
import zipfile
|
import zipfile
|
||||||
import tqdm
|
import tqdm
|
||||||
|
|
||||||
|
from .pretrain import get_tok2vec_ref
|
||||||
from ..lookups import Lookups
|
from ..lookups import Lookups
|
||||||
from ..vectors import Vectors
|
from ..vectors import Vectors
|
||||||
from ..errors import Errors, Warnings
|
from ..errors import Errors, Warnings
|
||||||
|
@ -147,10 +148,6 @@ def init_tok2vec(
|
||||||
weights_data = None
|
weights_data = None
|
||||||
init_tok2vec = ensure_path(I["init_tok2vec"])
|
init_tok2vec = ensure_path(I["init_tok2vec"])
|
||||||
if init_tok2vec is not None:
|
if init_tok2vec is not None:
|
||||||
if P["objective"].get("type") == "vectors" and not I["vectors"]:
|
|
||||||
err = 'need initialize.vectors if pretraining.objective.type is "vectors"'
|
|
||||||
errors = [{"loc": ["initialize"], "msg": err}]
|
|
||||||
raise ConfigValidationError(config=nlp.config, errors=errors)
|
|
||||||
if not init_tok2vec.exists():
|
if not init_tok2vec.exists():
|
||||||
err = f"can't find pretrained tok2vec: {init_tok2vec}"
|
err = f"can't find pretrained tok2vec: {init_tok2vec}"
|
||||||
errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
|
errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
|
||||||
|
@ -158,21 +155,9 @@ def init_tok2vec(
|
||||||
with init_tok2vec.open("rb") as file_:
|
with init_tok2vec.open("rb") as file_:
|
||||||
weights_data = file_.read()
|
weights_data = file_.read()
|
||||||
if weights_data is not None:
|
if weights_data is not None:
|
||||||
tok2vec_component = P["component"]
|
layer = get_tok2vec_ref(nlp, P)
|
||||||
if tok2vec_component is None:
|
|
||||||
desc = (
|
|
||||||
f"To use pretrained tok2vec weights, [pretraining.component] "
|
|
||||||
f"needs to specify the component that should load them."
|
|
||||||
)
|
|
||||||
err = "component can't be null"
|
|
||||||
errors = [{"loc": ["pretraining", "component"], "msg": err}]
|
|
||||||
raise ConfigValidationError(
|
|
||||||
config=nlp.config["pretraining"], errors=errors, desc=desc
|
|
||||||
)
|
|
||||||
layer = nlp.get_pipe(tok2vec_component).model
|
|
||||||
if P["layer"]:
|
|
||||||
layer = layer.get_ref(P["layer"])
|
|
||||||
layer.from_bytes(weights_data)
|
layer.from_bytes(weights_data)
|
||||||
|
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
|
@ -6,9 +6,12 @@ from collections import Counter
|
||||||
import srsly
|
import srsly
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from thinc.config import ConfigValidationError
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
|
||||||
from .example import Example
|
from .example import Example
|
||||||
|
from ..errors import Errors
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from ..schemas import ConfigSchemaPretrain
|
from ..schemas import ConfigSchemaPretrain
|
||||||
from ..util import registry, load_model_from_config, dot_to_object
|
from ..util import registry, load_model_from_config, dot_to_object
|
||||||
|
@ -133,12 +136,21 @@ def create_pretraining_model(nlp, pretrain_config):
|
||||||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||||
serialized to file and read back in when calling the 'train' command.
|
serialized to file and read back in when calling the 'train' command.
|
||||||
"""
|
"""
|
||||||
nlp.initialize()
|
with nlp.select_pipes(enable=[]):
|
||||||
component = nlp.get_pipe(pretrain_config["component"])
|
nlp.initialize()
|
||||||
if pretrain_config.get("layer"):
|
tok2vec = get_tok2vec_ref(nlp, pretrain_config)
|
||||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
# If the config referred to a Tok2VecListener, grab the original model instead
|
||||||
else:
|
if type(tok2vec).__name__ == "Tok2VecListener":
|
||||||
tok2vec = component.model
|
original_tok2vec = (
    # Compare by value: "is not" against a string literal tests identity.
    tok2vec.upstream_name if tok2vec.upstream_name != "*" else "tok2vec"
)
|
||||||
|
tok2vec = nlp.get_pipe(original_tok2vec).model
|
||||||
|
try:
|
||||||
|
tok2vec.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||||
|
except ValueError:
|
||||||
|
component = pretrain_config["component"]
|
||||||
|
layer = pretrain_config["layer"]
|
||||||
|
raise ValueError(Errors.E874.format(component=component, layer=layer))
|
||||||
|
|
||||||
create_function = pretrain_config["objective"]
|
create_function = pretrain_config["objective"]
|
||||||
model = create_function(nlp.vocab, tok2vec)
|
model = create_function(nlp.vocab, tok2vec)
|
||||||
|
@ -147,6 +159,24 @@ def create_pretraining_model(nlp, pretrain_config):
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def get_tok2vec_ref(nlp, pretrain_config):
    """Fetch the model layer that the pretraining config refers to.

    nlp: pipeline object providing `get_pipe` and `config`.
    pretrain_config: mapping with a "component" entry and an optional
        "layer" entry.
    RETURNS: the model of the named component, or the reference named by
        "layer" within that model when "layer" is set to a non-empty value.
    RAISES: ConfigValidationError if "component" is None.
    """
    tok2vec_component = pretrain_config["component"]
    if tok2vec_component is None:
        desc = (
            "To use pretrained tok2vec weights, [pretraining.component] "
            "needs to specify the component that should load them."
        )
        err = "component can't be null"
        errors = [{"loc": ["pretraining", "component"], "msg": err}]
        raise ConfigValidationError(
            config=nlp.config["pretraining"], errors=errors, desc=desc
        )
    layer = nlp.get_pipe(tok2vec_component).model
    # A missing or empty "layer" entry selects the component's whole model.
    layer_name = pretrain_config.get("layer")
    if layer_name:
        layer = layer.get_ref(layer_name)
    return layer
|
||||||
|
|
||||||
|
|
||||||
class ProgressTracker:
|
class ProgressTracker:
|
||||||
def __init__(self, frequency=1000000):
|
def __init__(self, frequency=1000000):
|
||||||
self.loss = 0.0
|
self.loss = 0.0
|
||||||
|
|
|
@ -447,6 +447,9 @@ For more information, see the section on
|
||||||
> ```ini
|
> ```ini
|
||||||
> [pretraining]
|
> [pretraining]
|
||||||
> component = "tok2vec"
|
> component = "tok2vec"
|
||||||
|
>
|
||||||
|
> [initialize]
|
||||||
|
> vectors = "en_core_web_lg"
|
||||||
> ...
|
> ...
|
||||||
>
|
>
|
||||||
> [pretraining.objective]
|
> [pretraining.objective]
|
||||||
|
@ -457,7 +460,9 @@ For more information, see the section on
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
Predict the word's vector from a static embeddings table as pretraining
|
Predict the word's vector from a static embeddings table as pretraining
|
||||||
objective for a Tok2Vec layer.
|
objective for a Tok2Vec layer. To use this objective, make sure that the
|
||||||
|
`initialize.vectors` section in the config refers to a model with static
|
||||||
|
vectors.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
|
Loading…
Reference in New Issue
Block a user