mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
pretrain architectures (#6451)
* define new architectures for the pretraining objective * add loss function as attr of the model * cleanup * cleanup * shorten name * fix typo * remove unused error
This commit is contained in:
parent
29b058ebdc
commit
f98a04434a
|
@ -17,7 +17,9 @@ tolerance = 0.2
|
|||
get_length = null
|
||||
|
||||
[pretraining.objective]
|
||||
type = "characters"
|
||||
@architectures = "spacy.PretrainCharacters.v1"
|
||||
maxout_pieces = 3
|
||||
hidden_size = 300
|
||||
n_characters = 4
|
||||
|
||||
[pretraining.optimizer]
|
||||
|
|
|
@ -484,8 +484,8 @@ class Errors:
|
|||
"has been applied.")
|
||||
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
|
||||
"dimension refers to the width of the vectors table.")
|
||||
E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
|
||||
E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
|
||||
E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
|
||||
"are: {supported}")
|
||||
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
|
||||
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
|
||||
E910 = ("Encountered NaN value when computing loss for component '{name}'.")
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from .entity_linker import * # noqa
|
||||
from .multi_task import * # noqa
|
||||
from .parser import * # noqa
|
||||
from .tagger import * # noqa
|
||||
from .textcat import * # noqa
|
||||
|
|
|
@ -1,7 +1,14 @@
|
|||
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
|
||||
import numpy
|
||||
from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
|
||||
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
|
||||
from thinc.api import MultiSoftmax, list2array
|
||||
from thinc.api import to_categorical, CosineDistance, L2Distance
|
||||
|
||||
from ...util import registry
|
||||
from ...errors import Errors
|
||||
from ...attrs import ID
|
||||
|
||||
import numpy
|
||||
from functools import partial
|
||||
|
||||
if TYPE_CHECKING:
|
||||
# This lets us add type hints for mypy etc. without causing circular imports
|
||||
|
@ -9,6 +16,74 @@ if TYPE_CHECKING:
|
|||
from ...tokens import Doc # noqa: F401
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.PretrainVectors.v1")
def create_pretrain_vectors(
    maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
    """Create a factory for the vector-prediction pretraining objective.

    maxout_pieces (int): Forwarded to build_cloze_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_multi_task_model.
    loss (str): Name of the distance to use, either "cosine" or "L2".
    RETURNS (Callable[[Vocab, Model], Model]): A function that builds the
        cloze model around a tok2vec layer and stores the matching loss
        function under model.attrs["loss"].
    """

    def select_distance():
        # The loss name is only validated when the objective is actually
        # created, i.e. after the model itself has been built.
        if loss == "cosine":
            return CosineDistance(normalize=True, ignore_zeros=True)
        if loss == "L2":
            return L2Distance(normalize=True)
        raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))

    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        model = build_cloze_multi_task_model(
            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
        )
        model.attrs["loss"] = partial(get_vectors_loss, distance=select_distance())
        return model

    return create_vectors_objective
|
||||
|
||||
|
||||
@registry.architectures.register("spacy.PretrainCharacters.v1")
def create_pretrain_characters(
    maxout_pieces: int, hidden_size: int, n_characters: int
) -> Callable[["Vocab", Model], Model]:
    """Create a factory for the character-prediction pretraining objective.

    maxout_pieces (int): Forwarded to build_cloze_characters_multi_task_model.
    hidden_size (int): Forwarded to build_cloze_characters_multi_task_model.
    n_characters (int): Passed to the model builder as nr_char and bound as
        nr_char on the loss function.
    RETURNS (Callable[[Vocab, Model], Model]): A function that builds the
        model around a tok2vec layer and stores the characters loss under
        model.attrs["loss"].
    """

    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
        builder_settings = {
            "hidden_size": hidden_size,
            "maxout_pieces": maxout_pieces,
            "nr_char": n_characters,
        }
        model = build_cloze_characters_multi_task_model(
            vocab, tok2vec, **builder_settings
        )
        characters_loss = partial(get_characters_loss, nr_char=n_characters)
        model.attrs["loss"] = characters_loss
        return model

    return create_characters_objective
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
    """Score a prediction against the docs' vectors using a distance function.

    ops: Backend object; only its flatten method is used here.
    docs: The documents the prediction was made for. The vectors table is
        taken from the vocab of the first doc.
    prediction: The model output to compare against the target vectors.
    distance (Callable): Called as distance(prediction, target) and expected
        to return a (gradient, loss) pair.
    RETURNS (tuple): The loss followed by the gradient.
    """
    # Stacking token.vector for every token would be the obvious approach,
    # but it is wasteful, particularly on GPU. Collecting each token's index
    # into the vectors table and doing one lookup avoids copying data.
    ids_per_doc = [doc.to_array(ID).ravel() for doc in docs]
    row_ids = ops.flatten(ids_per_doc)
    vectors_table = docs[0].vocab.vectors.data
    target = vectors_table[row_ids]
    gradient, loss = distance(prediction, target)
    return loss, gradient
|
||||
|
||||
|
||||
def get_characters_loss(ops, docs, prediction, nr_char):
    """Score a prediction of nr_char characters per token against the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents the prediction was made for.
    prediction: The model output; it is compared against a one-hot target
        reshaped to (-1, 256 * nr_char).
    nr_char (int): Number of characters requested from doc.to_utf8_array.
    RETURNS (tuple): The summed squared error, followed by the difference
        divided by the number of prediction rows.
    """
    n_classes = 256  # presumably one class per possible byte value
    utf8_rows = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(utf8_rows).reshape((-1,))
    one_hot = ops.asarray(to_categorical(flat_ids, n_classes=n_classes), dtype="f")
    target = one_hot.reshape((-1, n_classes * nr_char))
    residual = prediction - target
    loss = (residual ** 2).sum()
    gradient = residual / float(prediction.shape[0])
    return loss, gradient
|
||||
|
||||
|
||||
def build_multi_task_model(
|
||||
tok2vec: Model,
|
||||
maxout_pieces: int,
|
||||
|
@ -33,23 +108,19 @@ def build_multi_task_model(
|
|||
|
||||
|
||||
def build_cloze_multi_task_model(
|
||||
vocab: "Vocab",
|
||||
tok2vec: Model,
|
||||
maxout_pieces: int,
|
||||
hidden_size: int,
|
||||
nO: Optional[int] = None,
|
||||
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
|
||||
) -> Model:
|
||||
# nO = vocab.vectors.data.shape[1]
|
||||
nO = vocab.vectors.data.shape[1]
|
||||
output_layer = chain(
|
||||
list2array(),
|
||||
Maxout(
|
||||
nO=nO,
|
||||
nO=hidden_size,
|
||||
nI=tok2vec.get_dim("nO"),
|
||||
nP=maxout_pieces,
|
||||
normalize=True,
|
||||
dropout=0.0,
|
||||
),
|
||||
Linear(nO=nO, nI=nO, init_W=zero_init),
|
||||
Linear(nO=nO, nI=hidden_size, init_W=zero_init),
|
||||
)
|
||||
model = chain(tok2vec, output_layer)
|
||||
model = build_masked_language_model(vocab, model)
|
||||
|
|
|
@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
|
|||
batcher: Batcher = Field(..., title="Batcher for the training data")
|
||||
component: str = Field(..., title="Component to find the layer to pretrain")
|
||||
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
|
||||
|
||||
# TODO: use a more detailed schema for this?
|
||||
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
|
||||
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
|
||||
# fmt: on
|
||||
|
||||
class Config:
|
||||
|
|
|
@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
|
|||
import spacy
|
||||
from spacy.lang.en import English
|
||||
from spacy.lang.de import German
|
||||
from spacy.language import Language, DEFAULT_CONFIG
|
||||
from spacy.util import registry, load_model_from_config
|
||||
from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
|
||||
from spacy.util import registry, load_model_from_config, load_config
|
||||
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
|
||||
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
|
||||
from spacy.schemas import ConfigSchema
|
||||
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
|
||||
|
||||
|
||||
from ..util import make_tempdir
|
||||
|
||||
|
||||
nlp_config_string = """
|
||||
[paths]
|
||||
train = null
|
||||
|
@ -63,6 +63,59 @@ factory = "tagger"
|
|||
width = ${components.tok2vec.model.width}
|
||||
"""
|
||||
|
||||
pretrain_config_string = """
|
||||
[paths]
|
||||
train = null
|
||||
dev = null
|
||||
|
||||
[corpora]
|
||||
|
||||
[corpora.train]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.dev}
|
||||
|
||||
[training]
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "spacy.batch_by_words.v1"
|
||||
size = 666
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
pipeline = ["tok2vec", "tagger"]
|
||||
|
||||
[components]
|
||||
|
||||
[components.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[components.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = null
|
||||
width = 342
|
||||
depth = 4
|
||||
window_size = 1
|
||||
embed_size = 2000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
||||
|
||||
[components.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[components.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.width}
|
||||
|
||||
[pretraining]
|
||||
"""
|
||||
|
||||
|
||||
parser_config_string = """
|
||||
[model]
|
||||
|
@ -126,6 +179,14 @@ def test_create_nlp_from_config():
|
|||
load_model_from_config(Config(bad_cfg), auto_fill=True)
|
||||
|
||||
|
||||
def test_create_nlp_from_pretraining_config():
    """Test that the default pretraining config validates properly"""
    base_config = Config().from_str(pretrain_config_string)
    pretrain_defaults = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
    merged = base_config.merge(pretrain_defaults)
    # Resolving against the schema is the check itself: it fails by raising.
    registry.resolve(merged["pretraining"], schema=ConfigSchemaPretrain)
|
||||
|
||||
|
||||
def test_create_nlp_from_config_multiple_instances():
|
||||
"""Test that the nlp object is created correctly for a config with multiple
|
||||
instances of the same component."""
|
||||
|
|
|
@ -1,22 +1,16 @@
|
|||
from typing import Optional, Callable, Iterable, Union, List
|
||||
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
|
||||
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
|
||||
from thinc.api import set_dropout_rate
|
||||
from pathlib import Path
|
||||
from functools import partial
|
||||
from collections import Counter
|
||||
import srsly
|
||||
import numpy
|
||||
import time
|
||||
import re
|
||||
from wasabi import Printer
|
||||
|
||||
from .example import Example
|
||||
from ..tokens import Doc
|
||||
from ..attrs import ID
|
||||
from ..ml.models.multi_task import build_cloze_multi_task_model
|
||||
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
|
||||
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
|
||||
from ..errors import Errors
|
||||
from ..util import registry, load_model_from_config, dot_to_object
|
||||
|
||||
|
||||
|
@ -49,6 +43,7 @@ def pretrain(
|
|||
else:
|
||||
# Without '--resume-path' the '--epoch-resume' argument is ignored
|
||||
epoch_resume = 0
|
||||
objective = model.attrs["loss"]
|
||||
# TODO: move this to logger function?
|
||||
tracker = ProgressTracker(frequency=10000)
|
||||
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
|
||||
|
@ -69,7 +64,6 @@ def pretrain(
|
|||
with (output_dir / "log.jsonl").open("a") as file_:
|
||||
file_.write(srsly.json_dumps(log) + "\n")
|
||||
|
||||
objective = create_objective(P["objective"])
|
||||
# TODO: I think we probably want this to look more like the
|
||||
# 'create_train_batches' function?
|
||||
for epoch in range(epoch_resume, P["max_epochs"]):
|
||||
|
@ -132,58 +126,6 @@ def make_update(
|
|||
return float(loss)
|
||||
|
||||
|
||||
def create_objective(config: Config):
    """Create the loss function for pretraining from the objective config.

    Ideally this would be a registry function, but the objective also
    determines which model is built, so for now two types are hard-coded:
    "characters" (configured with n_characters) and "vectors" (configured
    with loss, either "cosine" or "L2").

    config (Config): The objective settings. "type" is always read, plus
        "n_characters" or "loss" depending on the type.
    RETURNS (Callable): get_characters_loss or get_vectors_loss with its
        extra argument bound via functools.partial.
    RAISES (ValueError): E906 for an unsupported loss, E907 for an
        unsupported objective type.
    """
    objective_type = config["type"]
    if objective_type == "characters":
        return partial(get_characters_loss, nr_char=config["n_characters"])
    if objective_type != "vectors":
        raise ValueError(Errors.E907.format(objective_type=objective_type))

    loss_type = config["loss"]
    if loss_type == "cosine":
        distance = CosineDistance(normalize=True, ignore_zeros=True)
    elif loss_type == "L2":
        # L2Distance is configured with `normalize` only; `ignore_zeros` is
        # a CosineDistance option and must not be passed here.
        distance = L2Distance(normalize=True)
    else:
        # NOTE(review): confirm the placeholder name expected by E906.
        raise ValueError(Errors.E906.format(loss_type=loss_type))
    return partial(get_vectors_loss, distance=distance)
|
||||
|
||||
|
||||
def get_vectors_loss(ops, docs, prediction, distance):
    """Score a prediction against the docs' vectors using a distance function.

    ops: Backend object; only its flatten method is used here.
    docs: The documents the prediction was made for. The vectors table is
        taken from the vocab of the first doc.
    prediction: The model output to compare against the target vectors.
    distance (Callable): Called as distance(prediction, target) and expected
        to return a (gradient, loss) pair.
    RETURNS (tuple): The loss followed by the gradient.
    """
    # Stacking token.vector for every token would be the obvious approach,
    # but it is wasteful, particularly on GPU. Collecting each token's index
    # into the vectors table and doing one lookup avoids copying data.
    ids_per_doc = [doc.to_array(ID).ravel() for doc in docs]
    row_ids = ops.flatten(ids_per_doc)
    vectors_table = docs[0].vocab.vectors.data
    target = vectors_table[row_ids]
    gradient, loss = distance(prediction, target)
    return loss, gradient
|
||||
|
||||
|
||||
def get_characters_loss(ops, docs, prediction, nr_char):
    """Score a prediction of nr_char characters per token against the docs.

    ops: Backend object; only its asarray method is used here.
    docs: The documents the prediction was made for.
    prediction: The model output; it is compared against a one-hot target
        reshaped to (-1, 256 * nr_char).
    nr_char (int): Number of characters requested from doc.to_utf8_array.
    RETURNS (tuple): The summed squared error, followed by the difference
        divided by the number of prediction rows.
    """
    n_classes = 256  # presumably one class per possible byte value
    utf8_rows = [doc.to_utf8_array(nr_char=nr_char) for doc in docs]
    flat_ids = numpy.vstack(utf8_rows).reshape((-1,))
    one_hot = ops.asarray(to_categorical(flat_ids, n_classes=n_classes), dtype="f")
    target = one_hot.reshape((-1, n_classes * nr_char))
    residual = prediction - target
    loss = (residual ** 2).sum()
    gradient = residual / float(prediction.shape[0])
    return loss, gradient
|
||||
|
||||
|
||||
def create_pretraining_model(nlp, pretrain_config):
|
||||
"""Define a network for the pretraining. We simply add an output layer onto
|
||||
the tok2vec input model. The tok2vec input model needs to be a model that
|
||||
|
@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
|
|||
The actual tok2vec layer is stored as a reference, and only this bit will be
|
||||
serialized to file and read back in when calling the 'train' command.
|
||||
"""
|
||||
nlp.initialize()
|
||||
component = nlp.get_pipe(pretrain_config["component"])
|
||||
if pretrain_config.get("layer"):
|
||||
tok2vec = component.model.get_ref(pretrain_config["layer"])
|
||||
else:
|
||||
tok2vec = component.model
|
||||
|
||||
# TODO
|
||||
maxout_pieces = 3
|
||||
hidden_size = 300
|
||||
if pretrain_config["objective"]["type"] == "vectors":
|
||||
model = build_cloze_multi_task_model(
|
||||
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
|
||||
)
|
||||
elif pretrain_config["objective"]["type"] == "characters":
|
||||
model = build_cloze_characters_multi_task_model(
|
||||
nlp.vocab,
|
||||
tok2vec,
|
||||
hidden_size=hidden_size,
|
||||
maxout_pieces=maxout_pieces,
|
||||
nr_char=pretrain_config["objective"]["n_characters"],
|
||||
)
|
||||
create_function = pretrain_config["objective"]
|
||||
model = create_function(nlp.vocab, tok2vec)
|
||||
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
|
||||
set_dropout_rate(model, pretrain_config["dropout"])
|
||||
return model
|
||||
|
|
Loading…
Reference in New Issue
Block a user