pretrain architectures (#6451)

* define new architectures for the pretraining objective * add loss function as attr of the model * cleanup * cleanup * shorten name * fix typo * remove unused error
2025-08-21 20:44:56 +03:00 · 2020-12-08 07:41:03 +01:00 · 2020-12-08 07:41:03 +01:00 · f98a04434a
commit f98a04434a
parent 29b058ebdc
7 changed files with 158 additions and 95 deletions
--- a/spacy/default_config_pretraining.cfg
+++ b/spacy/default_config_pretraining.cfg
@@ -17,7 +17,9 @@ tolerance = 0.2
 get_length = null

 [pretraining.objective]
-type = "characters"
+@architectures = "spacy.PretrainCharacters.v1"
+maxout_pieces = 3
+hidden_size = 300
 n_characters = 4

 [pretraining.optimizer]
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -484,8 +484,8 @@ class Errors:
            "has been applied.")
    E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
            "dimension refers to the width of the vectors table.")
-    E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
-    E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
+    E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
+            "are: {supported}")
    E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
    E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
    E910 = ("Encountered NaN value when computing loss for component '{name}'.")
--- a/spacy/ml/models/__init__.py
+++ b/spacy/ml/models/__init__.py
@@ -1,4 +1,5 @@
 from .entity_linker import *  # noqa
+from .multi_task import *  # noqa
 from .parser import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -1,7 +1,14 @@
-from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
-import numpy
+from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
 from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
 from thinc.api import MultiSoftmax, list2array
+from thinc.api import to_categorical, CosineDistance, L2Distance
+
+from ...util import registry
+from ...errors import Errors
+from ...attrs import ID
+
+import numpy
+from functools import partial

 if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
@@ -9,6 +16,74 @@ if TYPE_CHECKING:
    from ...tokens import Doc  # noqa: F401


+@registry.architectures.register("spacy.PretrainVectors.v1")
+def create_pretrain_vectors(
+    maxout_pieces: int, hidden_size: int, loss: str
+) -> Callable[["Vocab", Model], Model]:
+    def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_multi_task_model(
+            vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
+        )
+        model.attrs["loss"] = create_vectors_loss()
+        return model
+
+    def create_vectors_loss() -> Callable:
+        if loss == "cosine":
+            distance = CosineDistance(normalize=True, ignore_zeros=True)
+            return partial(get_vectors_loss, distance=distance)
+        elif loss == "L2":
+            distance = L2Distance(normalize=True)
+            return partial(get_vectors_loss, distance=distance)
+        else:
+            raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
+
+    return create_vectors_objective
+
+
+@registry.architectures.register("spacy.PretrainCharacters.v1")
+def create_pretrain_characters(
+    maxout_pieces: int, hidden_size: int, n_characters: int
+) -> Callable[["Vocab", Model], Model]:
+    def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
+        model = build_cloze_characters_multi_task_model(
+            vocab,
+            tok2vec,
+            hidden_size=hidden_size,
+            maxout_pieces=maxout_pieces,
+            nr_char=n_characters,
+        )
+        model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
+        return model
+
+    return create_characters_objective
+
+
+def get_vectors_loss(ops, docs, prediction, distance):
+    """Compute a loss based on a distance between the documents' vectors and
+    the prediction.
+    """
+    # The simplest way to implement this would be to vstack the
+    # token.vector values, but that's a bit inefficient, especially on GPU.
+    # Instead we fetch the index into the vectors table for each of our tokens,
+    # and look them up all at once. This prevents data copying.
+    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+    target = docs[0].vocab.vectors.data[ids]
+    d_target, loss = distance(prediction, target)
+    return loss, d_target
+
+
+def get_characters_loss(ops, docs, prediction, nr_char):
+    """Compute a loss based on a number of characters predicted from the docs."""
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
 def build_multi_task_model(
    tok2vec: Model,
    maxout_pieces: int,
@@ -33,23 +108,19 @@ def build_multi_task_model(


 def build_cloze_multi_task_model(
-    vocab: "Vocab",
-    tok2vec: Model,
-    maxout_pieces: int,
-    hidden_size: int,
-    nO: Optional[int] = None,
+    vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
 ) -> Model:
-    # nO = vocab.vectors.data.shape[1]
+    nO = vocab.vectors.data.shape[1]
    output_layer = chain(
        list2array(),
        Maxout(
-            nO=nO,
+            nO=hidden_size,
            nI=tok2vec.get_dim("nO"),
            nP=maxout_pieces,
            normalize=True,
            dropout=0.0,
        ),
-        Linear(nO=nO, nI=nO, init_W=zero_init),
+        Linear(nO=nO, nI=hidden_size, init_W=zero_init),
    )
    model = chain(tok2vec, output_layer)
    model = build_masked_language_model(vocab, model)
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
    batcher: Batcher = Field(..., title="Batcher for the training data")
    component: str = Field(..., title="Component to find the layer to pretrain")
    layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
-
-    # TODO: use a more detailed schema for this?
-    objective: Dict[str, Any] = Field(..., title="Pretraining objective")
+    objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
    # fmt: on

    class Config:
--- a/spacy/tests/serialize/test_serialize_config.py
+++ b/spacy/tests/serialize/test_serialize_config.py
@@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
 import spacy
 from spacy.lang.en import English
 from spacy.lang.de import German
-from spacy.language import Language, DEFAULT_CONFIG
-from spacy.util import registry, load_model_from_config
+from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
+from spacy.util import registry, load_model_from_config, load_config
 from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
 from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
-from spacy.schemas import ConfigSchema
+from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
+

 from ..util import make_tempdir

-
 nlp_config_string = """
 [paths]
 train = null
@@ -63,6 +63,59 @@ factory = "tagger"
 width = ${components.tok2vec.model.width}
 """

+pretrain_config_string = """
+[paths]
+train = null
+dev = null
+
+[corpora]
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+
+[training]
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+size = 666
+
+[nlp]
+lang = "en"
+pipeline = ["tok2vec", "tagger"]
+
+[components]
+
+[components.tok2vec]
+factory = "tok2vec"
+
+[components.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = null
+width = 342
+depth = 4
+window_size = 1
+embed_size = 2000
+maxout_pieces = 3
+subword_features = true
+
+[components.tagger]
+factory = "tagger"
+
+[components.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[components.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.width}
+
+[pretraining]
+"""
+

 parser_config_string = """
 [model]
@@ -126,6 +179,14 @@ def test_create_nlp_from_config():
        load_model_from_config(Config(bad_cfg), auto_fill=True)


+def test_create_nlp_from_pretraining_config():
+    """Test that the default pretraining config validates properly"""
+    config = Config().from_str(pretrain_config_string)
+    pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
+    filled = config.merge(pretrain_config)
+    resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
+
+
 def test_create_nlp_from_config_multiple_instances():
    """Test that the nlp object is created correctly for a config with multiple
    instances of the same component."""
--- a/spacy/training/pretrain.py
+++ b/spacy/training/pretrain.py
@@ -1,22 +1,16 @@
 from typing import Optional, Callable, Iterable, Union, List
 from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
-from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
+from thinc.api import set_dropout_rate
 from pathlib import Path
-from functools import partial
 from collections import Counter
 import srsly
-import numpy
 import time
 import re
 from wasabi import Printer

 from .example import Example
 from ..tokens import Doc
-from ..attrs import ID
-from ..ml.models.multi_task import build_cloze_multi_task_model
-from ..ml.models.multi_task import build_cloze_characters_multi_task_model
 from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
-from ..errors import Errors
 from ..util import registry, load_model_from_config, dot_to_object


@@ -49,6 +43,7 @@ def pretrain(
    else:
        # Without '--resume-path' the '--epoch-resume' argument is ignored
        epoch_resume = 0
+    objective = model.attrs["loss"]
    # TODO: move this to logger function?
    tracker = ProgressTracker(frequency=10000)
    msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@@ -69,7 +64,6 @@ def pretrain(
            with (output_dir / "log.jsonl").open("a") as file_:
                file_.write(srsly.json_dumps(log) + "\n")

-    objective = create_objective(P["objective"])
    # TODO: I think we probably want this to look more like the
    # 'create_train_batches' function?
    for epoch in range(epoch_resume, P["max_epochs"]):
@@ -132,58 +126,6 @@ def make_update(
    return float(loss)


-def create_objective(config: Config):
-    """Create the objective for pretraining.
-
-    We'd like to replace this with a registry function but it's tricky because
-    we're also making a model choice based on this. For now we hard-code support
-    for two types (characters, vectors). For characters you can specify
-    n_characters, for vectors you can specify the loss.
-
-    Bleh.
-    """
-    objective_type = config["type"]
-    if objective_type == "characters":
-        return partial(get_characters_loss, nr_char=config["n_characters"])
-    elif objective_type == "vectors":
-        if config["loss"] == "cosine":
-            distance = CosineDistance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        elif config["loss"] == "L2":
-            distance = L2Distance(normalize=True, ignore_zeros=True)
-            return partial(get_vectors_loss, distance=distance)
-        else:
-            raise ValueError(Errors.E906.format(loss_type=config["loss"]))
-    else:
-        raise ValueError(Errors.E907.format(objective_type=objective_type))
-
-
-def get_vectors_loss(ops, docs, prediction, distance):
-    """Compute a loss based on a distance between the documents' vectors and
-    the prediction.
-    """
-    # The simplest way to implement this would be to vstack the
-    # token.vector values, but that's a bit inefficient, especially on GPU.
-    # Instead we fetch the index into the vectors table for each of our tokens,
-    # and look them up all at once. This prevents data copying.
-    ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-    target = docs[0].vocab.vectors.data[ids]
-    d_target, loss = distance(prediction, target)
-    return loss, d_target
-
-
-def get_characters_loss(ops, docs, prediction, nr_char):
-    """Compute a loss based on a number of characters predicted from the docs."""
-    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
-    target_ids = target_ids.reshape((-1,))
-    target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
-    target = target.reshape((-1, 256 * nr_char))
-    diff = prediction - target
-    loss = (diff ** 2).sum()
-    d_target = diff / float(prediction.shape[0])
-    return loss, d_target
-
-
 def create_pretraining_model(nlp, pretrain_config):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
@@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    """
+    nlp.initialize()
    component = nlp.get_pipe(pretrain_config["component"])
    if pretrain_config.get("layer"):
        tok2vec = component.model.get_ref(pretrain_config["layer"])
    else:
        tok2vec = component.model

-    # TODO
-    maxout_pieces = 3
-    hidden_size = 300
-    if pretrain_config["objective"]["type"] == "vectors":
-        model = build_cloze_multi_task_model(
-            nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
-        )
-    elif pretrain_config["objective"]["type"] == "characters":
-        model = build_cloze_characters_multi_task_model(
-            nlp.vocab,
-            tok2vec,
-            hidden_size=hidden_size,
-            maxout_pieces=maxout_pieces,
-            nr_char=pretrain_config["objective"]["n_characters"],
-        )
+    create_function = pretrain_config["objective"]
+    model = create_function(nlp.vocab, tok2vec)
    model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
    set_dropout_rate(model, pretrain_config["dropout"])
    return model