pretrain architectures (#6451)

* define new architectures for the pretraining objective

* add loss function as attr of the omdel

* cleanup

* cleanup

* shorten name

* fix typo

* remove unused error
This commit is contained in:
Sofie Van Landeghem 2020-12-08 07:41:03 +01:00 committed by GitHub
parent 29b058ebdc
commit f98a04434a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 158 additions and 95 deletions

View File

@ -17,7 +17,9 @@ tolerance = 0.2
get_length = null
[pretraining.objective]
type = "characters"
@architectures = "spacy.PretrainCharacters.v1"
maxout_pieces = 3
hidden_size = 300
n_characters = 4
[pretraining.optimizer]

View File

@ -484,8 +484,8 @@ class Errors:
"has been applied.")
E905 = ("Cannot initialize StaticVectors layer: nM dimension unset. This "
"dimension refers to the width of the vectors table.")
E906 = ("Unexpected `loss` value in pretraining objective: {loss_type}")
E907 = ("Unexpected `objective_type` value in pretraining objective: {objective_type}")
E906 = ("Unexpected `loss` value in pretraining objective: '{found}'. Supported values "
"are: {supported}")
E908 = ("Can't set `spaces` without `words` in `Doc.__init__`.")
E909 = ("Expected {name} in parser internals. This is likely a bug in spaCy.")
E910 = ("Encountered NaN value when computing loss for component '{name}'.")

View File

@ -1,4 +1,5 @@
from .entity_linker import * # noqa
from .multi_task import * # noqa
from .parser import * # noqa
from .tagger import * # noqa
from .textcat import * # noqa

View File

@ -1,7 +1,14 @@
from typing import Optional, Iterable, Tuple, List, TYPE_CHECKING
import numpy
from typing import Optional, Iterable, Tuple, List, Callable, TYPE_CHECKING
from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Model
from thinc.api import MultiSoftmax, list2array
from thinc.api import to_categorical, CosineDistance, L2Distance
from ...util import registry
from ...errors import Errors
from ...attrs import ID
import numpy
from functools import partial
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
@ -9,6 +16,74 @@ if TYPE_CHECKING:
from ...tokens import Doc # noqa: F401
@registry.architectures.register("spacy.PretrainVectors.v1")
def create_pretrain_vectors(
maxout_pieces: int, hidden_size: int, loss: str
) -> Callable[["Vocab", Model], Model]:
def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model:
model = build_cloze_multi_task_model(
vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
model.attrs["loss"] = create_vectors_loss()
return model
def create_vectors_loss() -> Callable:
if loss == "cosine":
distance = CosineDistance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
elif loss == "L2":
distance = L2Distance(normalize=True)
return partial(get_vectors_loss, distance=distance)
else:
raise ValueError(Errors.E906.format(found=loss, supported="'cosine', 'L2'"))
return create_vectors_objective
@registry.architectures.register("spacy.PretrainCharacters.v1")
def create_pretrain_characters(
maxout_pieces: int, hidden_size: int, n_characters: int
) -> Callable[["Vocab", Model], Model]:
def create_characters_objective(vocab: "Vocab", tok2vec: Model) -> Model:
model = build_cloze_characters_multi_task_model(
vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=n_characters,
)
model.attrs["loss"] = partial(get_characters_loss, nr_char=n_characters)
return model
return create_characters_objective
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
d_target, loss = distance(prediction, target)
return loss, d_target
def get_characters_loss(ops, docs, prediction, nr_char):
"""Compute a loss based on a number of characters predicted from the docs."""
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
def build_multi_task_model(
tok2vec: Model,
maxout_pieces: int,
@ -33,23 +108,19 @@ def build_multi_task_model(
def build_cloze_multi_task_model(
vocab: "Vocab",
tok2vec: Model,
maxout_pieces: int,
hidden_size: int,
nO: Optional[int] = None,
vocab: "Vocab", tok2vec: Model, maxout_pieces: int, hidden_size: int
) -> Model:
# nO = vocab.vectors.data.shape[1]
nO = vocab.vectors.data.shape[1]
output_layer = chain(
list2array(),
Maxout(
nO=nO,
nO=hidden_size,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
normalize=True,
dropout=0.0,
),
Linear(nO=nO, nI=nO, init_W=zero_init),
Linear(nO=nO, nI=hidden_size, init_W=zero_init),
)
model = chain(tok2vec, output_layer)
model = build_masked_language_model(vocab, model)

View File

@ -351,9 +351,7 @@ class ConfigSchemaPretrain(BaseModel):
batcher: Batcher = Field(..., title="Batcher for the training data")
component: str = Field(..., title="Component to find the layer to pretrain")
layer: str = Field(..., title="Layer to pretrain. Whole model if empty.")
# TODO: use a more detailed schema for this?
objective: Dict[str, Any] = Field(..., title="Pretraining objective")
objective: Callable[["Vocab", "Model"], "Model"] = Field(..., title="A function that creates the pretraining objective.")
# fmt: on
class Config:

View File

@ -3,15 +3,15 @@ from thinc.api import Config, ConfigValidationError
import spacy
from spacy.lang.en import English
from spacy.lang.de import German
from spacy.language import Language, DEFAULT_CONFIG
from spacy.util import registry, load_model_from_config
from spacy.language import Language, DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.util import registry, load_model_from_config, load_config
from spacy.ml.models import build_Tok2Vec_model, build_tb_parser_model
from spacy.ml.models import MultiHashEmbed, MaxoutWindowEncoder
from spacy.schemas import ConfigSchema
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from ..util import make_tempdir
nlp_config_string = """
[paths]
train = null
@ -63,6 +63,59 @@ factory = "tagger"
width = ${components.tok2vec.model.width}
"""
pretrain_config_string = """
[paths]
train = null
dev = null
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[training]
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v1"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
[pretraining]
"""
parser_config_string = """
[model]
@ -126,6 +179,14 @@ def test_create_nlp_from_config():
load_model_from_config(Config(bad_cfg), auto_fill=True)
def test_create_nlp_from_pretraining_config():
"""Test that the default pretraining config validates properly"""
config = Config().from_str(pretrain_config_string)
pretrain_config = load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
filled = config.merge(pretrain_config)
resolved = registry.resolve(filled["pretraining"], schema=ConfigSchemaPretrain)
def test_create_nlp_from_config_multiple_instances():
"""Test that the nlp object is created correctly for a config with multiple
instances of the same component."""

View File

@ -1,22 +1,16 @@
from typing import Optional, Callable, Iterable, Union, List
from thinc.api import Config, fix_random_seed, set_gpu_allocator, Model, Optimizer
from thinc.api import set_dropout_rate, to_categorical, CosineDistance, L2Distance
from thinc.api import set_dropout_rate
from pathlib import Path
from functools import partial
from collections import Counter
import srsly
import numpy
import time
import re
from wasabi import Printer
from .example import Example
from ..tokens import Doc
from ..attrs import ID
from ..ml.models.multi_task import build_cloze_multi_task_model
from ..ml.models.multi_task import build_cloze_characters_multi_task_model
from ..schemas import ConfigSchemaTraining, ConfigSchemaPretrain
from ..errors import Errors
from ..util import registry, load_model_from_config, dot_to_object
@ -49,6 +43,7 @@ def pretrain(
else:
# Without '--resume-path' the '--epoch-resume' argument is ignored
epoch_resume = 0
objective = model.attrs["loss"]
# TODO: move this to logger function?
tracker = ProgressTracker(frequency=10000)
msg.divider(f"Pre-training tok2vec layer - starting at epoch {epoch_resume}")
@ -69,7 +64,6 @@ def pretrain(
with (output_dir / "log.jsonl").open("a") as file_:
file_.write(srsly.json_dumps(log) + "\n")
objective = create_objective(P["objective"])
# TODO: I think we probably want this to look more like the
# 'create_train_batches' function?
for epoch in range(epoch_resume, P["max_epochs"]):
@ -132,58 +126,6 @@ def make_update(
return float(loss)
def create_objective(config: Config):
"""Create the objective for pretraining.
We'd like to replace this with a registry function but it's tricky because
we're also making a model choice based on this. For now we hard-code support
for two types (characters, vectors). For characters you can specify
n_characters, for vectors you can specify the loss.
Bleh.
"""
objective_type = config["type"]
if objective_type == "characters":
return partial(get_characters_loss, nr_char=config["n_characters"])
elif objective_type == "vectors":
if config["loss"] == "cosine":
distance = CosineDistance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
elif config["loss"] == "L2":
distance = L2Distance(normalize=True, ignore_zeros=True)
return partial(get_vectors_loss, distance=distance)
else:
raise ValueError(Errors.E906.format(loss_type=config["loss"]))
else:
raise ValueError(Errors.E907.format(objective_type=objective_type))
def get_vectors_loss(ops, docs, prediction, distance):
"""Compute a loss based on a distance between the documents' vectors and
the prediction.
"""
# The simplest way to implement this would be to vstack the
# token.vector values, but that's a bit inefficient, especially on GPU.
# Instead we fetch the index into the vectors table for each of our tokens,
# and look them up all at once. This prevents data copying.
ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
target = docs[0].vocab.vectors.data[ids]
d_target, loss = distance(prediction, target)
return loss, d_target
def get_characters_loss(ops, docs, prediction, nr_char):
"""Compute a loss based on a number of characters predicted from the docs."""
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, n_classes=256), dtype="f")
target = target.reshape((-1, 256 * nr_char))
diff = prediction - target
loss = (diff ** 2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
def create_pretraining_model(nlp, pretrain_config):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
@ -192,27 +134,15 @@ def create_pretraining_model(nlp, pretrain_config):
The actual tok2vec layer is stored as a reference, and only this bit will be
serialized to file and read back in when calling the 'train' command.
"""
nlp.initialize()
component = nlp.get_pipe(pretrain_config["component"])
if pretrain_config.get("layer"):
tok2vec = component.model.get_ref(pretrain_config["layer"])
else:
tok2vec = component.model
# TODO
maxout_pieces = 3
hidden_size = 300
if pretrain_config["objective"]["type"] == "vectors":
model = build_cloze_multi_task_model(
nlp.vocab, tok2vec, hidden_size=hidden_size, maxout_pieces=maxout_pieces
)
elif pretrain_config["objective"]["type"] == "characters":
model = build_cloze_characters_multi_task_model(
nlp.vocab,
tok2vec,
hidden_size=hidden_size,
maxout_pieces=maxout_pieces,
nr_char=pretrain_config["objective"]["n_characters"],
)
create_function = pretrain_config["objective"]
model = create_function(nlp.vocab, tok2vec)
model.initialize(X=[nlp.make_doc("Give it a doc to infer shapes")])
set_dropout_rate(model, pretrain_config["dropout"])
return model