Experimental character-based pretraining (#5700)

* Use cosine loss in Cloze multitask
* Fix char_embed for gpu
* Call resume_training for base model in train CLI
* Fix bilstm_depth default in pretrain command
* Implement character-based pretraining objective
* Use chars loss in ClozeMultitask
* Add method to decode predicted characters
* Fix number characters
* Rescale gradients for mlm
* Fix char embed+vectors in ml
* Fix pipes
* Fix pretrain args
* Move get_characters_loss
* Fix import
* Fix import
* Mention characters loss option in pretrain
* Remove broken 'self attention' option in pretrain
* Revert "Remove broken 'self attention' option in pretrain"
  This reverts commit 56b820f6af.
* Document 'characters' objective of pretrain
2025-08-24 05:54:55 +03:00 · 2020-07-05 15:48:39 +02:00 · 2020-07-05 15:48:39 +02:00 · 3e78e82a83
commit 3e78e82a83
parent 86d13a9fb8
6 changed files with 92 additions and 33 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -14,7 +14,7 @@ from thinc.api import with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
-from thinc.neural.util import get_array_module, copy_array
+from thinc.neural.util import get_array_module, copy_array, to_categorical
 from thinc.neural.optimizers import Adam

 from thinc import describe
@ -840,6 +840,8 @@ def masked_language_model(vocab, model, mask_prob=0.15):

        def mlm_backward(d_output, sgd=None):
            d_output *= 1 - mask
+            # Rescale gradient for number of instances.
+            d_output *= mask.size - mask.sum()
            return backprop(d_output, sgd=sgd)

        return output, mlm_backward
@ -944,7 +946,7 @@ class CharacterEmbed(Model):
        # for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
-            doc_ids = doc.to_utf8_array(nr_char=self.nC)
+            doc_ids = self.ops.asarray(doc.to_utf8_array(nr_char=self.nC))
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Let's say I have a 2d array of indices, and a 3d table of data. What numpy
            # incantation do I chant to get
@ -986,3 +988,17 @@ def get_cossim_loss(yh, y, ignore_zeros=False):
        losses[zero_indices] = 0
    loss = losses.sum()
    return loss, -d_yh
+
+
+def get_characters_loss(ops, docs, prediction, nr_char=None):
+    """Compute the loss and gradient for the 'characters' pretraining objective.
+
+    The target for each token is built from doc.to_utf8_array(nr_char=nr_char):
+    the IDs are one-hot encoded over 256 classes and flattened into one row of
+    256 * nr_char values per token. The loss is the summed squared difference
+    between the prediction and that target.
+
+    ops: Backend ops object; the target is converted with ops.asarray.
+    docs (list): The Doc objects the predictions were computed for.
+    prediction: 2d array with one row per token and 256 * nr_char columns.
+    nr_char (int): Number of characters encoded per token. If None, it is
+        inferred from the width of the prediction, so models built with
+        different settings (pretrain uses 10, ClozeMultitask defaults to 4)
+        all get a target of matching width.
+    RETURNS (tuple): The summed loss, and the difference between prediction
+        and target divided by the number of prediction rows.
+    """
+    if nr_char is None:
+        nr_char, remainder = divmod(prediction.shape[1], 256)
+        if remainder or nr_char < 1:
+            raise ValueError(
+                "Can't infer nr_char: prediction width {} is not a positive "
+                "multiple of 256.".format(prediction.shape[1])
+            )
+    target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
+    target_ids = target_ids.reshape((-1,))
+    target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f")
+    target = target.reshape((-1, 256 * nr_char))
+    diff = prediction - target
+    loss = (diff ** 2).sum()
+    d_target = diff / float(prediction.shape[0])
+    return loss, d_target
+
+
+
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@ -18,7 +18,8 @@ from ..errors import Errors
 from ..tokens import Doc
 from ..attrs import ID, HEAD
 from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
-from .._ml import masked_language_model, get_cossim_loss
+from .._ml import masked_language_model, get_cossim_loss, get_characters_loss
+from .._ml import MultiSoftmax
 from .. import util
 from .train import _load_pretrained_tok2vec

@ -42,7 +43,7 @@ from .train import _load_pretrained_tok2vec
    bilstm_depth=("Depth of BiLSTM layers (requires PyTorch)", "option", "lstm", int),
    embed_rows=("Number of embedding rows", "option", "er", int),
    loss_func=(
-        "Loss function to use for the objective. Either 'L2' or 'cosine'",
+        "Loss function to use for the objective. Either 'characters', 'L2' or 'cosine'",
        "option",
        "L",
        str,
@ -85,11 +86,11 @@ def pretrain(
    output_dir,
    width=96,
    conv_depth=4,
-    bilstm_depth=0,
    cnn_pieces=3,
    sa_depth=0,
-    use_chars=False,
    cnn_window=1,
+    bilstm_depth=0,
+    use_chars=False,
    embed_rows=2000,
    loss_func="cosine",
    use_vectors=False,
@ -124,11 +125,7 @@ def pretrain(
            config[key] = str(config[key])
    util.fix_random_seed(seed)

-    has_gpu = prefer_gpu()
-    if has_gpu:
-        import torch
-
-        torch.set_default_tensor_type("torch.cuda.FloatTensor")
+    # Don't hard-code a device ID here: gpu_id=1 selects a second GPU, which
+    # doesn't exist on single-GPU machines. Use the library default instead.
+    has_gpu = prefer_gpu()
    msg.info("Using GPU" if has_gpu else "Not using GPU")

    output_dir = Path(output_dir)
@ -174,6 +171,7 @@ def pretrain(
            subword_features=not use_chars,  # Set to False for Chinese etc
            cnn_maxout_pieces=cnn_pieces,  # If set to 1, use Mish activation.
        ),
+        objective=loss_func
    )
    # Load in pretrained weights
    if init_tok2vec is not None:
@ -264,7 +262,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
    RETURNS loss: A float for the loss.
    """
    predictions, backprop = model.begin_update(docs, drop=drop)
-    loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
+    if objective == "characters":
+        loss, gradients = get_characters_loss(model.ops, docs, predictions)
+    else:
+        loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
    backprop(gradients, sgd=optimizer)
    # Don't want to return a cupy object here
    # The gradients are modified in-place by the BERT MLM,
@ -326,16 +327,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
    return loss, d_target


-def create_pretraining_model(nlp, tok2vec):
+def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
    """Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    """
-    output_size = nlp.vocab.vectors.data.shape[1]
-    output_layer = chain(
-        LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
-    )
+    if objective == "characters":
+        out_sizes = [256] * nr_char
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)),
+            MultiSoftmax(out_sizes, 300)
+        )
+    else:
+        output_size = nlp.vocab.vectors.data.shape[1]
+        output_layer = chain(
+            LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
+        )
    # This is annoying, but the parser etc have the flatten step after
    # the tok2vec. To load the weights in cleanly, we need to match
    # the shape of the models' components exactly. So what we cann
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -285,7 +285,7 @@ def train(

    if base_model and not pipes_added:
        # Start with an existing model, use default optimizer
-        optimizer = create_default_optimizer(Model.ops)
+        optimizer = nlp.resume_training(device=use_gpu)
    else:
        # Start with a blank model, call begin_training
        cfg = {"device": use_gpu}
--- a/spacy/ml/_legacy_tok2vec.py
+++ b/spacy/ml/_legacy_tok2vec.py
@ -49,6 +49,14 @@ def Tok2Vec(width, embed_size, **kwargs):
                    >> LN(Maxout(width, width * 5, pieces=3)),
                    column=cols.index(ORTH),
                )
+            elif char_embed:
+                embed = concatenate_lists(
+                    CharacterEmbed(nM=64, nC=8),
+                    FeatureExtracter(cols) >> with_flatten(glove),
+                )
+                reduce_dimensions = LN(
+                    Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+                )
            else:
                embed = uniqued(
                    (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
@ -81,7 +89,8 @@ def Tok2Vec(width, embed_size, **kwargs):
            )
        else:
            tok2vec = FeatureExtracter(cols) >> with_flatten(
-                embed >> convolution ** conv_depth, pad=conv_depth
+                embed
+                >> convolution ** conv_depth, pad=conv_depth
            )

        if bilstm_depth >= 1:
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -33,6 +33,7 @@ from .._ml import build_text_classifier, build_simple_cnn_text_classifier
 from .._ml import build_bow_text_classifier, build_nel_encoder
 from .._ml import link_vectors_to_models, zero_init, flatten
 from .._ml import masked_language_model, create_default_optimizer, get_cossim_loss
+from .._ml import MultiSoftmax, get_characters_loss
 from ..errors import Errors, TempErrors, Warnings
 from .. import util

@ -846,11 +847,15 @@ class MultitaskObjective(Tagger):
 class ClozeMultitask(Pipe):
    @classmethod
    def Model(cls, vocab, tok2vec, **cfg):
-        output_size = vocab.vectors.data.shape[1]
-        output_layer = chain(
-            LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
-            zero_init(Affine(output_size, output_size, drop_factor=0.0))
-        )
+        if cfg["objective"] == "characters":
+            out_sizes = [256] * cfg.get("nr_char", 4)
+            output_layer = MultiSoftmax(out_sizes)
+        else:
+            output_size = vocab.vectors.data.shape[1]
+            output_layer = chain(
+                LayerNorm(Maxout(output_size, tok2vec.nO, pieces=3)),
+                zero_init(Affine(output_size, output_size, drop_factor=0.0))
+            )
        model = chain(tok2vec, output_layer)
        model = masked_language_model(vocab, model)
        model.tok2vec = tok2vec
@ -861,6 +866,8 @@ class ClozeMultitask(Pipe):
        self.vocab = vocab
        self.model = model
        self.cfg = cfg
+        self.cfg.setdefault("objective", "characters")
+        self.cfg.setdefault("nr_char", 4)

    def set_annotations(self, docs, dep_ids, tensors=None):
        pass
@ -869,7 +876,8 @@ class ClozeMultitask(Pipe):
                        tok2vec=None, sgd=None, **kwargs):
        link_vectors_to_models(self.vocab)
        if self.model is True:
-            self.model = self.Model(self.vocab, tok2vec)
+            kwargs.update(self.cfg)
+            self.model = self.Model(self.vocab, tok2vec, **kwargs)
        X = self.model.ops.allocate((5, self.model.tok2vec.nO))
        self.model.output_layer.begin_training(X)
        if sgd is None:
@ -883,13 +891,16 @@ class ClozeMultitask(Pipe):
        return tokvecs, vectors

    def get_loss(self, docs, vectors, prediction):
-        # The simplest way to implement this would be to vstack the
-        # token.vector values, but that's a bit inefficient, especially on GPU.
-        # Instead we fetch the index into the vectors table for each of our tokens,
-        # and look them up all at once. This prevents data copying.
-        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
-        target = vectors[ids]
-        loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
+        """Compute the loss and gradient for a batch of predictions.
+
+        docs (list): The Doc objects the predictions were made for.
+        vectors: Vectors table, indexed by token ID. Only used when the
+            objective is not 'characters'.
+        prediction: The model output to score.
+        RETURNS (tuple): The loss as a float, and the gradient.
+        """
+        if self.cfg["objective"] == "characters":
+            # Pass the configured nr_char explicitly so the target width always
+            # matches this component's output layer, whatever the default of
+            # get_characters_loss happens to be.
+            nr_char = self.cfg.get("nr_char", 4)
+            loss, gradient = get_characters_loss(
+                self.model.ops, docs, prediction, nr_char=nr_char
+            )
+        else:
+            # The simplest way to implement this would be to vstack the
+            # token.vector values, but that's a bit inefficient, especially on GPU.
+            # Instead we fetch the index into the vectors table for each of our tokens,
+            # and look them up all at once. This prevents data copying.
+            ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
+            target = vectors[ids]
+            loss, gradient = get_cossim_loss(prediction, target, ignore_zeros=True)
        return float(loss), gradient

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
@ -906,6 +917,20 @@ class ClozeMultitask(Pipe):
        if losses is not None:
            losses[self.name] += loss

+    @staticmethod
+    def decode_utf8_predictions(char_array):
+        """Turn predicted character scores back into strings.
+
+        char_array: 2d array with one row per word, holding 256 scores for
+            each character slot.
+        RETURNS (list): One string per row. Even slots are read as characters
+            from the start of the word and odd slots as characters from the
+            end, in reverse order. Slots whose best-scoring ID is 255 are
+            skipped.
+        """
+        n_rows = char_array.shape[0]
+        # Best-scoring ID for every character slot: shape (n_rows, n_slots).
+        best_ids = char_array.reshape((n_rows, -1, 256)).argmax(axis=-1)
+        decoded = []
+        for slots in best_ids:
+            head = "".join(chr(i) for i in slots[::2] if i != 255)
+            # Odd slots were filled from the end of the word, so flip them.
+            tail = "".join(chr(i) for i in slots[1::2][::-1] if i != 255)
+            decoded.append(head + tail)
+        return decoded
+

@component("textcat", assigns=["doc.cats"])
 class TextCategorizer(Pipe):
@ -1069,6 +1094,7 @@ cdef class DependencyParser(Parser):
    assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
    requires = []
    TransitionSystem = ArcEager
+    nr_feature = 8

    @property
    def postprocesses(self):
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -473,7 +473,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--use-chars`, `-chr` <Tag variant="new">2.2.2</Tag>  | flag       | Whether to use character-based embedding.                                                                                                                                       |
 | `--sa-depth`, `-sa` <Tag variant="new">2.2.2</Tag>    | option     | Depth of self-attention layers.                                                                                                                                                 |
 | `--embed-rows`, `-er`                                 | option     | Number of embedding rows.                                                                                                                                                       |
-| `--loss-func`, `-L`                                   | option     | Loss function to use for the objective. Either `"L2"` or `"cosine"`.                                                                                                            |
+| `--loss-func`, `-L`                                   | option     | Loss function to use for the objective. Either `"cosine"`, `"L2"` or `"characters"`.                                                                                                            |
 | `--dropout`, `-d`                                     | option     | Dropout rate.                                                                                                                                                                   |
 | `--batch-size`, `-bs`                                 | option     | Number of words per training batch.                                                                                                                                             |
 | `--max-length`, `-xw`                                 | option     | Maximum words per example. Longer examples are discarded.                                                                                                                       |