Implement character-based pretraining objective

Matthew Honnibal 2019-10-19 11:42:38 +02:00
parent 36de9bf72a
commit ee56c6a4e1


@@ -13,12 +13,14 @@ from thinc.misc import LayerNorm as LN
from thinc.neural.util import prefer_gpu
from wasabi import Printer
import srsly
from thinc.neural.util import to_categorical
from ..errors import Errors
from ..tokens import Doc
from ..attrs import ID, HEAD
from .._ml import Tok2Vec, flatten, chain, create_default_optimizer
from .._ml import masked_language_model, get_cossim_loss
from .._ml import MultiSoftmax
from .. import util
from .train import _load_pretrained_tok2vec
@@ -121,11 +123,7 @@ def pretrain(
msg = Printer()
util.fix_random_seed(seed)
has_gpu = prefer_gpu()
if has_gpu:
import torch
torch.set_default_tensor_type("torch.cuda.FloatTensor")
has_gpu = prefer_gpu(gpu_id=1)
msg.info("Using GPU" if has_gpu else "Not using GPU")
output_dir = Path(output_dir)
@@ -167,6 +165,7 @@ def pretrain(
cnn_maxout_pieces=3, # You can try setting this higher
subword_features=not use_chars, # Set to False for Chinese etc
),
objective=loss_func
)
# Load in pretrained weights
if init_tok2vec is not None:
@@ -257,7 +256,10 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs, drop=drop)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
if objective == "characters":
loss, gradients = get_characters_loss(model.ops, docs, predictions)
else:
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
backprop(gradients, sgd=optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
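In practice the new objective is selected simply by passing objective="characters" through to make_update. A minimal usage sketch, assuming a surrounding pretraining loop (the `texts` variable, batch size, and length limits here are illustrative, not taken from this diff):

# Illustrative loop; `texts` and the sizes below are assumptions.
for batch in util.minibatch(texts, size=64):
    docs, skip_count = make_docs(nlp, batch, min_length=5, max_length=500)
    loss = make_update(model, docs, optimizer, objective="characters")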
@@ -295,6 +297,17 @@ def make_docs(nlp, batch, min_length, max_length):
return docs, skip_count
def get_characters_loss(ops, docs, prediction, nr_char=10):
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
target = ops.asarray(to_categorical(target_ids, nb_classes=256), dtype="f")
target = target.reshape((-1, 256*nr_char))
diff = prediction - target
loss = (diff**2).sum()
d_target = diff / float(prediction.shape[0])
return loss, d_target
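To make the shapes concrete: with N tokens in the batch and nr_char=10, doc.to_utf8_array() yields an (N, 10) array of UTF-8 byte ids, which is one-hot encoded over 256 byte values and flattened to an (N, 2560) target that must match the model's output width. A self-contained numpy sketch of the same computation, with random arrays standing in for the real targets and predictions:

import numpy

N, nr_char = 32, 10
target_ids = numpy.random.randint(0, 256, size=(N, nr_char))  # stand-in for doc.to_utf8_array(nr_char=10)
one_hot = numpy.eye(256, dtype="f")[target_ids.reshape(-1)]   # (N * nr_char, 256), like to_categorical
target = one_hot.reshape((-1, 256 * nr_char))                 # (N, 2560): one 256-wide block per character
prediction = numpy.random.rand(N, 256 * nr_char).astype("f")  # shape the output layer must produce
diff = prediction - target
loss = (diff ** 2).sum()                                      # squared error over all character slots
d_target = diff / float(prediction.shape[0])                  # gradient, scaled by the number of tokens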
def get_vectors_loss(ops, docs, prediction, objective="L2"):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@@ -319,16 +332,23 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"):
return loss, d_target
def create_pretraining_model(nlp, tok2vec):
def create_pretraining_model(nlp, tok2vec, objective="cosine", nr_char=10):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
"""
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
if objective == "characters":
out_sizes = [256] * nr_char
output_layer = chain(
LN(Maxout(300, pieces=3)),
MultiSoftmax(out_sizes, 300)
)
else:
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)), Affine(output_size, drop_factor=0.0)
)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we cann
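For the character objective, the output layer's width is 256 * nr_char: one independent 256-way distribution per character position. A rough sketch of what a MultiSoftmax-style normalisation over those slices looks like, assuming it softmaxes each field separately; the helper below is illustrative, not spaCy's implementation:

import numpy

def multi_softmax_like(scores, out_sizes):
    # Softmax each field of the output independently, mimicking the assumed
    # behaviour of MultiSoftmax([256] * nr_char, 300) as used above.
    pieces, start = [], 0
    for size in out_sizes:
        block = scores[:, start:start + size]
        block = numpy.exp(block - block.max(axis=1, keepdims=True))
        pieces.append(block / block.sum(axis=1, keepdims=True))
        start += size
    return numpy.concatenate(pieces, axis=1)

nr_char = 10
hidden = numpy.random.rand(4, 256 * nr_char)          # e.g. scores for 4 tokens
probs = multi_softmax_like(hidden, [256] * nr_char)   # each 256-wide slice sums to 1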