Remove unused code from spacy pretrain

Matthew Honnibal 2018-12-18 19:19:26 +01:00
parent 5f0c5fbfa4
commit 0f83b98afa


@@ -100,11 +100,11 @@ def pretrain(
pretrained_vectors=pretrained_vectors,
bilstm_depth=0, # Requires PyTorch. Experimental.
cnn_maxout_pieces=3, # You can try setting this higher
subword_features=True,
subword_features=True, # Set to False for Chinese etc
),
) # Set to False for character models, e.g. Chinese
)
optimizer = create_default_optimizer(model.ops)
tracker = ProgressTracker()
tracker = ProgressTracker(frequency=10000)
msg.divider("Pre-training tok2vec layer")
row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")}
msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings)
@@ -136,7 +136,7 @@ def pretrain(
random.shuffle(texts)
def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
def make_update(model, docs, optimizer, drop=0.0, objective='L2'):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
@@ -145,13 +145,12 @@ def make_update(model, docs, optimizer, drop=0.0, objective='cosine'):
RETURNS loss: A float for the loss.
"""
predictions, backprop = model.begin_update(docs, drop=drop)
gradients = get_vectors_loss(model.ops, docs, predictions, objective)
loss, gradients = get_vectors_loss(model.ops, docs, predictions, objective)
backprop(gradients, sgd=optimizer)
# Don't want to return a cupy object here
# The gradients are modified in-place by the BERT MLM,
# so we get an accurate loss
loss = float((gradients ** 2).sum())
return loss
return float(loss)
def make_docs(nlp, batch, min_length=1, max_length=500):
@@ -172,7 +171,7 @@ def make_docs(nlp, batch, min_length=1, max_length=500):
return docs
def get_vectors_loss(ops, docs, prediction, objective):
def get_vectors_loss(ops, docs, prediction, objective='L2'):
"""Compute a mean-squared error loss between the documents' vectors and
the prediction.
@@ -188,80 +187,23 @@ def get_vectors_loss(ops, docs, prediction, objective):
target = docs[0].vocab.vectors.data[ids]
if objective == 'L2':
d_scores = prediction - target
elif objective == 'nllvmf':
d_scores = get_nllvmf_loss(prediction, target)
loss = (d_scores**2).sum()
else:
d_scores = get_cossim_loss(prediction, target)
return d_scores
raise NotImplementedError(objective)
return loss, d_scores
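As an aside, the surviving branch returns both the summed squared error and its gradient with respect to the prediction. A standalone numpy sketch of that contract (array shapes and names here are illustrative, not taken from the commit):

import numpy

def l2_objective(prediction, target):
    # Gradient of 0.5 * ||prediction - target||**2 with respect to the prediction
    d_scores = prediction - target
    # Scalar loss reported back through make_update
    loss = (d_scores ** 2).sum()
    return loss, d_scores

prediction = numpy.random.randn(4, 300).astype("f")
target = numpy.random.randn(4, 300).astype("f")
loss, d_scores = l2_objective(prediction, target)
assert d_scores.shape == prediction.shape and float(loss) >= 0.0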
def get_cossim_loss(yh, y):
# Add a small constant to avoid 0 vectors
yh = yh + 1e-8
y = y + 1e-8
# https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity
xp = get_array_module(yh)
norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True)
norm_y = xp.linalg.norm(y, axis=1, keepdims=True)
mul_norms = norm_yh * norm_y
cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms
d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2))
return d_yh
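The deleted get_cossim_loss computed the analytic gradient of cosine similarity with respect to the prediction, following the derivation it links to. A standalone sketch (illustrative, not part of the diff) that checks the same formula against a central-difference estimate for one row:

import numpy

def cosine(yh, y):
    return (yh * y).sum() / (numpy.linalg.norm(yh) * numpy.linalg.norm(y))

def cosine_grad(yh, y):
    # d(cosine)/d(yh) = y / (|yh| |y|) - cosine * yh / |yh|**2
    norm_yh = numpy.linalg.norm(yh)
    norm_y = numpy.linalg.norm(y)
    cos = (yh * y).sum() / (norm_yh * norm_y)
    return y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2

yh = numpy.random.randn(300)
y = numpy.random.randn(300)
eps = 1e-6
numeric = numpy.array([
    (cosine(yh + eps * e, y) - cosine(yh - eps * e, y)) / (2 * eps)
    for e in numpy.eye(yh.shape[0])
])
assert numpy.allclose(numeric, cosine_grad(yh, y), atol=1e-5)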
def get_nllvmf_loss(Yh, Y):
"""Compute the gradient of the negative log likelihood von Mises-Fisher loss,
from Kumar and Tsvetkov.
Yh: Predicted vectors.
Y: True vectors
Returns dYh: Gradient of loss with respect to prediction.
"""
# Warning: Probably wrong? Also needs normalization
xp = get_array_module(Yh)
assert not xp.isnan(Yh).any()
assert not xp.isnan(Y).any()
return _backprop_bessel(Yh) * Y
def _backprop_bessel(k, approximate=True):
if approximate:
return -_ratio(k.shape[1]/2, k)
from scipy.special import ive
xp = get_array_module(k)
if not isinstance(k, numpy.ndarray):
k = k.get()
k = numpy.asarray(k, dtype='float64')
assert not numpy.isnan(k).any()
m = k.shape[1]
numerator = ive(m/2, k)
assert not numpy.isnan(numerator).any()
denom = ive(m/2-1, k)
assert not numpy.isnan(denom).any()
x = -(numerator / (denom+1e-8))
assert not numpy.isnan(x).any()
return xp.array(x, dtype='f')
def _ratio(v, z):
return z/(v-1+numpy.sqrt((v+1)**2 + z**2, dtype='f'))
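The deleted helpers estimate the ratio of modified Bessel functions I_v(z) / I_(v-1)(z): _backprop_bessel computes it exactly with scipy's exponentially scaled ive (pulling the array back from the GPU to do so), while _ratio uses a closed-form approximation. A standalone comparison of the two (the order and the range of z are chosen purely for illustration):

import numpy
from scipy.special import ive

def ratio_approx(v, z):
    # Closed-form approximation to I_v(z) / I_(v-1)(z), as in the removed _ratio
    return z / (v - 1 + numpy.sqrt((v + 1) ** 2 + z ** 2))

v = 10.0                              # e.g. half the vector width, m / 2
z = numpy.linspace(1.0, 50.0, 8)
exact = ive(v, z) / ive(v - 1, z)     # the exp(-z) scaling cancels in the ratio
assert numpy.allclose(ratio_approx(v, z), exact, rtol=0.02)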
def create_pretraining_model(nlp, tok2vec, normalized=False):
def create_pretraining_model(nlp, tok2vec):
"""Define a network for the pretraining. We simply add an output layer onto
the tok2vec input model. The tok2vec input model needs to be a model that
takes a batch of Doc objects (as a list), and returns a list of arrays.
Each array in the output needs to have one row per token in the doc.
"""
if normalized:
normalize_vectors(nlp.vocab.vectors.data)
output_size = nlp.vocab.vectors.data.shape[1]
output_layer = chain(
LN(Maxout(300, pieces=3)),
Affine(output_size, drop_factor=0.0),
)
if normalized:
output_layer = chain(output_layer, normalize)
# This is annoying, but the parser etc have the flatten step after
# the tok2vec. To load the weights in cleanly, we need to match
# the shape of the models' components exactly. So what we cann
@@ -275,28 +217,6 @@ def create_pretraining_model(nlp, tok2vec, normalized=False):
return model
@layerize
def normalize(X, drop=0.):
xp = get_array_module(X)
norms = xp.sqrt((X**2).sum(axis=1, keepdims=True)+1e-8)
Y = X / norms
def backprop_normalize(dY, sgd=None):
d_norms = 2 * norms
#dY = (dX * norms - X * d_norms) / norms**2
#dY * norms**2 = dX * norms - X * d_norms
#dY * norms**2 + X * d_norms = dX * norms
#(dY * norms**2 + X * d_norms) / norms = dX
dX = (dY * norms**2 + X * d_norms) / norms
return dX
return Y, backprop_normalize
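The removed normalize layer maps each row to unit length; its backprop comments derive dX by rearranging the forward differential. For reference, a standalone sketch (not taken from the diff) of the chain-rule gradient for Y = X / ||X||, verified with finite differences:

import numpy

def normalize_fwd(X):
    norms = numpy.sqrt((X ** 2).sum(axis=1, keepdims=True) + 1e-8)
    return X / norms, norms

def normalize_bwd(dY, X, norms):
    # dL/dX = dY / norms - X * (dY . X) / norms**3, computed row-wise
    return dY / norms - X * (dY * X).sum(axis=1, keepdims=True) / norms ** 3

X = numpy.random.randn(2, 5)
dY = numpy.random.randn(2, 5)
Y, norms = normalize_fwd(X)
dX = normalize_bwd(dY, X, norms)

# Central-difference check of the analytic gradient
eps = 1e-6
numeric = numpy.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Xp = X.copy()
        Xp[i, j] += eps
        Xm = X.copy()
        Xm[i, j] -= eps
        numeric[i, j] = ((normalize_fwd(Xp)[0] - normalize_fwd(Xm)[0]) * dY).sum() / (2 * eps)
assert numpy.allclose(numeric, dX, atol=1e-6)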
def normalize_vectors(vectors_data):
xp = get_array_module(vectors_data)
norms = xp.sqrt((vectors_data**2).sum(axis=1, keepdims=True)+1e-8)
vectors_data /= norms
class ProgressTracker(object):
def __init__(self, frequency=1000000):
self.loss = 0.0