* Experiment with Zipfian corruption of feature values when calculating predictions

2025-08-09 14:44:52 +03:00 · 2015-05-26 22:17:15 +02:00 · 2015-05-26 22:17:15 +02:00 · 7fc24821bc
commit 7fc24821bc
parent 32ae2cdabe
2 changed files with 12 additions and 80 deletions
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@ -3,7 +3,7 @@ from libc.stdint cimport uint8_t
 from cymem.cymem cimport Pool

 from thinc.learner cimport LinearModel
-from thinc.features cimport Extractor
+from thinc.features cimport Extractor, Feature
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t

 from preshed.maps cimport PreshMapArray
@ -17,6 +17,8 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil

 cdef class Model:
    cdef int n_classes
+    
+    cdef int regularize(self, Feature* feats, int n, int a=*) except -1

    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1

@ -24,21 +26,10 @@ cdef class Model:
    cdef Extractor _extractor
    cdef LinearModel _model

-    cdef inline const weight_t* score(self, atom_t* context) except NULL:
+    cdef inline const weight_t* score(self, atom_t* context, bint regularize) except NULL:
        cdef int n_feats
        feats = self._extractor.get_feats(context, &n_feats)
+        if regularize:
+            self.regularize(feats, n_feats, 3)
        return self._model.get_scores(feats, n_feats)

-
-cdef class HastyModel:
-    cdef Pool mem
-    cdef weight_t* _scores
-
-    cdef const weight_t* score(self, atom_t* context) except NULL
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
-
-    cdef int n_classes
-    cdef Model _hasty
-    cdef Model _full
-    cdef readonly int hasty_cnt
-    cdef readonly int full_cnt
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@ -4,9 +4,9 @@ from __future__ import division
 from os import path
 import os
 import shutil
-import random
 import json
 import cython
+import numpy.random

 from thinc.features cimport Feature, count_feats

@ -44,70 +44,11 @@ cdef class Model:
            count_feats(counts[guess], feats, n_feats, -cost)
            self._model.update(counts)

+    cdef int regularize(self, Feature* feats, int n, int a=3) except -1:
+        zipfs = numpy.random.zipf(a, n)
+        for i in range(n):
+            feats[i].value *= 1.0 / zipfs[i]
+
    def end_training(self):
        self._model.end_training()
        self._model.dump(self.model_loc, freq_thresh=0)
-
-
-cdef class HastyModel:
-    def __init__(self, n_classes, hasty_templates, full_templates, model_dir):
-        full_templates = tuple([t for t in full_templates if t not in hasty_templates])
-        self.mem = Pool()
-        self.n_classes = n_classes
-        self._scores = <weight_t*>self.mem.alloc(self.n_classes, sizeof(weight_t))
-        assert path.exists(model_dir)
-        assert path.isdir(model_dir)
-        self._hasty = Model(n_classes, hasty_templates, path.join(model_dir, 'hasty_model'))
-        self._full = Model(n_classes, full_templates, path.join(model_dir, 'full_model'))
-        self.hasty_cnt = 0
-        self.full_cnt = 0
-
-    cdef const weight_t* score(self, atom_t* context) except NULL:
-        cdef int i
-        hasty_scores = self._hasty.score(context)
-        if will_use_hasty(hasty_scores, self._hasty.n_classes):
-            self.hasty_cnt += 1
-            return hasty_scores
-        else:
-            self.full_cnt += 1
-            full_scores = self._full.score(context)
-            for i in range(self.n_classes):
-                self._scores[i] = full_scores[i] + hasty_scores[i]
-            return self._scores
-
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
-        self._hasty.update(context, guess, gold, cost)
-        self._full.update(context, guess, gold, cost)
-
-    def end_training(self):
-        self._hasty.end_training()
-        self._full.end_training()
-
-
-@cython.cdivision(True)
-cdef bint will_use_hasty(const weight_t* scores, int n_classes) nogil:
-    cdef:
-        weight_t best_score, second_score
-        int best, second
-
-    if scores[0] >= scores[1]:
-        best = 0
-        best_score = scores[0]
-        second = 1
-        second_score = scores[1]
-    else:
-        best = 1
-        best_score = scores[1]
-        second = 0
-        second_score = scores[0]
-    cdef int i
-    for i in range(2, n_classes):
-        if scores[i] > best_score:
-            second_score = best_score
-            second = best
-            best = i
-            best_score = scores[i]
-        elif scores[i] > second_score:
-            second_score = scores[i]
-            second = i
-    return best_score > 0 and second_score < (best_score / 2)