diff --git a/spacy/_ml.pxd b/spacy/_ml.pxd
index fdf7b359c..8a5d35573 100644
--- a/spacy/_ml.pxd
+++ b/spacy/_ml.pxd
@@ -12,10 +12,13 @@ cdef class LinearModel:
     cdef readonly int nr_class
     cdef readonly uint32_t nr_weight
     cdef public weight_t learn_rate
+    cdef public weight_t momentum
     cdef Pool mem
+    cdef weight_t time
     cdef weight_t* W
-    cdef weight_t* d_W
-    cdef vector[uint64_t]* _indices
+    cdef weight_t* mom
+    cdef weight_t* averages
+    cdef weight_t* last_upd
 
     cdef void hinge_lossC(self, weight_t* d_scores,
             const weight_t* scores, const weight_t* costs) nogil
diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx
index bec5b0cbc..582ea3624 100644
--- a/spacy/_ml.pyx
+++ b/spacy/_ml.pyx
@@ -20,21 +20,23 @@ from thinc.neural.ops import NumpyOps
 
 
 cdef class LinearModel:
-    def __init__(self, int nr_class, templates, weight_t learn_rate=0.001,
-            size=2**18):
+    def __init__(self, int nr_class, templates,
+            weight_t momentum=0.9, weight_t learn_rate=0.001, size=2**18):
         self.extracter = ConjunctionExtracter(templates)
         self.nr_weight = size
         self.nr_class = nr_class
         self.learn_rate = learn_rate
+        self.momentum = momentum
         self.mem = Pool()
+        self.time = 0
         self.W = self.mem.alloc(self.nr_weight * self.nr_class,
                                 sizeof(weight_t))
-        self.d_W = self.mem.alloc(self.nr_weight * self.nr_class,
+        self.mom = self.mem.alloc(self.nr_weight * self.nr_class,
+                                  sizeof(weight_t))
+        self.averages = self.mem.alloc(self.nr_weight * self.nr_class,
+                                       sizeof(weight_t))
+        self.last_upd = self.mem.alloc(self.nr_weight * self.nr_class,
                                 sizeof(weight_t))
-        self._indices = new vector[uint64_t]()
-
-    def __dealloc__(self):
-        del self._indices
 
     cdef void hinge_lossC(self, weight_t* d_scores,
             const weight_t* scores, const weight_t* costs) nogil:
@@ -97,8 +99,8 @@ cdef class LinearModel:
 
     cdef void set_scoresC(self, weight_t* scores,
             const FeatureC* features, int nr_feat) nogil:
-        cdef uint64_t nr_weight = self.nr_weight
         cdef int nr_class = self.nr_class
+        cdef uint64_t nr_weight = self.nr_weight * nr_class - nr_class
         cdef vector[uint64_t] indices
         # Collect all feature indices
         cdef uint32_t[2] hashed
@@ -114,16 +116,23 @@ cdef class LinearModel:
         # Sort them, to improve memory access pattern
         libcpp.algorithm.sort(indices.begin(), indices.end())
         for idx in indices:
-            W = &self.W[idx * nr_class]
+            W = &self.W[idx]
             for clas in range(nr_class):
                 scores[clas] += W[clas]
 
     cdef void set_gradientC(self, const weight_t* d_scores,
             const FeatureC* features, int nr_feat) nogil:
-        cdef uint64_t nr_weight = self.nr_weight
+        self.time += 1
         cdef int nr_class = self.nr_class
+        cdef weight_t abs_grad = 0
+        for i in range(nr_class):
+            abs_grad += d_scores[i] if d_scores[i] > 0 else -d_scores[i]
+        if abs_grad < 0.1:
+            return
+        cdef uint64_t nr_weight = self.nr_weight * nr_class - nr_class
         cdef vector[uint64_t] indices
         # Collect all feature indices
+        indices.reserve(nr_feat * 2)
         cdef uint32_t[2] hashed
         cdef uint64_t hash2
         for feat in features[:nr_feat]:
@@ -136,19 +145,24 @@ cdef class LinearModel:
         # Sort them, to improve memory access pattern
         libcpp.algorithm.sort(indices.begin(), indices.end())
         for idx in indices:
-            d_W = &self.d_W[idx * nr_class]
-            for clas in range(nr_class):
-                if d_scores[clas] < 0:
-                    d_W[clas] += max(-10., d_scores[clas])
-                else:
-                    d_W[clas] += min(10., d_scores[clas])
+            #avg = &self.averages[idx]
+            #last_upd = &self.last_upd[idx]
+            W = &self.W[idx]
+            #mom = &self.mom[idx]
+            for i in range(nr_class):
+                if d_scores[i] == 0:
+                    continue
+                d = d_scores[i]
+                W[i] -= self.learn_rate * d
+                #unchanged = self.time - last_upd[i]
+                #avg[i] += unchanged * W[i]
+                #mom[i] *= self.momentum ** unchanged
+                #mom[i] += self.learn_rate * d
+                #W[i] -= mom[i]
+                #last_upd[i] = self.time
 
     def finish_update(self, optimizer):
-        cdef np.npy_intp[1] shape
-        shape[0] = self.nr_weight * self.nr_class
-        W_arr = np.PyArray_SimpleNewFromData(1, shape, np.NPY_FLOAT, self.W)
-        dW_arr = np.PyArray_SimpleNewFromData(1, shape, np.NPY_FLOAT, self.d_W)
-        optimizer(W_arr, dW_arr, key=1)
+        pass
 
     @property
     def nr_active_feat(self):
@@ -159,7 +173,13 @@ cdef class LinearModel:
         return self.extracter.nr_templ
 
     def end_training(self, *args, **kwargs):
-        pass
+        # Average weights
+        for i in range(self.nr_weight * self.nr_class):
+            unchanged = self.time - self.last_upd[i]
+            self.averages[i] += self.W[i] * unchanged
+            self.W[i], self.averages[i] = self.averages[i], self.W[i]
+            self.W[i] /= self.time
+            self.last_upd[i] = self.time
 
     def dump(self, *args, **kwargs):
         pass
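Note on the commented-out block in set_gradientC: it sketches a lazily-decayed momentum update. Since only the weights touched by the current features are visited, `momentum ** unchanged` applies all the decay steps a weight missed since its last update in a single multiplication. A minimal Python sketch of that scheme under those assumptions; the function name, array arguments, and usage are illustrative, not spaCy API:

import numpy as np

def lazy_momentum_update(W, mom, last_upd, time, i, d,
                         learn_rate=0.001, momentum=0.9):
    # Catch up on the decay this weight missed while it sat untouched.
    unchanged = time - last_upd[i]
    mom[i] *= momentum ** unchanged
    # Fold in the new gradient and take the momentum step.
    mom[i] += learn_rate * d
    W[i] -= mom[i]
    last_upd[i] = time

# Hypothetical usage: weight 1 last changed at step 0, now at step 3.
W = np.zeros(4); mom = np.zeros(4); last_upd = np.zeros(4, dtype=int)
lazy_momentum_update(W, mom, last_upd, time=3, i=1, d=0.5)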
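The new end_training applies the same lazy-update trick to weight averaging: last_upd[i] records when weight i last changed, so the running sum in averages only needs a catch-up term `W[i] * unchanged` rather than being touched on every step, and the final swap leaves the raw weights in averages while W holds the time-averaged weights. (In this diff the per-update bookkeeping in set_gradientC is still commented out; the sketch below shows the full scheme.) A minimal Python sketch with hypothetical names mirroring the diff:

import numpy as np

class AveragedWeights:
    """Illustrative stand-in for the diff's averaging state; not spaCy API."""
    def __init__(self, nr_weight):
        self.time = 0
        self.W = np.zeros(nr_weight)
        self.averages = np.zeros(nr_weight)  # running sum of W over time
        self.last_upd = np.zeros(nr_weight, dtype=np.int64)

    def update(self, i, delta):
        self.time += 1
        # Credit the weight for every step it sat unchanged, then change it.
        unchanged = self.time - self.last_upd[i]
        self.averages[i] += self.W[i] * unchanged
        self.W[i] -= delta
        self.last_upd[i] = self.time

    def end_training(self):
        # Flush the remaining unchanged spans, then swap in the averaged
        # weights; assumes at least one update, so self.time > 0.
        for i in range(len(self.W)):
            unchanged = self.time - self.last_upd[i]
            self.averages[i] += self.W[i] * unchanged
            self.W[i], self.averages[i] = self.averages[i], self.W[i]
            self.W[i] /= self.time
            self.last_upd[i] = self.time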