Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-26 01:04:34 +03:00)
Experiment with Zipfian corruptions when calculating prediction
parent 32ae2cdabe
commit 7fc24821bc
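The diff below scales each feature value by the reciprocal of a draw from a Zipf distribution while scoring. As a rough standalone sketch of that idea in plain NumPy (the function name and the use of NumPy's Generator API are my own, not part of the commit):

import numpy as np

def zipf_corrupt(values, a=3, seed=0):
    # Draw one Zipf(a) integer per feature value; draws are >= 1, usually
    # exactly 1 and occasionally much larger, so most values pass through
    # unchanged while a few are sharply down-weighted.
    rng = np.random.default_rng(seed)
    return values / rng.zipf(a, size=len(values))

print(zipf_corrupt(np.ones(10)))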
@@ -3,7 +3,7 @@ from libc.stdint cimport uint8_t
 from cymem.cymem cimport Pool
 from thinc.learner cimport LinearModel
-from thinc.features cimport Extractor
+from thinc.features cimport Extractor, Feature
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 
 from preshed.maps cimport PreshMapArray
 
@@ -17,6 +17,8 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil
 
 cdef class Model:
     cdef int n_classes
 
+    cdef int regularize(self, Feature* feats, int n, int a=*) except -1
+
     cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
 
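The new declaration exposes the Zipf exponent as an argument (`int a=*` means the default value is supplied in the matching .pyx). Roughly, the exponent controls how many feature values are left untouched by the corruption; a small, hypothetical check of that in plain NumPy (not part of the commit):

import numpy as np

rng = np.random.default_rng(0)
for a in (2, 3, 5):
    draws = rng.zipf(a, size=100_000)
    # Share of draws equal to 1, i.e. features whose value would be left
    # unchanged after multiplying by 1.0 / draw. Larger exponents leave
    # more features untouched.
    print(a, (draws == 1).mean())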
@@ -24,21 +26,10 @@ cdef class Model:
     cdef Extractor _extractor
     cdef LinearModel _model
 
-    cdef inline const weight_t* score(self, atom_t* context) except NULL:
+    cdef inline const weight_t* score(self, atom_t* context, bint regularize) except NULL:
         cdef int n_feats
         feats = self._extractor.get_feats(context, &n_feats)
+        if regularize:
+            self.regularize(feats, n_feats, 3)
         return self._model.get_scores(feats, n_feats)
-
-
-cdef class HastyModel:
-    cdef Pool mem
-    cdef weight_t* _scores
-
-    cdef const weight_t* score(self, atom_t* context) except NULL
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
-
-    cdef int n_classes
-    cdef Model _hasty
-    cdef Model _full
-    cdef readonly int hasty_cnt
-    cdef readonly int full_cnt
+
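In the declarations above, `Model.score` gains a `bint regularize` flag and the `HastyModel` declarations are dropped. A toy Python sketch of the new scoring path, with a plain dict standing in for the real `Extractor`/`LinearModel` machinery (class and variable names here are illustrative assumptions, not the project's API):

import numpy as np

class ToyModel:
    def __init__(self, n_classes, weights):
        self.n_classes = n_classes
        self.weights = weights                 # feature name -> class-weight vector
        self.rng = np.random.default_rng(0)

    def score(self, feats, regularize=False):
        values = np.ones(len(feats))
        if regularize:
            # Zipfian corruption: damp a random subset of feature values.
            values = values / self.rng.zipf(3, size=len(feats))
        scores = np.zeros(self.n_classes)
        for feat, value in zip(feats, values):
            scores += value * self.weights.get(feat, 0.0)
        return scores

model = ToyModel(3, {"w=the": np.array([0.2, -0.1, 0.0])})
print(model.score(["w=the"], regularize=True))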
@@ -4,9 +4,9 @@ from __future__ import division
 from os import path
 import os
 import shutil
 import random
 import json
-import cython
+import numpy.random
 
 from thinc.features cimport Feature, count_feats
 
@@ -44,70 +44,11 @@ cdef class Model:
             count_feats(counts[guess], feats, n_feats, -cost)
             self._model.update(counts)
 
+    cdef int regularize(self, Feature* feats, int n, int a=3) except -1:
+        zipfs = numpy.random.zipf(a, n)
+        for i in range(n):
+            feats[i].value *= 1.0 / zipfs[i]
+
     def end_training(self):
         self._model.end_training()
         self._model.dump(self.model_loc, freq_thresh=0)
-
-
-cdef class HastyModel:
-    def __init__(self, n_classes, hasty_templates, full_templates, model_dir):
-        full_templates = tuple([t for t in full_templates if t not in hasty_templates])
-        self.mem = Pool()
-        self.n_classes = n_classes
-        self._scores = <weight_t*>self.mem.alloc(self.n_classes, sizeof(weight_t))
-        assert path.exists(model_dir)
-        assert path.isdir(model_dir)
-        self._hasty = Model(n_classes, hasty_templates, path.join(model_dir, 'hasty_model'))
-        self._full = Model(n_classes, full_templates, path.join(model_dir, 'full_model'))
-        self.hasty_cnt = 0
-        self.full_cnt = 0
-
-    cdef const weight_t* score(self, atom_t* context) except NULL:
-        cdef int i
-        hasty_scores = self._hasty.score(context)
-        if will_use_hasty(hasty_scores, self._hasty.n_classes):
-            self.hasty_cnt += 1
-            return hasty_scores
-        else:
-            self.full_cnt += 1
-            full_scores = self._full.score(context)
-            for i in range(self.n_classes):
-                self._scores[i] = full_scores[i] + hasty_scores[i]
-            return self._scores
-
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
-        self._hasty.update(context, guess, gold, cost)
-        self._full.update(context, guess, gold, cost)
-
-    def end_training(self):
-        self._hasty.end_training()
-        self._full.end_training()
-
-
-@cython.cdivision(True)
-cdef bint will_use_hasty(const weight_t* scores, int n_classes) nogil:
-    cdef:
-        weight_t best_score, second_score
-        int best, second
-
-    if scores[0] >= scores[1]:
-        best = 0
-        best_score = scores[0]
-        second = 1
-        second_score = scores[1]
-    else:
-        best = 1
-        best_score = scores[1]
-        second = 0
-        second_score = scores[0]
-    cdef int i
-    for i in range(2, n_classes):
-        if scores[i] > best_score:
-            second_score = best_score
-            second = best
-            best = i
-            best_score = scores[i]
-        elif scores[i] > second_score:
-            second_score = scores[i]
-            second = i
-    return best_score > 0 and second_score < (best_score / 2)
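The large removal above deletes the `HastyModel` wrapper along with its `will_use_hasty` margin check. For reference, the deleted heuristic amounts to: trust the hasty model only when its best score is positive and more than twice the runner-up. A plain-Python restatement (not code from the repository):

def will_use_hasty(scores):
    # Mirrors the deleted Cython helper: accept the hasty model's prediction
    # only when the top score is positive and the runner-up is less than
    # half of it.
    ranked = sorted(scores, reverse=True)
    best, second = ranked[0], ranked[1]
    return best > 0 and second < best / 2

print(will_use_hasty([4.0, 1.5, 0.3]))   # True: comfortable margin
print(will_use_hasty([4.0, 2.5, 0.3]))   # False: runner-up too close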