* Experiment with Zipfian corruptions when calculating prediction

This commit is contained in:
Matthew Honnibal 2015-05-26 22:17:15 +02:00
parent 32ae2cdabe
commit 7fc24821bc
2 changed files with 12 additions and 80 deletions

View File

@ -3,7 +3,7 @@ from libc.stdint cimport uint8_t
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel from thinc.learner cimport LinearModel
from thinc.features cimport Extractor from thinc.features cimport Extractor, Feature
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray from preshed.maps cimport PreshMapArray
@ -17,6 +17,8 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil
cdef class Model: cdef class Model:
cdef int n_classes cdef int n_classes
cdef int regularize(self, Feature* feats, int n, int a=*) except -1
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1 cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
@ -24,21 +26,10 @@ cdef class Model:
cdef Extractor _extractor cdef Extractor _extractor
cdef LinearModel _model cdef LinearModel _model
cdef inline const weight_t* score(self, atom_t* context) except NULL: cdef inline const weight_t* score(self, atom_t* context, bint regularize) except NULL:
cdef int n_feats cdef int n_feats
feats = self._extractor.get_feats(context, &n_feats) feats = self._extractor.get_feats(context, &n_feats)
if regularize:
self.regularize(feats, n_feats, 3)
return self._model.get_scores(feats, n_feats) return self._model.get_scores(feats, n_feats)
# Declaration of a two-stage scorer that pairs a "hasty" Model with a "full"
# Model and decides per call whether the hasty scores alone are enough.
cdef class HastyModel:
# Memory pool that owns the _scores allocation.
cdef Pool mem
# Scratch buffer of n_classes weights; holds the element-wise sum of the full
# and hasty scores when the full model is consulted.
cdef weight_t* _scores
# Returns either the hasty model's scores or the combined scores in _scores.
cdef const weight_t* score(self, atom_t* context) except NULL
# Forwards the same update to both underlying models.
cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
# Number of classes scored; also the length of _scores.
cdef int n_classes
# Model built from the hasty templates.
cdef Model _hasty
# Model built from the full templates that are not already hasty templates.
cdef Model _full
# Number of score() calls answered by the hasty model alone.
cdef readonly int hasty_cnt
# Number of score() calls that also ran the full model.
cdef readonly int full_cnt

View File

@ -4,9 +4,9 @@ from __future__ import division
from os import path from os import path
import os import os
import shutil import shutil
import random
import json import json
import cython import cython
import numpy.random
from thinc.features cimport Feature, count_feats from thinc.features cimport Feature, count_feats
@ -44,70 +44,11 @@ cdef class Model:
count_feats(counts[guess], feats, n_feats, -cost) count_feats(counts[guess], feats, n_feats, -cost)
self._model.update(counts) self._model.update(counts)
cdef int regularize(self, Feature* feats, int n, int a=3) except -1:
    # Corrupt the first `n` feature values in place: each value is scaled by
    # the reciprocal of an independent draw from a Zipf distribution with
    # exponent `a`, so most values are kept as-is (draw == 1) and a few are
    # shrunk sharply.
    divisors = numpy.random.zipf(a, n)
    for idx in range(n):
        scale = 1.0 / divisors[idx]
        feats[idx].value *= scale
def end_training(self): def end_training(self):
self._model.end_training() self._model.end_training()
self._model.dump(self.model_loc, freq_thresh=0) self._model.dump(self.model_loc, freq_thresh=0)
cdef class HastyModel:
    """Two-stage scorer built from a cheap "hasty" model and a "full" model.

    The hasty model is always evaluated. When will_use_hasty() judges its
    scores decisive they are returned directly; otherwise the full model is
    evaluated as well and the two score vectors are summed.
    """

    def __init__(self, n_classes, hasty_templates, full_templates, model_dir):
        # The full model only gets the templates the hasty model lacks.
        remaining_templates = tuple(
            t for t in full_templates if t not in hasty_templates
        )
        self.mem = Pool()
        self.n_classes = n_classes
        # Scratch space for the combined (full + hasty) scores.
        self._scores = <weight_t*>self.mem.alloc(self.n_classes, sizeof(weight_t))
        assert path.exists(model_dir)
        assert path.isdir(model_dir)
        hasty_loc = path.join(model_dir, 'hasty_model')
        full_loc = path.join(model_dir, 'full_model')
        self._hasty = Model(n_classes, hasty_templates, hasty_loc)
        self._full = Model(n_classes, remaining_templates, full_loc)
        self.hasty_cnt = 0
        self.full_cnt = 0

    cdef const weight_t* score(self, atom_t* context) except NULL:
        # Return the hasty scores when they are decisive, otherwise the
        # element-wise sum of the full and hasty scores (stored in _scores).
        cdef int class_idx
        quick_scores = self._hasty.score(context)
        if not will_use_hasty(quick_scores, self._hasty.n_classes):
            self.full_cnt += 1
            thorough_scores = self._full.score(context)
            for class_idx in range(self.n_classes):
                self._scores[class_idx] = thorough_scores[class_idx] + quick_scores[class_idx]
            return self._scores
        self.hasty_cnt += 1
        return quick_scores

    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
        # Both models learn from every example, hasty first.
        self._hasty.update(context, guess, gold, cost)
        self._full.update(context, guess, gold, cost)

    def end_training(self):
        """Finish training on both underlying models, hasty first."""
        self._hasty.end_training()
        self._full.end_training()
@cython.cdivision(True)
cdef bint will_use_hasty(const weight_t* scores, int n_classes) nogil:
    # Decide whether a score vector is decisive enough to be used on its own.
    #
    # scores:    array of at least n_classes weights.
    # n_classes: number of valid entries in scores.
    #
    # Returns True when the highest score is positive and the second-highest
    # score is less than half of it. With fewer than two classes there is no
    # runner-up to compare against, so the answer is False; this also avoids
    # reading scores[1] past the end of the array.
    cdef weight_t best_score, second_score
    cdef int i
    if n_classes < 2:
        return False
    # Seed the top two from the first pair, then scan the remainder.
    if scores[0] >= scores[1]:
        best_score = scores[0]
        second_score = scores[1]
    else:
        best_score = scores[1]
        second_score = scores[0]
    for i in range(2, n_classes):
        if scores[i] > best_score:
            # New leader: the previous best becomes the runner-up.
            second_score = best_score
            best_score = scores[i]
        elif scores[i] > second_score:
            second_score = scores[i]
    return best_score > 0 and second_score < (best_score / 2)