mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	* Experiment with Zipfian corruptions when calculating prediction
This commit is contained in:
		
							parent
							
								
									32ae2cdabe
								
							
						
					
					
						commit
						7fc24821bc
					
				| 
						 | 
					@ -3,7 +3,7 @@ from libc.stdint cimport uint8_t
 | 
				
			||||||
from cymem.cymem cimport Pool
 | 
					from cymem.cymem cimport Pool
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.learner cimport LinearModel
 | 
					from thinc.learner cimport LinearModel
 | 
				
			||||||
from thinc.features cimport Extractor
 | 
					from thinc.features cimport Extractor, Feature
 | 
				
			||||||
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 | 
					from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from preshed.maps cimport PreshMapArray
 | 
					from preshed.maps cimport PreshMapArray
 | 
				
			||||||
| 
						 | 
					@ -18,27 +18,18 @@ cdef int arg_max(const weight_t* scores, const int n_classes) nogil
 | 
				
			||||||
cdef class Model:
 | 
					cdef class Model:
 | 
				
			||||||
    cdef int n_classes
 | 
					    cdef int n_classes
 | 
				
			||||||
    
 | 
					    
 | 
				
			||||||
 | 
					    cdef int regularize(self, Feature* feats, int n, int a=*) except -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
 | 
					    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef object model_loc
 | 
					    cdef object model_loc
 | 
				
			||||||
    cdef Extractor _extractor
 | 
					    cdef Extractor _extractor
 | 
				
			||||||
    cdef LinearModel _model
 | 
					    cdef LinearModel _model
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    cdef inline const weight_t* score(self, atom_t* context) except NULL:
 | 
					    cdef inline const weight_t* score(self, atom_t* context, bint regularize) except NULL:
 | 
				
			||||||
        cdef int n_feats
 | 
					        cdef int n_feats
 | 
				
			||||||
        feats = self._extractor.get_feats(context, &n_feats)
 | 
					        feats = self._extractor.get_feats(context, &n_feats)
 | 
				
			||||||
 | 
					        if regularize:
 | 
				
			||||||
 | 
					            self.regularize(feats, n_feats, 3)
 | 
				
			||||||
        return self._model.get_scores(feats, n_feats)
 | 
					        return self._model.get_scores(feats, n_feats)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class HastyModel:
 | 
					 | 
				
			||||||
    cdef Pool mem
 | 
					 | 
				
			||||||
    cdef weight_t* _scores
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef const weight_t* score(self, atom_t* context) except NULL
 | 
					 | 
				
			||||||
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef int n_classes
 | 
					 | 
				
			||||||
    cdef Model _hasty
 | 
					 | 
				
			||||||
    cdef Model _full
 | 
					 | 
				
			||||||
    cdef readonly int hasty_cnt
 | 
					 | 
				
			||||||
    cdef readonly int full_cnt
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,9 +4,9 @@ from __future__ import division
 | 
				
			||||||
from os import path
 | 
					from os import path
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import shutil
 | 
					import shutil
 | 
				
			||||||
import random
 | 
					 | 
				
			||||||
import json
 | 
					import json
 | 
				
			||||||
import cython
 | 
					import cython
 | 
				
			||||||
 | 
					import numpy.random
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from thinc.features cimport Feature, count_feats
 | 
					from thinc.features cimport Feature, count_feats
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -44,70 +44,11 @@ cdef class Model:
 | 
				
			||||||
            count_feats(counts[guess], feats, n_feats, -cost)
 | 
					            count_feats(counts[guess], feats, n_feats, -cost)
 | 
				
			||||||
            self._model.update(counts)
 | 
					            self._model.update(counts)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cdef int regularize(self, Feature* feats, int n, int a=3) except -1:
 | 
				
			||||||
 | 
					        zipfs = numpy.random.zipf(a, n)
 | 
				
			||||||
 | 
					        for i in range(n):
 | 
				
			||||||
 | 
					            feats[i].value *= 1.0 / zipfs[i]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def end_training(self):
 | 
					    def end_training(self):
 | 
				
			||||||
        self._model.end_training()
 | 
					        self._model.end_training()
 | 
				
			||||||
        self._model.dump(self.model_loc, freq_thresh=0)
 | 
					        self._model.dump(self.model_loc, freq_thresh=0)
 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
cdef class HastyModel:
 | 
					 | 
				
			||||||
    def __init__(self, n_classes, hasty_templates, full_templates, model_dir):
 | 
					 | 
				
			||||||
        full_templates = tuple([t for t in full_templates if t not in hasty_templates])
 | 
					 | 
				
			||||||
        self.mem = Pool()
 | 
					 | 
				
			||||||
        self.n_classes = n_classes
 | 
					 | 
				
			||||||
        self._scores = <weight_t*>self.mem.alloc(self.n_classes, sizeof(weight_t))
 | 
					 | 
				
			||||||
        assert path.exists(model_dir)
 | 
					 | 
				
			||||||
        assert path.isdir(model_dir)
 | 
					 | 
				
			||||||
        self._hasty = Model(n_classes, hasty_templates, path.join(model_dir, 'hasty_model'))
 | 
					 | 
				
			||||||
        self._full = Model(n_classes, full_templates, path.join(model_dir, 'full_model'))
 | 
					 | 
				
			||||||
        self.hasty_cnt = 0
 | 
					 | 
				
			||||||
        self.full_cnt = 0
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef const weight_t* score(self, atom_t* context) except NULL:
 | 
					 | 
				
			||||||
        cdef int i
 | 
					 | 
				
			||||||
        hasty_scores = self._hasty.score(context)
 | 
					 | 
				
			||||||
        if will_use_hasty(hasty_scores, self._hasty.n_classes):
 | 
					 | 
				
			||||||
            self.hasty_cnt += 1
 | 
					 | 
				
			||||||
            return hasty_scores
 | 
					 | 
				
			||||||
        else:
 | 
					 | 
				
			||||||
            self.full_cnt += 1
 | 
					 | 
				
			||||||
            full_scores = self._full.score(context)
 | 
					 | 
				
			||||||
            for i in range(self.n_classes):
 | 
					 | 
				
			||||||
                self._scores[i] = full_scores[i] + hasty_scores[i]
 | 
					 | 
				
			||||||
            return self._scores
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
 | 
					 | 
				
			||||||
        self._hasty.update(context, guess, gold, cost)
 | 
					 | 
				
			||||||
        self._full.update(context, guess, gold, cost)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    def end_training(self):
 | 
					 | 
				
			||||||
        self._hasty.end_training()
 | 
					 | 
				
			||||||
        self._full.end_training()
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
@cython.cdivision(True)
 | 
					 | 
				
			||||||
cdef bint will_use_hasty(const weight_t* scores, int n_classes) nogil:
 | 
					 | 
				
			||||||
    cdef:
 | 
					 | 
				
			||||||
        weight_t best_score, second_score
 | 
					 | 
				
			||||||
        int best, second
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    if scores[0] >= scores[1]:
 | 
					 | 
				
			||||||
        best = 0
 | 
					 | 
				
			||||||
        best_score = scores[0]
 | 
					 | 
				
			||||||
        second = 1
 | 
					 | 
				
			||||||
        second_score = scores[1]
 | 
					 | 
				
			||||||
    else:
 | 
					 | 
				
			||||||
        best = 1
 | 
					 | 
				
			||||||
        best_score = scores[1]
 | 
					 | 
				
			||||||
        second = 0
 | 
					 | 
				
			||||||
        second_score = scores[0]
 | 
					 | 
				
			||||||
    cdef int i
 | 
					 | 
				
			||||||
    for i in range(2, n_classes):
 | 
					 | 
				
			||||||
        if scores[i] > best_score:
 | 
					 | 
				
			||||||
            second_score = best_score
 | 
					 | 
				
			||||||
            second = best
 | 
					 | 
				
			||||||
            best = i
 | 
					 | 
				
			||||||
            best_score = scores[i]
 | 
					 | 
				
			||||||
        elif scores[i] > second_score:
 | 
					 | 
				
			||||||
            second_score = scores[i]
 | 
					 | 
				
			||||||
            second = i
 | 
					 | 
				
			||||||
    return best_score > 0 and second_score < (best_score / 2)
 | 
					 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user