* Repurpose the Tagger class as a generic Model, wrapping thinc's interface

Matthew Honnibal 2014-12-30 21:20:15 +11:00
parent fe2a5e0370
commit bb0b00f819
4 changed files with 200 additions and 10 deletions
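In outline, the new Model class factors the learning machinery out of the old Tagger: it owns a thinc Extractor and LinearModel and exposes predict, predict_among, and predict_and_update at the C level, while setup_model_dir and end_training handle the on-disk lifecycle. A rough Python-level sketch of that lifecycle, with made-up templates and paths (the predict* methods are cdef and so only callable from Cython code such as the tagger below):

    # Hypothetical driver; templates and paths are illustrative only.
    from spacy._ml import setup_model_dir, Model

    templates = (((1,), (2,)),)             # made-up thinc feature templates
    setup_model_dir(['N', 'V'], {}, templates, '/tmp/pos')   # writes config.json
    model = Model(n_classes=3, templates=templates, model_dir='/tmp/pos')
    # ... Cython code drives training through model.predict_and_update ...
    model.end_training()                    # finalise and dump to model_dir/model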

spacy/_ml.pxd Normal file

@@ -0,0 +1,34 @@
from libc.stdint cimport uint8_t

from cymem.cymem cimport Pool

from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t

from preshed.maps cimport PreshMapArray

from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens


cdef class Model:
    cdef class_t predict(self, atom_t* context) except *
    cdef class_t predict_among(self, atom_t* context, const bint* valid) except *
    cdef class_t predict_and_update(self, atom_t* context, const bint* valid,
                                    const int* costs) except *

    cdef object model_loc
    cdef Extractor _extractor
    cdef LinearModel _model


"""
cdef class HastyModel:
    cdef class_t predict(self, const atom_t* context, object golds=*) except *

    cdef Model _model1
    cdef Model _model2
"""

spacy/_ml.pyx Normal file

@@ -0,0 +1,138 @@
# cython: profile=True
from __future__ import unicode_literals
from __future__ import division

from os import path
import os
from collections import defaultdict
import shutil
import random
import json

import cython

from thinc.features cimport Feature, count_feats
def setup_model_dir(tag_names, tag_map, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)
cdef class Model:
    def __init__(self, n_classes, templates, model_dir=None):
        self._extractor = Extractor(templates)
        self._model = LinearModel(n_classes, self._extractor.n_templ)
        self.model_loc = path.join(model_dir, 'model') if model_dir else None
        if self.model_loc and path.exists(self.model_loc):
            self._model.load(self.model_loc, freq_thresh=0)

    cdef class_t predict(self, atom_t* context) except *:
        cdef int n_feats
        cdef const Feature* feats = self._extractor.get_feats(context, &n_feats)
        cdef const weight_t* scores = self._model.get_scores(feats, n_feats)
        guess = _arg_max(scores, self._model.nr_class)
        return guess

    cdef class_t predict_among(self, atom_t* context, const bint* valid) except *:
        cdef int n_feats
        cdef const Feature* feats = self._extractor.get_feats(context, &n_feats)
        cdef const weight_t* scores = self._model.get_scores(feats, n_feats)
        return _arg_max_among(scores, valid, self._model.nr_class)

    cdef class_t predict_and_update(self, atom_t* context, const bint* valid,
                                    const int* costs) except *:
        cdef:
            int n_feats
            const Feature* feats
            const weight_t* scores
            int guess
            int best
            int cost
            int i
            weight_t score

        feats = self._extractor.get_feats(context, &n_feats)
        scores = self._model.get_scores(feats, n_feats)
        guess = _arg_max_among(scores, valid, self._model.nr_class)
        cost = costs[guess]
        if cost == 0:
            # Zero-cost guess: record the example without changing weights.
            self._model.update({})
            return guess

        # Perceptron-style step, scaled by the guess's cost: credit each
        # feature toward the best zero-cost class, debit it from the guess.
        guess_counts = defaultdict(int)
        best_counts = defaultdict(int)
        for i in range(n_feats):
            feat = (feats[i].i, feats[i].key)
            upd = feats[i].value * cost
            best_counts[feat] += upd
            guess_counts[feat] -= upd
        best = -1
        score = 0
        for i in range(self._model.nr_class):
            if valid[i] and costs[i] == 0 and (best == -1 or scores[i] > score):
                best = i
                score = scores[i]
        self._model.update({guess: guess_counts, best: best_counts})
        return guess

    def end_training(self):
        self._model.end_training()
        self._model.dump(self.model_loc, freq_thresh=0)
"""
cdef class HastyModel:
def __init__(self, model_dir):
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
univ_counts = {}
cdef unicode tag
cdef unicode univ_tag
tag_names = cfg['tag_names']
self.extractor = Extractor(templates)
self.model = LinearModel(len(tag_names) + 1, self.extractor.n_templ+2) # TODO
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
cdef class_t predict(self, atom_t* context) except *:
pass
cdef class_t predict_among(self, atom_t* context, bint* valid) except *:
pass
cdef class_t predict_and_update(self, atom_t* context, int* costs) except *:
pass
def dump(self, model_dir):
pass
"""
cdef int _arg_max(const weight_t* scores, int n_classes) except -1:
    cdef int best = 0
    cdef weight_t score = scores[best]
    cdef int i
    for i in range(1, n_classes):
        if scores[i] >= score:
            score = scores[i]
            best = i
    return best


cdef int _arg_max_among(const weight_t* scores, const bint* valid, int n_classes) except -1:
    cdef int clas
    cdef weight_t score = 0
    cdef int best = -1
    for clas in range(n_classes):
        if valid[clas] and (best == -1 or scores[clas] > score):
            score = scores[clas]
            best = clas
    return best
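The update in predict_and_update above is a cost-scaled perceptron step: when the guess has nonzero cost, every extracted feature is credited toward the best valid zero-cost class and debited from the guess, with the magnitude scaled by the guess's cost. A pure-Python sketch of just that bookkeeping, with made-up inputs (the (i, key) feature tuples and the {class: {feature: delta}} update shape mirror the Cython code; the argmax ties are resolved slightly differently and none of this is the real thinc API):

    from collections import defaultdict

    def sketch_update(feats, scores, valid, costs):
        # feats: list of (template_index, feature_key, value) tuples (made up).
        n_classes = len(scores)
        guess = max((i for i in range(n_classes) if valid[i]),
                    key=lambda i: scores[i])
        cost = costs[guess]
        if cost == 0:
            return guess, {}                  # correct: no weight change
        best = max((i for i in range(n_classes) if valid[i] and costs[i] == 0),
                   key=lambda i: scores[i])
        guess_counts = defaultdict(int)
        best_counts = defaultdict(int)
        for i, key, value in feats:
            upd = value * cost                # update scales with the cost
            best_counts[(i, key)] += upd
            guess_counts[(i, key)] -= upd
        return guess, {guess: guess_counts, best: best_counts}

    # Example: class 2 is guessed, but only class 0 has zero cost.
    feats = [(0, 'w=run', 1), (1, 'p1=N', 1)]
    print(sketch_update(feats, scores=[0.1, 0.5, 0.9],
                        valid=[1, 1, 1], costs=[0, 1, 1]))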

spacy/en/pos.pxd

@@ -1,20 +1,24 @@
 from preshed.maps cimport PreshMapArray
 from cymem.cymem cimport Pool

-from ..tagger cimport Tagger
+from .._ml cimport Model

 from ..strings cimport StringStore
 from ..structs cimport TokenC, Lexeme, Morphology, PosTag
 from ..typedefs cimport univ_tag_t

 from .lemmatizer import Lemmatizer


-cdef class EnPosTagger(Tagger):
+cdef class EnPosTagger:
+    cdef readonly Pool mem
+    cdef readonly StringStore strings
+    cdef readonly Model model
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache

     cdef PosTag* tags
     cdef readonly object tag_names
     cdef readonly object tag_map
     cdef readonly int n_tags

     cdef int set_morph(self, const int i, TokenC* tokens) except -1
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

spacy/en/pos.pyx

@@ -2,6 +2,9 @@
 from os import path
 import json

+from libc.string cimport memset
+from cymem.cymem cimport Address
+from thinc.typedefs cimport atom_t

 from ..typedefs cimport univ_tag_t
@@ -203,16 +206,20 @@ cdef struct _CachedMorph:
     int lemma


-cdef class EnPosTagger(Tagger):
+cdef class EnPosTagger:
     """A part-of-speech tagger for English"""
     def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
         model_dir = path.join(data_dir, 'pos')
-        Tagger.__init__(self, path.join(model_dir))
+        self.strings = strings
         cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
         self.tag_names = sorted(cfg['tag_names'])
         self.n_tags = len(self.tag_names)
         self.tag_map = cfg['tag_map']
+        cdef int n_tags = len(self.tag_names) + 1
+        self.model = Model(n_tags, cfg['templates'], model_dir=model_dir)
         self._morph_cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
         for i, tag in enumerate(sorted(self.tag_names)):
@@ -235,20 +242,27 @@ cdef class EnPosTagger(Tagger):
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].fine_pos = self.predict(context)
+            t[i].fine_pos = self.model.predict(context)
             self.set_morph(i, t)

-    def train(self, Tokens tokens, golds):
+    def train(self, Tokens tokens, py_golds):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
-        c = 0
+        cdef Address costs_mem = Address(self.n_tags, sizeof(int))
+        cdef Address valid_mem = Address(self.n_tags, sizeof(bint))
+        cdef int* costs = <int*>costs_mem.ptr
+        cdef bint* valid = <bint*>valid_mem.ptr
+        memset(valid, 1, sizeof(int) * self.n_tags)
+        correct = 0
         cdef TokenC* t = tokens.data
         for i in range(tokens.length):
             fill_context(context, i, t)
-            t[i].fine_pos = self.predict(context, [golds[i]])
+            memset(costs, 1, sizeof(int) * self.n_tags)
+            costs[py_golds[i]] = 0
+            t[i].fine_pos = self.model.predict_and_update(context, valid, costs)
             self.set_morph(i, t)
-            c += t[i].fine_pos == golds[i]
-        return c
+            correct += costs[t[i].fine_pos] == 0
+        return correct

     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].fine_pos]
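End to end, a training driver over the new interface might look roughly like the sketch below. The corpus loading is elided, the StringStore import path and zero-argument construction are assumptions for illustration, and tagger.model.end_training() is reachable from Python because model is declared readonly in the .pxd:

    # Hypothetical training driver; data loading is made up.
    from spacy.strings import StringStore   # location assumed for illustration
    from spacy.en.pos import EnPosTagger

    def train_tagger(data_dir, training_data):
        # training_data yields (Tokens, [gold tag id]) pairs; construction elided.
        strings = StringStore()
        tagger = EnPosTagger(strings, data_dir)  # expects data_dir/pos from setup_model_dir
        total = 0
        correct = 0
        for tokens, gold_tag_ids in training_data:
            correct += tagger.train(tokens, gold_tag_ids)
            total += len(gold_tag_ids)
        tagger.model.end_training()              # average and dump the weights
        return float(correct) / total if total else 0.0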