WIP on beam parser. Currently segfaults.

Matthew Honnibal 2017-03-11 06:19:52 -06:00
parent b0d80dc9ae
commit 318b9e32ff
3 changed files with 67 additions and 36 deletions


@@ -1,6 +1,9 @@
from libc.string cimport memcpy, memset
from libc.stdlib cimport malloc, calloc, free
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
+from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
from ..structs cimport TokenC, Entity
from ..lexeme cimport Lexeme
@@ -201,6 +204,21 @@ cdef cppclass StateC:
        else:
            return this.length - this._b_i

+    uint64_t hash() nogil const:
+        cdef TokenC[11] sig
+        sig[0] = this.S_(2)[0]
+        sig[1] = this.S_(1)[0]
+        sig[2] = this.R_(this.S(1), 1)[0]
+        sig[3] = this.L_(this.S(0), 1)[0]
+        sig[4] = this.L_(this.S(0), 2)[0]
+        sig[5] = this.S_(0)[0]
+        sig[6] = this.R_(this.S(0), 2)[0]
+        sig[7] = this.R_(this.S(0), 1)[0]
+        sig[8] = this.B_(0)[0]
+        sig[9] = this.E_(0)[0]
+        sig[10] = this.E_(1)[0]
+        return hash64(sig, sizeof(sig), this._s_i)
+
    void push() nogil:
        if this.B(0) != -1:
            this._stack[this._s_i] = this.B(0)
@@ -290,6 +308,8 @@ cdef cppclass StateC:
        memcpy(this._stack, src._stack, this.length * sizeof(int))
        memcpy(this._buffer, src._buffer, this.length * sizeof(int))
        memcpy(this._ents, src._ents, this.length * sizeof(Entity))
+        memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
        this.length = src.length
        this._b_i = src._b_i
        this._s_i = src._s_i
        this._e_i = src._e_i
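A note on the new hash() method above: it packs eleven tokens that characterise the parse state (the top stack entries, their leftmost/rightmost children, the first buffer token, the last two entities) into a fixed-size TokenC array and MurmurHashes the raw bytes, seeded with this._s_i. Presumably this gives the beam a cheap way to recognise states that look identical to the feature extractor. A minimal pure-Python sketch of that idea, with the built-in hash() standing in for murmurhash.mrmr.hash64, a plain tuple standing in for StateC, and a hypothetical merge_equivalent helper:

    # Sketch only, not spaCy's API: `signature` stands in for the TokenC[11] array.
    def state_hash(signature, seed=0):
        packed = b"".join(int(i).to_bytes(8, "little", signed=True) for i in signature)
        return hash((packed, seed))

    def merge_equivalent(states):
        # states: list of (signature, score); keep the best score per signature,
        # the way a beam could collapse states whose hash callback agrees.
        best = {}
        for sig, score in states:
            key = state_hash(sig)
            if key not in best or score > best[key][1]:
                best[key] = (sig, score)
        return list(best.values())

    # The two states sharing a signature collapse to the higher-scoring one.
    print(merge_equivalent([((1, 2, 3), -0.5), ((1, 2, 3), -0.1), ((4, 5, 6), -0.2)]))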


@@ -126,14 +126,15 @@ cdef class BeamParser(Parser):
        violn.check_crf(pred, gold)
        assert pred.size >= 1
        assert gold.size >= 1
        if pred.loss == 0:
            self.model.update_from_histories(self.moves, tokens, [(0.0, [])])
        elif True:
            #_check_train_integrity(pred, gold, gold_parse, self.moves)
            histories = zip(violn.p_probs, violn.p_hist) + zip(violn.g_probs, violn.g_hist)
-            min_grad = 0.001 ** (itn+1)
-            histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad]
-            random.shuffle(histories)
-            for grad, hist in histories:
-                assert not math.isnan(grad) and not math.isinf(grad), hist
-                self.model.update_from_history(self.moves, tokens, hist, grad)
+            self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1))
        else:
            self.model.update_from_histories(self.moves, tokens,
                [(1.0, violn.p_hist[0]), (-1.0, violn.g_hist[0])])
        _cleanup(pred)
        _cleanup(gold)
        return pred.loss
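Instead of looping here and calling update_from_history once per sampled history, the predicted-beam and gold-beam histories are now zipped with their probabilities and handed to update_from_histories in one batch; the min_grad = 0.001 ** (itn+1) cutoff moves into that method. A rough sketch of the pruning behaviour, using a hypothetical prune_histories helper and assuming violn exposes parallel lists of probabilities and action histories as above:

    # Sketch only: p_probs/p_hist come from the predicted beam, g_probs/g_hist
    # from the gold beam, itn is the training iteration.
    def prune_histories(p_probs, p_hist, g_probs, g_hist, itn):
        min_grad = 0.001 ** (itn + 1)
        histories = list(zip(p_probs, p_hist)) + list(zip(g_probs, g_hist))
        # Drop empty histories and gradients too small to matter this iteration;
        # the threshold shrinks as itn grows, so later passes keep more of them.
        return [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]

    kept = prune_histories([0.9, 1e-7], [[0, 2, 1], [0, 1]], [-0.9], [[0, 2, 2]], itn=1)
    print(kept)   # the 1e-7 entry is below 0.001 ** 2 and gets filtered out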
@@ -173,7 +174,7 @@ cdef class BeamParser(Parser):
        if follow_gold:
            beam.advance(_transition_state, NULL, <void*>self.moves.c)
        else:
-            beam.advance(_transition_state, NULL, <void*>self.moves.c)
+            beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
        beam.check_done(_check_final_state, NULL)


@@ -12,7 +12,9 @@ from cpython.exc cimport PyErr_CheckSignals
from libc.stdint cimport uint32_t, uint64_t
from libc.string cimport memset, memcpy
from libc.stdlib cimport malloc, calloc, free
import os.path
+from collections import Counter
from os import path
import shutil
import json
@@ -80,34 +82,46 @@ cdef class ParserModel(AveragedPerceptron):
    def update(self, Example eg):
        '''Does regression on negative cost. Sort of cute?'''
        self.time += 1
-        cdef weight_t loss = 0.0
        best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
-        for clas in range(eg.c.nr_class):
-            if not eg.c.is_valid[clas]:
-                continue
-            if eg.c.scores[clas] < eg.c.scores[best]:
-                continue
+        guess = eg.guess
+        cdef weight_t loss = 0.0
+        if guess == best:
+            return loss
+        for clas in [guess, best]:
            loss += (-eg.c.costs[clas] - eg.c.scores[clas]) ** 2
-            d_loss = -2 * (-eg.c.costs[clas] - eg.c.scores[clas])
+            d_loss = eg.c.scores[clas] - -eg.c.costs[clas]
            for feat in eg.c.features[:eg.c.nr_feat]:
                self.update_weight_ftrl(feat.key, clas, feat.value * d_loss)
-        return int(loss)
+        return loss

-    def update_from_history(self, TransitionSystem moves, Doc doc, history, weight_t grad):
+    def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))
-        cdef StateClass stcls = StateClass.init(doc.c, doc.length)
-        moves.initialize_state(stcls.c)
+        cdef StateClass stcls
        cdef class_t clas
        self.time += 1
        cdef atom_t[CONTEXT_SIZE] atoms
+        histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
+        if not histories:
+            return None
+        gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
+        for d_loss, history in histories:
+            stcls = StateClass.init(doc.c, doc.length)
+            moves.initialize_state(stcls.c)
            for clas in history:
                nr_feat = self.set_featuresC(atoms, features, stcls.c)
+                clas_grad = gradient[clas]
                for feat in features[:nr_feat]:
-                    self.update_weight(feat.key, clas, feat.value * grad)
+                    clas_grad[feat.key] += d_loss * feat.value
                moves.c[clas].do(stcls.c, moves.c[clas].label)
+        cdef feat_t key
+        cdef weight_t d_feat
+        for clas, clas_grad in enumerate(gradient):
+            for key, d_feat in clas_grad.items():
+                if d_feat != 0:
+                    self.update_weight_ftrl(key, clas, d_feat)

cdef class Parser:
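On the "regression on negative cost" docstring in update(): each touched class's score is regressed toward the negative of its cost, so the best-scoring zero-cost action is pulled toward 0 and the costly guess toward -cost, and only the guess and the best gold action are updated. A small worked example, plain Python, with a made-up helper and made-up scores and costs:

    def negative_cost_regression(scores, costs, guess, best):
        # Mirrors the loop in ParserModel.update(): squared error against -cost,
        # with d_loss = score - (-cost) as the per-class gradient.
        loss = 0.0
        d_loss = {}
        for clas in (guess, best):
            target = -costs[clas]
            loss += (target - scores[clas]) ** 2
            d_loss[clas] = scores[clas] - target
        return loss, d_loss

    scores = {3: 1.2, 7: -0.4}    # hypothetical class ids -> model scores
    costs = {3: 2.0, 7: 0.0}      # the guess (3) is costly, the best gold action (7) is not
    print(negative_cost_regression(scores, costs, guess=3, best=7))
    # loss = (-2.0 - 1.2)**2 + (0.0 - (-0.4))**2 = 10.24 + 0.16 = 10.4
    # d_loss: class 3 gets +3.2 (score too high), class 7 gets -0.4 (score too low)

update_from_histories works in the same spirit but batched: it replays each kept history, accumulates feat.value * d_loss per class in a Counter, and applies the summed gradients with update_weight_ftrl at the end.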
@@ -161,7 +175,8 @@ cdef class Parser:
        elif 'features' not in cfg:
            cfg['features'] = self.feature_templates
        self.model = ParserModel(cfg['features'])
-        self.model.l1_penalty = cfg.get('L1', 0.0)
+        self.model.l1_penalty = cfg.get('L1', 1e-8)
+        self.model.learn_rate = cfg.get('learn_rate', 0.001)
        self.cfg = cfg
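Both hyper-parameters stay overridable: they are read with cfg.get, so the new defaults (L1 = 1e-8, learn_rate = 0.001) only apply when the training cfg omits those keys. For instance, with a hypothetical cfg dict:

    cfg = {'L1': 1e-6}                          # hypothetical training config; no 'learn_rate' key
    l1_penalty = cfg.get('L1', 1e-8)            # 1e-6, taken from cfg
    learn_rate = cfg.get('learn_rate', 0.001)   # 0.001, falls back to the default
    print(l1_penalty, learn_rate)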
@@ -298,12 +313,7 @@
            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
-            if eg.c.costs[guess] > 0:
            self.model.update(eg)
-            #best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
-            #for feat in eg.c.features[:eg.c.nr_feat]:
-            #    self.model.update_weight_ftrl(feat.key, best, -feat.value * eg.c.costs[guess])
-            #    self.model.update_weight_ftrl(feat.key, guess, feat.value * eg.c.costs[guess])
            action = self.moves.c[guess]
            action.do(stcls.c, action.label)
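With the eg.c.costs[guess] > 0 guard gone, self.model.update(eg) runs on every step of the greedy loop; the early return in update() (guess == best) presumably covers the case the guard used to skip. For reference, a plain-Python sketch mirroring the valid-move argmax that VecVec.arg_max_if_true performs here:

    def arg_max_if_true(scores, is_valid):
        # Highest-scoring class whose validity flag is set; -1 if none are valid.
        best, best_score = -1, None
        for i, (score, valid) in enumerate(zip(scores, is_valid)):
            if valid and (best_score is None or score > best_score):
                best, best_score = i, score
        return best

    # Class 2 scores highest overall but is not a valid transition from this state.
    print(arg_max_if_true([0.1, 0.7, 0.9, 0.3], [True, True, False, True]))   # -> 1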